33 using namespace TableFunctions_Namespace;
// pluck_ptrs (nested-vector overload): gathers one read-only pointer per inner
// vector of `data` for the outer indices in [start_idx, end_idx).
//   data      - outer vector; each element is a vector of T values
//   start_idx - first outer index to include
//   end_idx   - one past the last outer index to include
// Pointers are accumulated in `raw_ptrs`; the declared return type is
// std::vector<const T*>.
// NOTE(review): the template header, listing lines 40-41 and the closing
// lines (45+) are not part of this excerpt, so the return statement and any
// lower-bound check on start_idx are not visible here.
36 std::vector<const T*>
pluck_ptrs(
const std::vector<std::vector<T>>& data,
37 const int64_t start_idx,
38 const int64_t end_idx) {
39 std::vector<const T*> raw_ptrs;
// The requested range must not extend past the number of inner vectors.
42 CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
43 for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
// Record the address of this inner vector's contiguous storage.
44 raw_ptrs.emplace_back(data[col_idx].data());
// pluck_ptrs (pointer-vector overload): copies the pointers stored in `data`
// at indices [start_idx, end_idx) into a vector of pointers-to-const.
//   data      - vector of mutable T pointers
//   start_idx - first index to include
//   end_idx   - one past the last index to include
// Pointers are accumulated in `raw_ptrs`; the declared return type is
// std::vector<const T*>.
// NOTE(review): the template header, listing lines 54-55 and the closing
// lines (59+) are not part of this excerpt, so the return statement and any
// lower-bound check on start_idx are not visible here.
50 std::vector<const T*>
pluck_ptrs(
const std::vector<T*>& data,
51 const int64_t start_idx,
52 const int64_t end_idx) {
53 std::vector<const T*> raw_ptrs;
// The requested range must not extend past the number of stored pointers.
56 CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
57 for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
// T* converts implicitly to const T* when stored.
58 raw_ptrs.emplace_back(data[col_idx]);
// Body fragment that emits one output row per known ML framework name, with
// an "available" flag and a "default" flag for each.
// NOTE(review): the signature is not part of this excerpt; the output names
// used below (output_ml_frameworks, output_availability, output_default) match
// the parameters of supported_ml_frameworks__cpu_ as declared elsewhere in
// this listing.
// Framework names reported, one output row each, in this order.
75 const std::vector<std::string> ml_frameworks = {
"onedal",
"mlpack"};
76 const int32_t num_frameworks = ml_frameworks.size();
// Integer ids written to output_ml_frameworks, one per name. The initializer
// sits on listing lines that are not part of this excerpt.
78 const std::vector<int32_t> ml_framework_string_ids =
// Helper that exists only when at least one of the two libraries is compiled in.
81 #if defined(HAVE_ONEDAL) || defined(HAVE_MLPACK)
// Tracks whether a row has already been marked as the default framework.
82 bool found_available_framework =
false;
// Marks a row as available. The first row it is invoked for also becomes the
// default; the assignment of `false` below applies to rows handled afterwards.
// NOTE(review): listing lines 84 and 90 are missing (presumably the
// output_default capture and the `else`); confirm against the full file.
83 auto framework_found_actions = [&output_availability,
85 &found_available_framework](
const int64_t out_row_idx) {
86 output_availability[out_row_idx] =
true;
87 if (!found_available_framework) {
88 output_default[out_row_idx] =
true;
89 found_available_framework =
true;
91 output_default[out_row_idx] =
false;
// Helper that exists only when at least one of the two libraries is NOT
// compiled in: marks a row as neither available nor default.
96 #if !defined(HAVE_ONEDAL) || !defined(HAVE_MLPACK)
97 auto framework_not_found_actions = [&output_availability,
98 &output_default](
const int64_t out_row_idx) {
99 output_availability[out_row_idx] =
false;
100 output_default[out_row_idx] =
false;
// Fill every output row: write the framework's id, then apply either the
// "found" or the "not found" helper for that framework.
// NOTE(review): the preprocessor guards that choose between the two helper
// calls are on listing lines not shown here (presumably HAVE_ONEDAL /
// HAVE_MLPACK checks).
104 for (int32_t out_row_idx = 0; out_row_idx < num_frameworks; ++out_row_idx) {
105 output_ml_frameworks[out_row_idx] = ml_framework_string_ids[out_row_idx];
106 if (ml_frameworks[out_row_idx] ==
"onedal") {
108 framework_found_actions(out_row_idx);
110 framework_not_found_actions(out_row_idx);
112 }
else if (ml_frameworks[out_row_idx] ==
"mlpack") {
114 framework_found_actions(out_row_idx);
116 framework_not_found_actions(out_row_idx);
// The function's result is the number of framework rows.
120 return num_frameworks;
// K-means clustering table function template (K = id type, T = feature type).
// Visible flow: copy ids to the output, validate the init strategy and the ML
// framework, build a working data set via denull_data, run the oneDAL or mlpack
// k-means implementation, then write the results into output_clusters.
// Returns input_ids.size() on the visible success path, or the value of
// mgr.ERROR_MESSAGE(...) on the error paths.
// NOTE(review): many listing lines (function name, most parameters, the
// conditions guarding each early return, and the framework-selection branches)
// are not part of this excerpt.
137 template <
typename K,
typename T>
142 const int num_clusters,
143 const int num_iterations,
// Ids pass through unchanged.
149 output_ids = input_ids;
// Error path for an unrecognized init strategy (guarding condition not shown).
// NOTE(review): "initializaiton" in the message below is misspelled; it is a
// runtime string, so it is left untouched here.
152 return mgr.ERROR_MESSAGE(
"Invalid KMeans initializaiton strategy: " +
156 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
// Error path for an unrecognized framework name (guarding condition not shown).
158 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
// denull_data yields the working data plus masked/unmasked row counts;
// presumably rows containing nulls are dropped — confirm in denull_data.
162 const auto denulled_data =
denull_data(input_features);
163 const int64_t num_rows = denulled_data.masked_num_rows;
// True when the working set has fewer rows than the original input.
164 const bool data_is_masked =
165 denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
// Scratch buffer is allocated only when rows were removed; otherwise the
// implementation writes straight into the output column's storage.
166 std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
167 int32_t* denulled_output =
168 data_is_masked ? denulled_output_allocation.data() : output_clusters.
ptr_;
// NOTE(review): normalized_data is defined on a listing line not shown here
// (presumably the result of z_std_normalize_data).
171 const auto normalized_ptrs =
pluck_ptrs(normalized_data, 0L, normalized_data.size());
// Set once an implementation has run (the assignments are not in this excerpt).
174 bool did_execute =
false;
178 onedal_kmeans_impl(normalized_ptrs,
183 kmeans_init_strategy);
190 mlpack_kmeans_impl(normalized_ptrs,
195 kmeans_init_strategy);
// Error path reporting that the requested library is unavailable; presumably
// taken when did_execute is still false (condition not shown).
200 return mgr.ERROR_MESSAGE(
"Cannot find " + preferred_ml_framework_str.
getString() +
201 " ML library to support kmeans implementation.");
203 }
// Runtime errors raised inside the try block become table-function errors.
catch (std::runtime_error& e) {
204 return mgr.ERROR_MESSAGE(e.what());
// When rows were removed, expand results back to the original row count using
// the reverse index map and the int32 null value.
// NOTE(review): the callee name is on a missing listing line; the argument
// list matches unmask_data as declared elsewhere in this listing.
207 if (data_is_masked) {
209 denulled_data.reverse_index_map,
210 output_clusters.
ptr_,
211 denulled_data.unmasked_num_rows,
212 inline_null_value<int32_t>());
214 return input_ids.
size();
// Fragment of a templated overload (K, T) that takes num_clusters and
// num_iterations. It builds a local framework-name string set to "DEFAULT"
// and passes it along as a call argument.
// NOTE(review): the function name, the remaining parameters and the callee are
// on listing lines not shown here — presumably this forwards to the full
// k-means template with the default framework; confirm against the full file.
230 template <
typename K,
typename T>
234 const int num_clusters,
235 const int num_iterations,
239 std::string preferred_ml_framework{
"DEFAULT"};
246 preferred_ml_framework,
// Fragment of a templated overload (K, T) that takes num_clusters and
// num_iterations. It builds two local strings, both set to "DEFAULT" — one for
// the k-means init strategy and one for the ML framework — and passes both
// along as call arguments.
// NOTE(review): the function name, the remaining parameters and the callee are
// on listing lines not shown here — presumably this forwards to the full
// k-means template; confirm against the full file.
263 template <
typename K,
typename T>
267 const int32_t num_clusters,
268 const int32_t num_iterations,
271 std::string kmeans_init_strategy{
"DEFAULT"};
272 std::string preferred_ml_framework{
"DEFAULT"};
278 kmeans_init_strategy,
279 preferred_ml_framework,
// DBSCAN clustering table function template (K = id type, T = feature type).
// Visible flow: copy ids to the output, validate the ML framework, build a
// working data set via denull_data, run a DBSCAN implementation with epsilon
// and min_observations, then write the results into output_clusters.
// Returns input_ids.size() on the visible success path, or the value of
// mgr.ERROR_MESSAGE(...) on the error paths.
// NOTE(review): the function name, most parameters, the conditions guarding
// each early return and the names of the implementation functions being called
// are on listing lines that are not part of this excerpt.
296 template <
typename K,
typename T>
301 const double epsilon,
302 const int32_t min_observations,
// Ids pass through unchanged.
307 output_ids = input_ids;
309 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
// Error path for an unrecognized framework name (guarding condition not shown).
311 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
// denull_data yields the working data plus masked/unmasked row counts;
// presumably rows containing nulls are dropped — confirm in denull_data.
315 const auto denulled_data =
denull_data(input_features);
316 const int64_t num_rows = denulled_data.masked_num_rows;
// True when the working set has fewer rows than the original input.
317 const bool data_is_masked =
318 denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
// Scratch buffer is allocated only when rows were removed; otherwise the
// implementation writes straight into the output column's storage.
319 std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
320 int32_t* denulled_output =
321 data_is_masked ? denulled_output_allocation.data() : output_clusters.
ptr_;
// NOTE(review): normalized_data is defined on a listing line not shown here
// (presumably the result of z_std_normalize_data).
324 const auto normalized_ptrs =
pluck_ptrs(normalized_data, 0L, normalized_data.size());
// Set once an implementation has run (the assignments are not in this excerpt).
327 bool did_execute =
false;
// Argument lists of two implementation calls; the callee names are on the
// preceding (missing) listing lines — presumably one per supported library.
332 normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
340 normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
// Error path reporting that the requested library is unavailable; presumably
// taken when did_execute is still false (condition not shown).
345 return mgr.ERROR_MESSAGE(
"Cannot find " + preferred_ml_framework_str.
getString() +
346 " ML library to support dbscan implementation.");
348 }
// Runtime errors raised inside the try block become table-function errors.
catch (std::runtime_error& e) {
349 return mgr.ERROR_MESSAGE(e.what());
// When rows were removed, expand results back to the original row count using
// the reverse index map and the int32 null value.
// NOTE(review): the callee name is on a missing listing line; the argument
// list matches unmask_data as declared elsewhere in this listing.
352 if (data_is_masked) {
354 denulled_data.reverse_index_map,
355 output_clusters.
ptr_,
356 denulled_data.unmasked_num_rows,
357 inline_null_value<int32_t>());
359 return input_ids.
size();
// Fragment of a templated overload (K, T) that takes epsilon and
// min_observations. It builds a local framework-name string set to "DEFAULT"
// and passes it along as a call argument.
// NOTE(review): the function name, the remaining parameters and the callee are
// on listing lines not shown here — presumably this forwards to the full
// DBSCAN template with the default framework; confirm against the full file.
373 template <
typename K,
typename T>
377 const double epsilon,
378 const int32_t min_observations,
381 std::string preferred_ml_framework{
"DEFAULT"};
387 preferred_ml_framework,
// Linear regression fit table function template (T = label/feature type).
// Visible flow: validate the ML framework, build a working data set from the
// labels and features via denull_data, then call the oneDAL or mlpack fit
// implementation, which receives output_coef_idxs' storage.
// Error paths return the value of mgr.ERROR_MESSAGE(...).
// NOTE(review): the function name, the parameters, the conditions guarding
// each early return, the framework-selection branches and the success-path
// return are on listing lines that are not part of this excerpt.
401 template <
typename T>
409 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
// Error path for an unrecognized framework name (guarding condition not shown).
411 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
// Labels and features go through denull_data together; the labels are then
// taken from index 0 of the resulting data (range [0, 1)).
414 const auto denulled_data =
denull_data(input_labels, input_features);
415 const auto labels_ptrs =
pluck_ptrs(denulled_data.data, 0L, 1L);
// NOTE(review): the initializer of features_ptrs is on a missing listing line.
416 const auto features_ptrs =
// One coefficient per feature column plus one extra (presumably the
// intercept — confirm).
418 const int64_t num_coefs = input_features.
numCols() + 1;
// Set once an implementation has run (the assignments are not in this excerpt).
421 bool did_execute =
false;
425 onedal_linear_reg_fit_impl(labels_ptrs[0],
427 output_coef_idxs.
ptr_,
429 denulled_data.masked_num_rows);
436 mlpack_linear_reg_fit_impl(labels_ptrs[0],
438 output_coef_idxs.
ptr_,
440 denulled_data.masked_num_rows);
// Error path reporting that the requested library is unavailable; presumably
// taken when did_execute is still false (condition not shown).
445 return mgr.ERROR_MESSAGE(
446 "Cannot find " + preferred_ml_framework_str.
getString() +
447 " ML library to support linear regression implementation.");
449 }
// Runtime errors raised inside the try block become table-function errors.
catch (std::runtime_error& e) {
450 return mgr.ERROR_MESSAGE(e.what());
// Fragment of a templated overload (T) that builds a local framework-name
// string set to "DEFAULT" and passes it along as a call argument.
// NOTE(review): the function name, the parameters and the callee are on
// listing lines not shown here — presumably this forwards to the full linear
// regression fit template with the default framework; confirm against the
// full file.
462 template <
typename T>
469 std::string preferred_ml_framework{
"DEFAULT"};
473 preferred_ml_framework,
// Reorders coefficients by their index: the value coefs[i] is stored at
// position coef_idxs[i] of the returned vector, whose length is
// coef_idxs.size().
// NOTE(review): the signature is on a missing listing line; elsewhere in this
// listing sort_coefs is declared as taking (const Column<int32_t>& coef_idxs,
// const Column<T>& coefs) and returning std::vector<T>.
// NOTE(review): no visible check that every coef_idxs value lies in
// [0, num_coefs) or that coefs holds at least num_coefs entries; an
// out-of-range index would write outside ordered_coefs. Confirm whether the
// callers guarantee this.
478 template <
typename T>
480 const size_t num_coefs = coef_idxs.
size();
481 std::vector<T> ordered_coefs(num_coefs);
482 for (
size_t coef_idx = 0; coef_idx < num_coefs; ++coef_idx) {
// Scatter: the destination slot is chosen by the stored index, not by i.
483 ordered_coefs[coef_idxs[coef_idx]] = coefs[coef_idx];
485 return ordered_coefs;
// Linear regression predict table function template (T = feature/prediction
// type, K = id type).
// Visible flow: validate the ML framework, build a working data set via
// denull_data, order the coefficients with sort_coefs, run the oneDAL or
// mlpack predict implementation, copy ids to the output, then write the
// predictions into output_predictions.
// Returns input_ids.size() on the visible success path, or the value of
// mgr.ERROR_MESSAGE(...) on the error paths.
// NOTE(review): the function name, the parameters, the conditions guarding
// each early return and the framework-selection branches are on listing lines
// that are not part of this excerpt.
499 template <
typename T,
typename K>
509 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
// Error path for an unrecognized framework name (guarding condition not shown).
511 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
// denull_data yields the working data plus masked/unmasked row counts;
// presumably rows containing nulls are dropped — confirm in denull_data.
516 const auto denulled_data =
denull_data(input_features);
517 const int64_t num_rows = denulled_data.masked_num_rows;
// True when the working set has fewer rows than the original input.
518 const bool data_is_masked =
519 denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
// Scratch buffer is allocated only when rows were removed; otherwise the
// implementation writes straight into the output column's storage.
// NOTE(review): the declaration that receives the pointer selected below is on
// a missing listing line (it is used later as denulled_output).
520 std::vector<T> denulled_output_allocation(data_is_masked ? num_rows : 0);
522 data_is_masked ? denulled_output_allocation.data() : output_predictions.
ptr_;
// One pointer per feature column of the working data.
524 const auto features_ptrs =
pluck_ptrs(denulled_data.data, 0L, input_features.
numCols());
// Coefficients placed at the positions given by their indices.
526 const auto ordered_coefs =
sort_coefs(coef_idxs, coefs);
// Set once an implementation has run (the assignments are not in this excerpt).
529 bool did_execute =
false;
533 onedal_linear_reg_predict_impl(
534 features_ptrs, denulled_output, num_rows, ordered_coefs.data());
541 mlpack_linear_reg_predict_impl(
542 features_ptrs, denulled_output, num_rows, ordered_coefs.data());
// Error path reporting that the requested library is unavailable for linear
// regression; presumably taken when did_execute is still false (condition not
// shown). The wording matches the message used by the fit function.
547 return mgr.ERROR_MESSAGE(
"Cannot find " + preferred_ml_framework_str.
getString() +
548 " ML library to support linear regression implementation.");
550 }
// Runtime errors raised inside the try block become table-function errors.
catch (std::runtime_error& e) {
551 return mgr.ERROR_MESSAGE(e.what());
// Ids pass through unchanged.
553 output_ids = input_ids;
// When rows were removed, expand results back to the original row count using
// the reverse index map and the null value for T.
// NOTE(review): the callee name is on a missing listing line; the argument
// list matches unmask_data as declared elsewhere in this listing.
554 if (data_is_masked) {
556 denulled_data.reverse_index_map,
557 output_predictions.
ptr_,
558 denulled_data.unmasked_num_rows,
559 inline_null_value<T>());
561 return input_ids.
size();
// Fragment of a templated overload (T, K) that builds a local framework-name
// string set to "DEFAULT" and passes it along as a call argument.
// NOTE(review): the function name, the parameters and the callee are on
// listing lines not shown here — presumably this forwards to the full linear
// regression predict template with the default framework; confirm against the
// full file.
574 template <
typename T,
typename K>
583 std::string preferred_ml_framework{
"DEFAULT"};
589 preferred_ml_framework,
// Builds a Column<T> from a vector's data pointer and its element count (cast
// to int64_t).
// NOTE(review): the signature, the return statement and the closing brace are
// on listing lines not shown here; elsewhere in this listing the function is
// declared as Column<T> create_wrapper_col(std::vector<T>& col_vec).
// Presumably the Column refers to the vector's storage rather than copying it,
// so the vector would need to outlive the Column — confirm in Column's
// constructor.
594 template <
typename T>
596 Column<T> wrapper_col(col_vec.data(),
static_cast<int64_t
>(col_vec.size()));
// Fragment of the combined linear regression fit + predict template (T, K).
// Visible steps: size two temporary vectors (coefficient indices and
// coefficient values) to the feature-column count plus one, then pass
// coef_idxs / coefs to a call that also receives mgr, the labels, the features
// and the framework name.
// NOTE(review): the function name, the parameters, the definitions of
// coef_idxs / coefs and both callee names are on listing lines not shown here.
// The first argument list matches the parameter order of
// linear_reg_fit__cpu_template declared elsewhere in this listing; presumably
// the second call is the predict step — confirm against the full file.
610 template <
typename T,
typename K>
// One coefficient per feature column plus one extra (presumably the
// intercept — confirm).
619 const int64_t num_coefs = input_features.
numCols() + 1;
621 std::vector<int32_t> coef_idxs_vec(num_coefs);
622 std::vector<T> coefs_vec(num_coefs);
630 mgr, input_labels, input_features, preferred_ml_framework_str, coef_idxs, coefs);
640 preferred_ml_framework_str,
// Fragment of a templated overload (T, K) that builds a local framework-name
// string set to "DEFAULT" and passes it along as a call argument.
// NOTE(review): the function name, the parameters and the callee are on
// listing lines not shown here — presumably this forwards to the full linear
// regression fit + predict template with the default framework; confirm
// against the full file.
654 template <
typename T,
typename K>
662 std::string preferred_ml_framework{
"DEFAULT"};
667 preferred_ml_framework,
672 #endif // #ifndef __CUDACC__
void set_output_row_size(int64_t num_rows)
KMeansInitStrategy get_kmeans_init_type(const std::string &init_type_str)
std::string getString() const
DEVICE int64_t size() const
DEVICE int64_t numCols() const
MaskedData< T > denull_data(const ColumnList< T > &features)
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template(TableFunctionManager &mgr, const Column< T > &input_labels, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< int32_t > &output_coef_idxs, Column< T > &output_coefs)
NEVER_INLINE HOST int32_t kmeans__cpu_template(TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const int num_clusters, const int num_iterations, const TextEncodingNone &init_type_str, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)
std::vector< T > sort_coefs(const Column< int32_t > &coef_idxs, const Column< T > &coefs)
NEVER_INLINE HOST int32_t linear_reg_predict__cpu_template(TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const Column< int32_t > &coef_idxs, const Column< T > &coefs, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
MLFramework get_ml_framework(const std::string &ml_framework_str)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
EXTENSION_NOINLINE_HOST int32_t supported_ml_frameworks__cpu_(TableFunctionManager &mgr, Column< TextEncodingDict > &output_ml_frameworks, Column< bool > &output_availability, Column< bool > &output_default)
StringDictionaryProxy * string_dict_proxy_
#define EXTENSION_NOINLINE_HOST
void disable_output_allocations()
NEVER_INLINE HOST int32_t linear_reg_fit_predict__cpu_template(TableFunctionManager &mgr, const Column< K > &input_ids, const Column< T > &input_labels, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
void unmask_data(const T *masked_input, const std::vector< int32_t > &reverse_index_map, T *unmasked_output, const int64_t num_unmasked_rows, const T null_val)
NEVER_INLINE HOST int32_t dbscan__cpu_template(TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const double epsilon, const int32_t min_observations, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)
std::vector< std::vector< T > > z_std_normalize_data(const std::vector< T * > &input_data, const int64_t num_rows)
std::vector< int32_t > getOrAddTransientBulk(const std::vector< std::string > &strings)
void enable_output_allocations()
Column< T > create_wrapper_col(std::vector< T > &col_vec)