36 #include <tbb/parallel_for.h>
37 #include <tbb/task_arena.h>
39 using namespace TableFunctions_Namespace;
// Collects one read-only raw pointer per column of data, for the columns in
// the half-open index range [start_idx, end_idx). Each pointer refers to the
// contiguous buffer owned by the corresponding inner vector, so the pointers
// stay valid only while data is alive and its inner vectors are not resized.
42 std::vector<const T*>
pluck_ptrs(
const std::vector<std::vector<T>>& data,
43 const int64_t start_idx,
44 const int64_t end_idx) {
45 std::vector<const T*> raw_ptrs;
// Upper bound check: end_idx may equal, but not exceed, the column count.
48 CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
49 for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
50 raw_ptrs.emplace_back(data[col_idx].data());
// Overload for input that is already a vector of raw column pointers: copies
// the pointers for columns [start_idx, end_idx) into a new vector, adding
// const to the pointee type. No column data is copied.
56 std::vector<const T*>
pluck_ptrs(
const std::vector<T*>& data,
57 const int64_t start_idx,
58 const int64_t end_idx) {
59 std::vector<const T*> raw_ptrs;
// Upper bound check: end_idx may equal, but not exceed, the column count.
62 CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
63 for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
64 raw_ptrs.emplace_back(data[col_idx]);
83 const int64_t num_cat_features,
84 const int64_t num_numeric_features);
// K-means clustering table function body, templated on the id type K and the
// feature type T. Visible flow: pass the ids through, validate the init
// strategy and the requested ML framework, build a null-free view of the
// features, cluster the normalized columns with oneDAL or mlpack, then write
// results back to output_clusters and return the input row count.
100 template <
typename K,
typename T>
105 const int num_clusters,
106 const int num_iterations,
// Ids are copied to the output unchanged.
112 output_ids = input_ids;
// NOTE(review): the message below misspells initialization as initializaiton.
// It is runtime text, so it is left untouched here; fix it in a code change.
115 return mgr.ERROR_MESSAGE(
"Invalid KMeans initializaiton strategy: " +
119 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
121 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
// denull_data presumably drops rows containing nulls (inferred from the
// masked/unmasked row counts it reports) - TODO confirm against its definition.
126 const auto denulled_data =
denull_data(input_features);
127 const int64_t num_rows = denulled_data.masked_num_rows;
// True when the masked row count is smaller than the unmasked row count.
128 const bool data_is_masked =
129 denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
// When rows were masked, cluster ids go to a temporary buffer of num_rows
// entries; otherwise they are written directly into the output column buffer.
130 std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
131 int32_t* denulled_output =
132 data_is_masked ? denulled_output_allocation.data() : output_clusters.
ptr_;
// normalized_data is produced on lines not shown in this excerpt.
136 const auto normalized_ptrs =
pluck_ptrs(normalized_data, 0L, normalized_data.size());
138 bool did_execute =
false;
142 onedal_kmeans_impl(normalized_ptrs,
147 kmeans_init_strategy);
154 mlpack_kmeans_impl(normalized_ptrs,
159 kmeans_init_strategy);
// Reported when no backend handled the request (guard condition not shown).
164 return mgr.ERROR_MESSAGE(
"Cannot find " + preferred_ml_framework_str.
getString() +
165 " ML library to support kmeans implementation.");
// For masked data the temporary results are mapped back onto the full-size
// output using reverse_index_map; the callee name is not visible here.
// Presumably unmapped rows receive inline_null_value<int32_t>() - TODO confirm.
168 if (data_is_masked) {
170 denulled_data.reverse_index_map,
171 output_clusters.
ptr_,
172 denulled_data.unmasked_num_rows,
173 inline_null_value<int32_t>());
175 }
// Runtime errors raised inside the try block become table function errors.
catch (std::runtime_error& e) {
176 return mgr.ERROR_MESSAGE(e.what());
// On success the function returns the number of input rows.
178 return input_ids.
size();
// DBSCAN clustering table function body, templated on the id type K and the
// feature type T. Visible flow mirrors the k-means body: pass the ids through,
// validate the requested ML framework, build a null-free view of the
// features, run a backend on the normalized columns with epsilon and
// min_observations, then write results back and return the input row count.
193 template <
typename K,
typename T>
198 const double epsilon,
199 const int32_t min_observations,
// Ids are copied to the output unchanged.
204 output_ids = input_ids;
206 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
208 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
// denull_data presumably drops rows containing nulls (inferred from the
// masked/unmasked row counts it reports) - TODO confirm against its definition.
213 const auto denulled_data =
denull_data(input_features);
214 const int64_t num_rows = denulled_data.masked_num_rows;
// True when the masked row count is smaller than the unmasked row count.
215 const bool data_is_masked =
216 denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
// When rows were masked, cluster ids go to a temporary buffer of num_rows
// entries; otherwise they are written directly into the output column buffer.
217 std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
218 int32_t* denulled_output =
219 data_is_masked ? denulled_output_allocation.data() : output_clusters.
ptr_;
// normalized_data is produced on lines not shown in this excerpt.
223 const auto normalized_ptrs =
pluck_ptrs(normalized_data, 0L, normalized_data.size());
225 bool did_execute =
false;
// Two backend calls take the same argument list; the callee names and the
// conditions selecting between them are not visible here.
230 normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
238 normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
// Reported when no backend handled the request (guard condition not shown).
243 return mgr.ERROR_MESSAGE(
"Cannot find " + preferred_ml_framework_str.
getString() +
244 " ML library to support dbscan implementation.");
// For masked data the temporary results are mapped back onto the full-size
// output using reverse_index_map; the callee name is not visible here.
// Presumably unmapped rows receive inline_null_value<int32_t>() - TODO confirm.
247 if (data_is_masked) {
249 denulled_data.reverse_index_map,
250 output_clusters.
ptr_,
251 denulled_data.unmasked_num_rows,
252 inline_null_value<int32_t>());
254 }
// Runtime errors raised inside the try block become table function errors.
catch (std::runtime_error& e) {
255 return mgr.ERROR_MESSAGE(e.what());
// On success the function returns the number of input rows.
257 return input_ids.
size();
// Linear regression training body, templated on the value type T. Visible
// flow: reject empty training input, validate the requested ML framework,
// build a null-free combined view of labels and features, fit with oneDAL or
// mlpack, wrap the coefficients in a LinearRegressionModel, and write the
// model name id to the single output row.
260 template <
typename T>
266 const std::vector<std::vector<std::string>>& cat_feature_keys,
270 if (input_labels.
size() == 0) {
271 return mgr.ERROR_MESSAGE(
272 "No rows exist in training data. Training data must at least contain 1 row.");
274 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
276 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
// Labels and features are denulled together; column 0 of the result is the
// label column, as shown by the [0, 1) slice taken for labels_ptrs.
279 const auto denulled_data =
denull_data(input_labels, input_features);
280 const auto labels_ptrs =
pluck_ptrs(denulled_data.data, 0L, 1L);
281 const auto features_ptrs =
// One coefficient per feature column plus one extra slot - presumably the
// intercept; TODO confirm against the backend fit implementations.
283 const int64_t num_coefs = input_features.
numCols() + 1;
285 std::vector<int64_t> coef_idxs(num_coefs);
286 std::vector<double> coefs(num_coefs);
288 bool did_execute =
false;
292 onedal_linear_reg_fit_impl(labels_ptrs[0],
296 denulled_data.masked_num_rows);
303 mlpack_linear_reg_fit_impl(labels_ptrs[0],
307 denulled_data.masked_num_rows);
// Reported when no backend handled the request (guard condition not shown).
312 return mgr.ERROR_MESSAGE(
313 "Cannot find " + preferred_ml_framework_str.
getString() +
314 " ML library to support linear regression implementation.");
316 }
// Runtime errors raised inside the try block become table function errors.
catch (std::runtime_error& e) {
317 return mgr.ERROR_MESSAGE(e.what());
// The fitted coefficients are stored with the metadata and the categorical
// feature keys supplied by the caller.
320 std::make_shared<LinearRegressionModel>(coefs, model_metadata, cat_feature_keys);
322 const std::string model_name_str = model_name.
getString();
// model_name_str_id is computed on a line not shown in this excerpt.
325 output_model_name[0] = model_name_str_id;
342 template <
typename T>
351 std::vector<std::vector<std::string>> empty_cat_feature_keys;
356 empty_cat_feature_keys,
357 preferred_ml_framework_str,
362 template <
typename T>
// Constructor taking categorical and numeric features plus encoding
// parameters. It one-hot encodes every categorical column with the same
// settings, records the category keys chosen for each column, and fills
// col_ptrs_ with the encoded buffers first and the numeric columns after them.
// The row count is taken from the numeric features.
367 const int32_t cat_top_k,
368 const float cat_min_fraction,
369 const bool cat_include_others)
370 : num_rows_(numeric_features.size()) {
372 one_hot_encoding_info(cat_top_k, cat_min_fraction, cat_include_others);
373 const size_t num_cat_features =
static_cast<size_t>(cat_features.
numCols());
// Every categorical column receives a copy of the same encoding settings.
374 std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
375 one_hot_encoding_infos;
376 for (
size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
377 one_hot_encoding_infos.emplace_back(one_hot_encoding_info);
// The encoded columns are kept as a member, so the buffer pointers stored in
// col_ptrs_ below refer to storage owned by this object.
379 one_hot_encoded_cols_ =
380 TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
381 cat_features, one_hot_encoding_infos);
382 for (
auto& one_hot_encoded_col : one_hot_encoded_cols_) {
383 cat_feature_keys_.emplace_back(one_hot_encoded_col.cat_features);
384 for (
auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
// Buffers are stored as int8_t pointers regardless of T.
385 col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
// Numeric columns are appended after all encoded categorical buffers.
388 const int64_t num_numeric_features = numeric_features.
numCols();
389 for (int64_t numeric_feature_idx = 0; numeric_feature_idx < num_numeric_features;
390 ++numeric_feature_idx) {
391 col_ptrs_.emplace_back(numeric_features.
ptrs_[numeric_feature_idx]);
// Constructor taking only categorical features plus encoding parameters. It
// one-hot encodes every categorical column with the same settings, records the
// category keys chosen for each column, and fills col_ptrs_ with the encoded
// buffers. The row count is taken from the categorical features.
396 const int32_t cat_top_k,
397 const float cat_min_fraction,
398 const bool cat_include_others)
399 : num_rows_(cat_features.size()) {
401 one_hot_encoding_info(cat_top_k, cat_min_fraction, cat_include_others);
402 const size_t num_cat_features =
static_cast<size_t>(cat_features.
numCols());
// Every categorical column receives a copy of the same encoding settings.
403 std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
404 one_hot_encoding_infos;
405 for (
size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
406 one_hot_encoding_infos.emplace_back(one_hot_encoding_info);
// The encoded columns are kept as a member, so the buffer pointers stored in
// col_ptrs_ below refer to storage owned by this object.
408 one_hot_encoded_cols_ =
409 TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
410 cat_features, one_hot_encoding_infos);
411 for (
auto& one_hot_encoded_col : one_hot_encoded_cols_) {
412 cat_feature_keys_.emplace_back(one_hot_encoded_col.cat_features);
413 for (
auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
// Buffers are stored as int8_t pointers regardless of T.
414 col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
// Constructor taking categorical and numeric features plus category keys that
// already exist (taken from a model, per the exception text below). The keys
// are copied into cat_feature_keys_ and drive the encoding, so the encoded
// layout follows the supplied keys rather than being derived from the data.
// Throws std::runtime_error when the number of categorical columns differs
// from the number of key lists.
422 const std::vector<std::vector<std::string>>& cat_feature_keys)
423 : num_rows_(numeric_features.size()), cat_feature_keys_(cat_feature_keys) {
424 const size_t num_cat_features =
static_cast<size_t>(cat_features.
numCols());
425 if (num_cat_features != cat_feature_keys_.size()) {
426 throw std::runtime_error(
427 "Number of provided categorical features does not match number of categorical "
428 "features in the model.");
// One encoding descriptor per column, each built from that column's key list.
430 std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
431 one_hot_encoding_infos;
432 for (
size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
433 one_hot_encoding_infos.emplace_back(cat_feature_keys_[cat_idx]);
435 one_hot_encoded_cols_ =
436 TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
437 cat_features, one_hot_encoding_infos);
// Unlike the constructors that take encoding parameters, the keys are not
// collected again here; only the buffer pointers are gathered.
438 for (
auto& one_hot_encoded_col : one_hot_encoded_cols_) {
439 for (
auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
440 col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
// Numeric columns are appended after all encoded categorical buffers.
443 const int64_t num_numeric_features = numeric_features.
numCols();
444 for (int64_t numeric_feature_idx = 0; numeric_feature_idx < num_numeric_features;
445 ++numeric_feature_idx) {
446 col_ptrs_.emplace_back(numeric_features.
ptrs_[numeric_feature_idx]);
// Constructor taking only categorical features plus category keys that
// already exist (taken from a model, per the exception text below). The keys
// are copied into cat_feature_keys_ and drive the encoding. Throws
// std::runtime_error when the number of categorical columns differs from the
// number of key lists. The row count is taken from the categorical features.
452 const std::vector<std::vector<std::string>>& cat_feature_keys)
453 : num_rows_(cat_features.size()), cat_feature_keys_(cat_feature_keys) {
454 const size_t num_cat_features =
static_cast<size_t>(cat_features.
numCols());
455 if (num_cat_features != cat_feature_keys_.size()) {
456 throw std::runtime_error(
457 "Number of provided categorical features does not match number of categorical "
458 "features in the model.");
// One encoding descriptor per column, each built from that column's key list.
460 std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
461 one_hot_encoding_infos;
462 for (
size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
463 one_hot_encoding_infos.emplace_back(cat_feature_keys_[cat_idx]);
465 one_hot_encoded_cols_ =
466 TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
467 cat_features, one_hot_encoding_infos);
// Only the buffer pointers are gathered; the keys were set in the initializer.
468 for (
auto& one_hot_encoded_col : one_hot_encoded_cols_) {
469 for (
auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
470 col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
477 col_ptrs_.data(),
static_cast<int64_t
>(col_ptrs_.size()), num_rows_);
481 return cat_feature_keys_;
486 std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodedCol<T>>
508 template <
typename T>
515 const int32_t cat_top_k,
516 const float cat_min_fraction,
521 input_numeric_features,
531 preferred_ml_framework_str,
551 template <
typename T>
557 const int32_t cat_top_k,
558 const float cat_min_fraction,
563 input_cat_features, cat_top_k, cat_min_fraction,
false );
570 preferred_ml_framework_str,
575 template <
typename T>
577 Column<T> wrapper_col(col_vec.data(),
static_cast<int64_t
>(col_vec.size()));
// Decision tree regression training body, templated on the value type T.
// Visible flow: reject empty training input, validate the requested ML
// framework, refuse frameworks other than oneDAL, build a null-free combined
// view of labels and features, fit with oneDAL, and write the model name id to
// the single output row.
619 template <
typename T>
625 const std::vector<std::vector<std::string>>& cat_feature_keys,
626 const int64_t max_tree_depth,
627 const int64_t min_observations_per_leaf_node,
631 if (input_labels.
size() == 0) {
632 return mgr.ERROR_MESSAGE(
633 "No rows exist in training data. Training data must at least contain 1 row.");
635 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
637 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
// The same message is returned from two separate exits whose guard conditions
// are not shown - presumably a build-time and a run-time check; TODO confirm.
641 return mgr.ERROR_MESSAGE(
642 "Only OneDAL framework supported for decision tree regression.");
645 return mgr.ERROR_MESSAGE(
646 "Only OneDAL framework supported for decision tree regression.");
// Labels and features are denulled together; column 0 of the result is the
// label column, as shown by the [0, 1) slice taken for labels_ptrs.
649 const auto denulled_data =
denull_data(input_labels, input_features);
650 const auto labels_ptrs =
pluck_ptrs(denulled_data.data, 0L, 1L);
651 const auto features_ptrs =
655 bool did_execute =
false;
659 onedal_decision_tree_reg_fit_impl<T>(model_name,
664 denulled_data.masked_num_rows,
666 min_observations_per_leaf_node);
// model_name_str_id is computed on a line not shown in this excerpt.
669 output_model_name[0] = model_name_str_id;
// Reported when no backend handled the request (guard condition not shown).
674 return mgr.ERROR_MESSAGE(
675 "Cannot find " + preferred_ml_framework_str.
getString() +
676 " ML library to support decision tree regression implementation.");
678 }
// Runtime errors raised inside the try block become table function errors.
catch (std::runtime_error& e) {
679 return mgr.ERROR_MESSAGE(e.what());
699 template <
typename T>
705 const int64_t max_tree_depth,
706 const int64_t min_observations_per_leaf_node,
710 std::vector<std::vector<std::string>> empty_cat_feature_keys;
715 empty_cat_feature_keys,
717 min_observations_per_leaf_node,
718 preferred_ml_framework_str,
740 template <
typename T>
747 const int64_t max_tree_depth,
748 const int64_t min_observations_per_leaf_node,
749 const int32_t cat_top_k,
750 const float cat_min_fraction,
754 std::vector<std::vector<std::string>> empty_cat_feature_keys;
756 input_numeric_features,
766 min_observations_per_leaf_node,
767 preferred_ml_framework_str,
789 template <
typename T>
795 const int64_t max_tree_depth,
796 const int64_t min_observations_per_leaf_node,
797 const int32_t cat_top_k,
798 const float cat_min_fraction,
802 std::vector<std::vector<std::string>> empty_cat_feature_keys;
804 input_cat_features, cat_top_k, cat_min_fraction,
false );
811 min_observations_per_leaf_node,
812 preferred_ml_framework_str,
// Gradient boosted trees regression training body, templated on the value
// type T. Visible flow: reject empty training input, validate the requested ML
// framework, refuse frameworks other than oneDAL, build a null-free combined
// view of labels and features, fit with oneDAL, and write the model name id to
// the single output row.
817 template <
typename T>
823 const std::vector<std::vector<std::string>>& cat_feature_keys,
824 const int64_t max_iterations,
825 const int64_t max_tree_depth,
826 const double shrinkage,
827 const double min_split_loss,
829 const double obs_per_tree_fraction,
830 const int64_t features_per_node,
831 const int64_t min_observations_per_leaf_node,
832 const int64_t max_bins,
833 const int64_t min_bin_size,
837 if (input_labels.
size() == 0) {
838 return mgr.ERROR_MESSAGE(
839 "No rows exist in training data. Training data must at least contain 1 row.");
841 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
843 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
// The same message is returned from two separate exits whose guard conditions
// are not shown - presumably a build-time and a run-time check; TODO confirm.
847 return mgr.ERROR_MESSAGE(
"Only OneDAL framework supported for GBT regression.");
850 return mgr.ERROR_MESSAGE(
"Only OneDAL framework supported for GBT regression.");
// Labels and features are denulled together; column 0 of the result is the
// label column, as shown by the [0, 1) slice taken for labels_ptrs.
853 const auto denulled_data =
denull_data(input_labels, input_features);
854 const auto labels_ptrs =
pluck_ptrs(denulled_data.data, 0L, 1L);
855 const auto features_ptrs =
859 bool did_execute =
false;
863 onedal_gbt_reg_fit_impl<T>(model_name,
868 denulled_data.masked_num_rows,
874 obs_per_tree_fraction,
876 min_observations_per_leaf_node,
// model_name_str_id is computed on a line not shown in this excerpt.
881 output_model_name[0] = model_name_str_id;
// Reported when no backend handled the request (guard condition not shown).
886 return mgr.ERROR_MESSAGE(
"Cannot find " + preferred_ml_framework_str.
getString() +
887 " ML library to support GBT regression implementation.");
889 }
// Runtime errors raised inside the try block become table function errors.
catch (std::runtime_error& e) {
890 return mgr.ERROR_MESSAGE(e.what());
916 template <
typename T>
922 const int64_t max_iterations,
923 const int64_t max_tree_depth,
924 const double shrinkage,
925 const double min_split_loss,
927 const double obs_per_tree_fraction,
928 const int64_t features_per_node,
929 const int64_t min_observations_per_leaf_node,
930 const int64_t max_bins,
931 const int64_t min_bin_size,
935 std::vector<std::vector<std::string>> empty_cat_feature_keys;
940 empty_cat_feature_keys,
946 obs_per_tree_fraction,
948 min_observations_per_leaf_node,
951 preferred_ml_framework_str,
981 template <
typename T>
988 const int64_t max_iterations,
989 const int64_t max_tree_depth,
990 const double shrinkage,
991 const double min_split_loss,
993 const double obs_per_tree_fraction,
994 const int64_t features_per_node,
995 const int64_t min_observations_per_leaf_node,
996 const int64_t max_bins,
997 const int64_t min_bin_size,
998 const int32_t cat_top_k,
999 const float cat_min_fraction,
1004 input_numeric_features,
1018 obs_per_tree_fraction,
1020 min_observations_per_leaf_node,
1023 preferred_ml_framework_str,
1053 template <
typename T>
1059 const int64_t max_iterations,
1060 const int64_t max_tree_depth,
1061 const double shrinkage,
1062 const double min_split_loss,
1063 const double lambda,
1064 const double obs_per_tree_fraction,
1065 const int64_t features_per_node,
1066 const int64_t min_observations_per_leaf_node,
1067 const int64_t max_bins,
1068 const int64_t min_bin_size,
1069 const int32_t cat_top_k,
1070 const float cat_min_fraction,
1075 input_cat_features, cat_top_k, cat_min_fraction,
false );
1086 obs_per_tree_fraction,
1088 min_observations_per_leaf_node,
1091 preferred_ml_framework_str,
// Random forest regression training body, templated on the value type T.
// Visible flow: reject empty training input, validate the requested ML
// framework, refuse frameworks other than oneDAL, build a null-free combined
// view of labels and features, validate the variable importance metric, fit
// with the oneDAL histogram or default dense training method depending on
// use_histogram, and write the model name id to the single output row.
1096 template <
typename T>
1102 const std::vector<std::vector<std::string>>& cat_feature_keys,
1103 const int64_t num_trees,
1104 const double obs_per_tree_fraction,
1105 const int64_t max_tree_depth,
1106 const int64_t features_per_node,
1107 const double impurity_threshold,
1108 const bool bootstrap,
1109 const int64_t min_obs_per_leaf_node,
1110 const int64_t min_obs_per_split_node,
1111 const double min_weight_fraction_in_leaf_node,
1112 const double min_impurity_decrease_in_split_node,
1113 const int64_t max_leaf_nodes,
1114 const bool use_histogram,
1119 if (input_labels.
size() == 0) {
1120 return mgr.ERROR_MESSAGE(
1121 "No rows exist in training data. Training data must at least contain 1 row.");
1123 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
1125 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
1126 preferred_ml_framework_str.
getString());
// The same message is returned from two separate exits whose guard conditions
// are not shown - presumably a build-time and a run-time check; TODO confirm.
1129 return mgr.ERROR_MESSAGE(
1130 "Only OneDAL framework supported for random forest regression.");
1133 return mgr.ERROR_MESSAGE(
1134 "Only OneDAL framework supported for random forest regression.");
// Labels and features are denulled together; column 0 of the result is the
// label column, as shown by the [0, 1) slice taken for labels_ptrs.
1137 const auto denulled_data =
denull_data(input_labels, input_features);
1138 const auto labels_ptrs =
pluck_ptrs(denulled_data.data, 0L, 1L);
1139 const auto features_ptrs =
1143 bool did_execute =
false;
// The metric is parsed on a line not shown; an unrecognized value is rejected.
1144 const auto var_importance_metric =
1147 return mgr.ERROR_MESSAGE(
"Invalid variable importance metric: " +
// use_histogram selects the training method at compile time through the
// second template argument: hist here, defaultDense in the other call.
1153 if (use_histogram) {
1154 onedal_random_forest_reg_fit_impl<T, decision_forest::regression::training::hist>(
1160 denulled_data.masked_num_rows,
1162 obs_per_tree_fraction,
1167 min_obs_per_leaf_node,
1168 min_obs_per_split_node,
1169 min_weight_fraction_in_leaf_node,
1170 min_impurity_decrease_in_split_node,
1172 var_importance_metric);
1174 onedal_random_forest_reg_fit_impl<
1176 decision_forest::regression::training::defaultDense>(
1182 denulled_data.masked_num_rows,
1184 obs_per_tree_fraction,
1189 min_obs_per_leaf_node,
1190 min_obs_per_split_node,
1191 min_weight_fraction_in_leaf_node,
1192 min_impurity_decrease_in_split_node,
1194 var_importance_metric);
// model_name_str_id is computed on a line not shown in this excerpt.
1198 output_model_name[0] = model_name_str_id;
// Reported when no backend handled the request (guard condition not shown).
1203 return mgr.ERROR_MESSAGE(
1204 "Cannot find " + preferred_ml_framework_str.
getString() +
1205 " ML library to support random forest regression implementation.");
1207 }
// Runtime errors raised inside the try block become table function errors.
catch (std::runtime_error& e) {
1208 return mgr.ERROR_MESSAGE(e.what());
1239 template <
typename T>
1245 const int64_t num_trees,
1246 const double obs_per_tree_fraction,
1247 const int64_t max_tree_depth,
1248 const int64_t features_per_node,
1249 const double impurity_threshold,
1250 const bool bootstrap,
1251 const int64_t min_obs_per_leaf_node,
1252 const int64_t min_obs_per_split_node,
1253 const double min_weight_fraction_in_leaf_node,
1254 const double min_impurity_decrease_in_split_node,
1255 const int64_t max_leaf_nodes,
1256 const bool use_histogram,
1261 std::vector<std::vector<std::string>> empty_cat_feature_keys;
1266 empty_cat_feature_keys,
1268 obs_per_tree_fraction,
1273 min_obs_per_leaf_node,
1274 min_obs_per_split_node,
1275 min_weight_fraction_in_leaf_node,
1276 min_impurity_decrease_in_split_node,
1279 var_importance_metric_str,
1280 preferred_ml_framework_str,
1313 template <
typename T>
1320 const int64_t num_trees,
1321 const double obs_per_tree_fraction,
1322 const int64_t max_tree_depth,
1323 const int64_t features_per_node,
1324 const double impurity_threshold,
1325 const bool bootstrap,
1326 const int64_t min_obs_per_leaf_node,
1327 const int64_t min_obs_per_split_node,
1328 const double min_weight_fraction_in_leaf_node,
1329 const double min_impurity_decrease_in_split_node,
1330 const int64_t max_leaf_nodes,
1331 const bool use_histogram,
1333 const int32_t cat_top_k,
1334 const float cat_min_fraction,
1339 input_numeric_features,
1349 obs_per_tree_fraction,
1354 min_obs_per_leaf_node,
1355 min_obs_per_split_node,
1356 min_weight_fraction_in_leaf_node,
1357 min_impurity_decrease_in_split_node,
1360 var_importance_metric_str,
1361 preferred_ml_framework_str,
1394 template <
typename T>
1400 const int64_t num_trees,
1401 const double obs_per_tree_fraction,
1402 const int64_t max_tree_depth,
1403 const int64_t features_per_node,
1404 const double impurity_threshold,
1405 const bool bootstrap,
1406 const int64_t min_obs_per_leaf_node,
1407 const int64_t min_obs_per_split_node,
1408 const double min_weight_fraction_in_leaf_node,
1409 const double min_impurity_decrease_in_split_node,
1410 const int64_t max_leaf_nodes,
1411 const bool use_histogram,
1413 const int32_t cat_top_k,
1414 const float cat_min_fraction,
1419 input_cat_features, cat_top_k, cat_min_fraction,
false );
1426 obs_per_tree_fraction,
1431 min_obs_per_leaf_node,
1432 min_obs_per_split_node,
1433 min_weight_fraction_in_leaf_node,
1434 min_impurity_decrease_in_split_node,
1437 var_importance_metric_str,
1438 preferred_ml_framework_str,
// PCA training body, templated on the value type T. Visible flow: reject empty
// input, validate the requested ML framework, build a null-free view of the
// features and reject the case where no rows survive, normalize the columns,
// run oneDAL PCA on the normalized data, build a PcaModel from the
// normalization statistics and the PCA result, and write the model name id to
// the single output row.
1443 template <
typename T>
1448 const std::vector<std::vector<std::string>>& cat_feature_keys,
1452 if (input_features.
size() == 0) {
1453 return mgr.ERROR_MESSAGE(
1454 "No rows exist in training data. Training data must at least contain 1 row.");
1456 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
1458 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
1459 preferred_ml_framework_str.
getString());
1462 const auto denulled_data =
denull_data(input_features);
// Second emptiness check: the input had rows, but none remained in the
// denulled view.
1463 const int64_t num_rows = denulled_data.masked_num_rows;
1464 if (num_rows == 0) {
1465 return mgr.ERROR_MESSAGE(
1466 "No non-null rows exist in training data. Training data must at least contain "
1470 const auto features_ptrs =
// The summary object carries the normalized columns together with the means
// and standard deviations that are later stored in the model. Judging by the
// variable name this is z-score normalization - TODO confirm; the producing
// call is on a line not shown.
1473 const auto z_std_norm_summary_stats =
1475 const auto normalized_ptrs =
1476 pluck_ptrs(z_std_norm_summary_stats.normalized_data,
1478 z_std_norm_summary_stats.normalized_data.size());
1479 bool did_execute =
false;
1483 const auto [eigenvectors, eigenvalues] =
1484 onedal_pca_impl(normalized_ptrs, denulled_data.masked_num_rows);
1485 auto model = std::make_shared<PcaModel>(z_std_norm_summary_stats.means,
1486 z_std_norm_summary_stats.std_devs,
// Reported when no backend handled the request (guard condition not shown).
1496 return mgr.ERROR_MESSAGE(
"Cannot find " + preferred_ml_framework_str.
getString() +
1497 " ML library to support PCA implementation.");
// model_name_str_id is computed on a line not shown in this excerpt.
1502 output_model_name[0] = model_name_str_id;
1504 }
// Runtime errors raised inside the try block become table function errors.
catch (std::runtime_error& e) {
1505 return mgr.ERROR_MESSAGE(e.what());
1522 template <
typename T>
1530 std::vector<std::vector<std::string>> empty_cat_feature_keys;
1534 empty_cat_feature_keys,
1535 preferred_ml_framework_str,
1555 template <
typename T>
1561 const int32_t cat_top_k,
1562 const float cat_min_fraction,
1567 input_numeric_features,
1575 preferred_ml_framework_str,
1599 const int32_t cat_top_k,
1600 const float cat_min_fraction,
// Regression prediction body shared by all model kinds, templated on the
// prediction type T and the id type K. Visible flow: validate the requested ML
// framework, build a null-free view of the features, dispatch on the model
// type to the matching backend predict routine, pass the ids through, write
// predictions back to output_predictions, and return the input row count.
1605 template <
typename T,
typename K>
1608 const std::shared_ptr<AbstractMLModel>& model,
1614 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
1616 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
1617 preferred_ml_framework_str.
getString());
1619 const auto denulled_data =
denull_data(input_features);
1620 const int64_t num_rows = denulled_data.masked_num_rows;
// True when the masked row count is smaller than the unmasked row count.
1621 const bool data_is_masked =
1622 denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
// When rows were masked, predictions go to a temporary buffer of num_rows
// entries; otherwise they are written directly into the output column buffer.
1623 std::vector<T> denulled_output_allocation(data_is_masked ? num_rows : 0);
1625 T* denulled_output =
1626 data_is_masked ? denulled_output_allocation.data() : output_predictions.
ptr_;
1627 const auto features_ptrs =
pluck_ptrs(denulled_data.data, 0L, input_features.
numCols());
1630 bool did_execute =
false;
// Each case downcasts the abstract model to its concrete type and CHECKs that
// the cast succeeded before calling the backend. The case labels themselves
// are on lines not shown in this excerpt.
1631 const auto model_type = model->getModelType();
1632 switch (model_type) {
1634 const auto linear_reg_model =
1636 CHECK(linear_reg_model);
// Linear regression is the only model kind with both a oneDAL and an mlpack
// predict call visible here.
1640 onedal_linear_reg_predict_impl(
1641 linear_reg_model, features_ptrs, denulled_output, num_rows);
1648 mlpack_linear_reg_predict_impl(
1649 linear_reg_model, features_ptrs, denulled_output, num_rows);
1657 const auto decision_tree_reg_model =
1658 std::dynamic_pointer_cast<DecisionTreeRegressionModel>(model);
1659 CHECK(decision_tree_reg_model);
1662 onedal_decision_tree_reg_predict_impl(
1663 decision_tree_reg_model, features_ptrs, denulled_output, num_rows);
1671 const auto gbt_reg_model = std::dynamic_pointer_cast<GbtRegressionModel>(model);
1672 CHECK(gbt_reg_model);
1675 onedal_gbt_reg_predict_impl(
1676 gbt_reg_model, features_ptrs, denulled_output, num_rows);
1684 const auto random_forest_reg_model =
1685 std::dynamic_pointer_cast<RandomForestRegressionModel>(model);
1686 CHECK(random_forest_reg_model);
1689 onedal_random_forest_reg_predict_impl(
1690 random_forest_reg_model, features_ptrs, denulled_output, num_rows);
// Any other model type is rejected with an exception, which the handler
// below turns into a table function error.
1697 throw std::runtime_error(
"Unsupported model type");
// Reported when no backend handled the request (guard condition not shown).
1701 return mgr.ERROR_MESSAGE(
"Cannot find " + preferred_ml_framework_str.
getString() +
1702 " ML library to support model implementation.");
1704 }
catch (std::runtime_error& e) {
1705 const std::string error_str(e.what());
1706 return mgr.ERROR_MESSAGE(error_str);
// Ids are copied to the output unchanged, after prediction has succeeded.
1708 output_ids = input_ids;
// For masked data the temporary results are mapped back onto the full-size
// output using reverse_index_map; the callee name is not visible here.
// Presumably unmapped rows receive inline_null_value<T>() - TODO confirm.
1709 if (data_is_masked) {
1711 denulled_data.reverse_index_map,
1712 output_predictions.
ptr_,
1713 denulled_data.unmasked_num_rows,
1714 inline_null_value<T>());
// On success the function returns the number of input rows.
1716 return input_ids.
size();
1730 template <
typename T,
typename K>
1746 preferred_ml_framework_str,
1748 output_predictions);
1749 }
catch (std::runtime_error& e) {
1750 const std::string error_str(e.what());
1751 return mgr.ERROR_MESSAGE(error_str);
1766 template <
typename T,
typename K>
1779 model, input_cat_features.
numCols(), input_numeric_features.
numCols());
1781 input_cat_features, input_numeric_features, model->getCatFeatureKeys());
1785 cat_features_builder.getFeatures(),
1786 preferred_ml_framework_str,
1788 output_predictions);
1789 }
catch (std::runtime_error& e) {
1790 const std::string error_str(e.what());
1791 return mgr.ERROR_MESSAGE(error_str);
1806 template <
typename T,
typename K>
1819 model->getCatFeatureKeys());
1823 cat_features_builder.getFeatures(),
1824 preferred_ml_framework_str,
1826 output_predictions);
1827 }
catch (std::runtime_error& e) {
1828 const std::string error_str(e.what());
1829 return mgr.ERROR_MESSAGE(error_str);
1844 template <
typename T,
typename K>
1853 if (model_name.
size() != 1) {
1854 return mgr.ERROR_MESSAGE(
"Expected only one row in model CURSOR.");
1858 model_name_text_enc_none,
1861 preferred_ml_framework_str,
1863 output_predictions);
1877 template <
typename T,
typename K>
1887 if (model_name.
size() != 1) {
1888 return mgr.ERROR_MESSAGE(
"Expected only one row in model CURSOR.");
1892 model_name_text_enc_none,
1895 input_numeric_features,
1896 preferred_ml_framework_str,
1898 output_predictions);
1912 template <
typename T,
typename K>
1921 if (model_name.
size() != 1) {
1922 return mgr.ERROR_MESSAGE(
"Expected only one row in model CURSOR.");
1926 model_name_text_enc_none,
1929 preferred_ml_framework_str,
1931 output_predictions);
// Computes the R-squared score of a model against labelled evaluation data
// and stores it in output_r2[0]. Visible flow: reject empty input, obtain
// predictions into a local column, accumulate two sums of squares in parallel
// with TBB, then combine them as 1 - (residual sum / total sum).
1934 template <
typename T>
1936 const std::shared_ptr<AbstractMLModel>& model,
1940 const int64_t num_rows = input_labels.
size();
1941 if (num_rows == 0) {
1942 return mgr.ERROR_MESSAGE(
1943 "No rows exist in evaluation data. Evaluation data must at least contain 1 row.");
// Predictions are written into a locally owned vector wrapped as a Column;
// the id vectors are scratch inputs for the predict call on lines not shown.
1945 std::vector<T> output_predictions_vec(num_rows);
1946 Column<T> output_predictions(output_predictions_vec);
1947 std::vector<int64_t> input_ids_vec(num_rows);
1948 std::vector<int64_t> output_ids_vec(num_rows);
1959 ml_framework_encoding_none,
1961 output_predictions);
1967 }
catch (std::runtime_error& e) {
1969 return mgr.ERROR_MESSAGE(e.what());
// Thread count is the smaller of the hardware thread count and the number of
// max_inputs_per_thread sized chunks needed to cover num_rows (rounded up).
// max_inputs_per_thread is declared on a line not shown in this excerpt.
// NOTE(review): std::thread::hardware_concurrency may return 0 when the value
// is not computable; num_threads would then be 0 and the per-thread vectors
// below would be empty. Confirm whether that case is guarded elsewhere.
1976 const size_t max_thread_count = std::thread::hardware_concurrency();
1978 const size_t num_threads = std::min(
1979 max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
// One accumulator slot per arena thread, so workers never share a slot.
1981 std::vector<double> local_sum_squared_regressions(num_threads, 0.0);
1982 std::vector<double> local_sum_squares(num_threads, 0.0);
// The arena caps concurrency at num_threads, which keeps the thread index
// used below within the bounds of the accumulator vectors.
1984 tbb::task_arena limited_arena(num_threads);
1986 limited_arena.execute([&] {
1988 tbb::blocked_range<int64_t>(0, num_rows),
1989 [&](
const tbb::blocked_range<int64_t>& r) {
1990 const int64_t start_idx = r.begin();
1991 const int64_t end_idx = r.end();
1992 double local_sum_squared_regression{0.0};
1993 double local_sum_square{0.0};
1994 for (int64_t row_idx = start_idx; row_idx < end_idx; ++row_idx) {
// Rows whose prediction is the null sentinel are excluded from both sums.
1995 if (output_predictions[row_idx] != inline_null_value<T>()) {
// Despite the variable name, this term is the squared residual
// (label minus prediction).
1996 local_sum_squared_regression +=
1997 (input_labels[row_idx] - output_predictions[row_idx]) *
1998 (input_labels[row_idx] - output_predictions[row_idx]);
// Squared deviation of the label from labels_mean, which is computed on
// lines not shown in this excerpt.
1999 local_sum_square += (input_labels[row_idx] - labels_mean) *
2000 (input_labels[row_idx] - labels_mean);
// A thread may process several ranges, hence += into its own slot.
2003 const size_t thread_idx = tbb::this_task_arena::current_thread_index();
2004 local_sum_squared_regressions[thread_idx] += local_sum_squared_regression;
2005 local_sum_squares[thread_idx] += local_sum_square;
// Serial reduction of the per-thread partial sums.
2008 double sum_squared_regression{0.0};
2009 double sum_squares{0.0};
2010 for (
size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
2011 sum_squared_regression += local_sum_squared_regressions[thread_idx];
2012 sum_squares += local_sum_squares[thread_idx];
// A zero total sum of squares yields a score of 1.0 instead of dividing by
// zero.
2014 output_r2[0] = sum_squares == 0.0 ? 1.0 : 1.0 - (sum_squared_regression / sum_squares);
2027 template <
typename T>
2036 return r2_score_impl(mgr, model, input_labels, input_features, output_r2);
2037 }
catch (std::runtime_error& e) {
2038 const std::string error_str(e.what());
2039 return mgr.ERROR_MESSAGE(error_str);
2052 template <
typename T>
2059 if (model_name.
size() != 1) {
2060 return mgr.ERROR_MESSAGE(
"Expected only one row in model name CURSOR.");
2064 mgr, model_name_text_enc_none, input_labels, input_features, output_r2);
2075 template <
typename T>
2086 model, input_cat_features.
numCols(), input_numeric_features.
numCols());
2088 input_cat_features, input_numeric_features, model->getCatFeatureKeys());
2090 mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2091 }
catch (std::runtime_error& e) {
2092 const std::string error_str(e.what());
2093 return mgr.ERROR_MESSAGE(error_str);
2105 template <
typename T>
2116 model->getCatFeatureKeys());
2118 mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2119 }
catch (std::runtime_error& e) {
2120 const std::string error_str(e.what());
2121 return mgr.ERROR_MESSAGE(error_str);
2133 template <
typename T>
2141 if (model_name.
size() != 1) {
2142 return mgr.ERROR_MESSAGE(
"Expected only one row in model name CURSOR.");
2144 const std::string model_name_str{model_name.
getString(0)};
2148 model, input_cat_features.
numCols(), input_numeric_features.
numCols());
2150 input_cat_features, input_numeric_features, model->getCatFeatureKeys());
2152 mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2153 }
catch (std::runtime_error& e) {
2154 const std::string error_str(e.what());
2155 return mgr.ERROR_MESSAGE(error_str);
2245 #endif // #ifndef __CUDACC__
DEVICE const std::string getString(int64_t index) const
ColumnList< T > getFeatures()
NEVER_INLINE HOST int32_t pca_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
void set_output_row_size(int64_t num_rows)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const ColumnList< T > &numeric_features, const int32_t cat_top_k, const float cat_min_fraction, const bool cat_include_others)
KMeansInitStrategy get_kmeans_init_type(const std::string &init_type_str)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const int32_t cat_top_k, const float cat_min_fraction, const bool cat_include_others)
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
NEVER_INLINE HOST int32_t r2_score_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
std::string getString() const
NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
DEVICE int64_t size() const
NEVER_INLINE HOST int32_t decision_tree_reg_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
DEVICE int64_t numCols() const
EXTENSION_NOINLINE_HOST int32_t pca_fit__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< TextEncodingDict > &input_cat_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
EXTENSION_NOINLINE_HOST int32_t supported_ml_frameworks__cpu_(TableFunctionManager &mgr, Column< TextEncodingDict > &output_ml_frameworks, Column< bool > &output_availability, Column< bool > &output_default)
MaskedData< T > denull_data(const ColumnList< T > &features)
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t kmeans__cpu_template(TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const int num_clusters, const int num_iterations, const TextEncodingNone &init_type_str, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
NEVER_INLINE HOST int32_t pca_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
MLFramework get_ml_framework(const std::string &ml_framework_str)
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
const size_t max_inputs_per_thread
NEVER_INLINE HOST int32_t random_forest_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const ColumnList< T > &numeric_features, const std::vector< std::vector< std::string >> &cat_feature_keys)
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
void addModel(const std::string &model_name, std::shared_ptr< AbstractMLModel > model)
VarImportanceMetric get_var_importance_metric(const std::string &var_importance_metric_str)
NEVER_INLINE HOST int32_t gbt_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
#define EXTENSION_NOINLINE_HOST
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
void disable_output_allocations()
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
DEVICE int64_t numCols() const
NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
std::vector< int8_t * > col_ptrs_
std::vector< TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodedCol< T > > one_hot_encoded_cols_
void unmask_data(const T *masked_input, const std::vector< int32_t > &reverse_index_map, T *unmasked_output, const int64_t num_unmasked_rows, const T null_val)
NEVER_INLINE HOST int32_t linear_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t dbscan__cpu_template(TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const double epsilon, const int32_t min_observations, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const std::vector< std::vector< std::string >> &cat_feature_keys)
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
std::vector< std::vector< T > > z_std_normalize_data(const std::vector< T * > &input_data, const int64_t num_rows)
NEVER_INLINE HOST int32_t ml_reg_predict_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
std::vector< std::vector< std::string > > cat_feature_keys_
DEVICE int64_t size() const
DEVICE int64_t size() const
NEVER_INLINE HOST int32_t r2_score__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
ZStdNormalizationSummaryStats< T > z_std_normalize_data_with_summary_stats(const std::vector< T * > &input_data, const int64_t num_rows)
void enable_output_allocations()
Column< T > create_wrapper_col(std::vector< T > &col_vec)
NEVER_INLINE HOST double get_column_mean(const T *data, const int64_t num_rows)
const std::vector< std::vector< std::string > > & getCatFeatureKeys() const