36 #include <tbb/parallel_for.h>
37 #include <tbb/task_arena.h>
39 using namespace TableFunctions_Namespace;
// Gathers one read-only raw pointer per inner vector of `data` for the half-open
// column range [start_idx, end_idx).
// Each pointer is obtained from std::vector::data(), so it aliases storage owned by
// `data`; the pointers are only usable while `data` is alive and not resized.
// NOTE(review): this excerpt has gaps (embedded numbering jumps 45 -> 48), so any
// validation of start_idx and the return statement are not visible here.
42 std::vector<const T*>
pluck_ptrs(
const std::vector<std::vector<T>>& data,
43 const int64_t start_idx,
44 const int64_t end_idx) {
45 std::vector<const T*> raw_ptrs;
// end_idx is exclusive, which is why it is compared against data.size() with LE
// rather than LT.
48 CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
49 for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
50 raw_ptrs.emplace_back(data[col_idx].data());
// Overload for data that is already a vector of raw column pointers: copies the
// pointers for columns [start_idx, end_idx) into a vector of pointers-to-const.
// No buffers are copied; the result refers to the same memory as the input pointers.
// NOTE(review): this excerpt has gaps (embedded numbering jumps 59 -> 62), so any
// validation of start_idx and the return statement are not visible here.
56 std::vector<const T*>
pluck_ptrs(
const std::vector<T*>& data,
57 const int64_t start_idx,
58 const int64_t end_idx) {
59 std::vector<const T*> raw_ptrs;
// end_idx is exclusive, so equality with data.size() is allowed.
62 CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
63 for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
64 raw_ptrs.emplace_back(data[col_idx]);
83 const int64_t num_cat_features,
84 const int64_t num_numeric_features);
100 template <
typename K,
typename T>
105 const int num_clusters,
106 const int num_iterations,
112 output_ids = input_ids;
115 return mgr.ERROR_MESSAGE(
"Invalid KMeans initializaiton strategy: " +
119 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
121 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
126 const auto denulled_data =
denull_data(input_features);
127 const int64_t num_rows = denulled_data.masked_num_rows;
128 const bool data_is_masked =
129 denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
130 std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
131 int32_t* denulled_output =
132 data_is_masked ? denulled_output_allocation.data() : output_clusters.
ptr_;
136 const auto normalized_ptrs =
pluck_ptrs(normalized_data, 0L, normalized_data.size());
138 bool did_execute =
false;
142 onedal_kmeans_impl(normalized_ptrs,
147 kmeans_init_strategy);
154 mlpack_kmeans_impl(normalized_ptrs,
159 kmeans_init_strategy);
164 return mgr.ERROR_MESSAGE(
"Cannot find " + preferred_ml_framework_str.
getString() +
165 " ML library to support kmeans implementation.");
168 if (data_is_masked) {
170 denulled_data.reverse_index_map,
171 output_clusters.
ptr_,
172 denulled_data.unmasked_num_rows,
173 inline_null_value<int32_t>());
175 }
catch (std::runtime_error& e) {
176 return mgr.ERROR_MESSAGE(e.what());
178 return input_ids.
size();
193 template <
typename K,
typename T>
198 const double epsilon,
199 const int32_t min_observations,
204 output_ids = input_ids;
206 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
208 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
213 const auto denulled_data =
denull_data(input_features);
214 const int64_t num_rows = denulled_data.masked_num_rows;
215 const bool data_is_masked =
216 denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
217 std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
218 int32_t* denulled_output =
219 data_is_masked ? denulled_output_allocation.data() : output_clusters.
ptr_;
223 const auto normalized_ptrs =
pluck_ptrs(normalized_data, 0L, normalized_data.size());
225 bool did_execute =
false;
230 normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
238 normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
243 return mgr.ERROR_MESSAGE(
"Cannot find " + preferred_ml_framework_str.
getString() +
244 " ML library to support dbscan implementation.");
247 if (data_is_masked) {
249 denulled_data.reverse_index_map,
250 output_clusters.
ptr_,
251 denulled_data.unmasked_num_rows,
252 inline_null_value<int32_t>());
254 }
catch (std::runtime_error& e) {
255 return mgr.ERROR_MESSAGE(e.what());
257 return input_ids.
size();
260 template <
typename T>
266 const std::vector<std::vector<std::string>>& cat_feature_keys,
270 if (input_labels.
size() == 0) {
271 return mgr.ERROR_MESSAGE(
272 "No rows exist in training data. Training data must at least contain 1 row.");
274 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
276 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
279 const auto denulled_data =
denull_data(input_labels, input_features);
280 const auto labels_ptrs =
pluck_ptrs(denulled_data.data, 0L, 1L);
281 const auto features_ptrs =
283 const int64_t num_coefs = input_features.
numCols() + 1;
285 std::vector<int64_t> coef_idxs(num_coefs);
286 std::vector<double> coefs(num_coefs);
288 bool did_execute =
false;
292 onedal_linear_reg_fit_impl(labels_ptrs[0],
296 denulled_data.masked_num_rows);
303 mlpack_linear_reg_fit_impl(labels_ptrs[0],
307 denulled_data.masked_num_rows);
312 return mgr.ERROR_MESSAGE(
313 "Cannot find " + preferred_ml_framework_str.
getString() +
314 " ML library to support linear regression implementation.");
316 }
catch (std::runtime_error& e) {
317 return mgr.ERROR_MESSAGE(e.what());
320 std::make_shared<LinearRegressionModel>(coefs, model_metadata, cat_feature_keys);
322 const std::string model_name_str = model_name.
getString();
325 output_model_name[0] = model_name_str_id;
340 template <
typename T>
349 std::vector<std::vector<std::string>> empty_cat_feature_keys;
354 empty_cat_feature_keys,
355 preferred_ml_framework_str,
360 template <
typename T>
365 const int32_t cat_top_k,
366 const float cat_min_fraction,
367 const bool cat_include_others)
368 : num_rows_(numeric_features.size()) {
// Constructor body for the categorical + numeric case. num_rows_ is taken from the
// numeric feature list.
// A single encoding configuration is built from (cat_top_k, cat_min_fraction,
// cat_include_others); its type is declared on a line missing from this excerpt.
370 one_hot_encoding_info(cat_top_k, cat_min_fraction, cat_include_others);
371 const size_t num_cat_features =
static_cast<size_t>(cat_features.
numCols());
// The same configuration is replicated once per categorical column.
372 std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
373 one_hot_encoding_infos;
374 for (
size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
375 one_hot_encoding_infos.emplace_back(one_hot_encoding_info);
// The encoded columns are stored in a member; col_ptrs_ below holds pointers into
// these buffers, so they must live as long as this object does.
377 one_hot_encoded_cols_ =
378 TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
379 cat_features, one_hot_encoding_infos);
// For every encoded column: remember its category keys, then register each encoded
// buffer as a feature column (pointer reinterpreted as int8_t*).
380 for (
auto& one_hot_encoded_col : one_hot_encoded_cols_) {
381 cat_feature_keys_.emplace_back(one_hot_encoded_col.cat_features);
382 for (
auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
383 col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
// Numeric columns are appended after all one-hot buffers, so the final column order
// is: encoded categorical buffers first, then numeric features.
386 const int64_t num_numeric_features = numeric_features.
numCols();
387 for (int64_t numeric_feature_idx = 0; numeric_feature_idx < num_numeric_features;
388 ++numeric_feature_idx) {
389 col_ptrs_.emplace_back(numeric_features.
ptrs_[numeric_feature_idx]);
394 const int32_t cat_top_k,
395 const float cat_min_fraction,
396 const bool cat_include_others)
397 : num_rows_(cat_features.size()) {
// Constructor body for the categorical-only case. With no numeric list available,
// num_rows_ comes from the categorical feature list instead.
// A single encoding configuration is built from (cat_top_k, cat_min_fraction,
// cat_include_others); its type is declared on a line missing from this excerpt.
399 one_hot_encoding_info(cat_top_k, cat_min_fraction, cat_include_others);
400 const size_t num_cat_features =
static_cast<size_t>(cat_features.
numCols());
// The same configuration is replicated once per categorical column.
401 std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
402 one_hot_encoding_infos;
403 for (
size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
404 one_hot_encoding_infos.emplace_back(one_hot_encoding_info);
// The encoded columns are stored in a member; col_ptrs_ below holds pointers into
// these buffers, so they must live as long as this object does.
406 one_hot_encoded_cols_ =
407 TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
408 cat_features, one_hot_encoding_infos);
// For every encoded column: remember its category keys, then register each encoded
// buffer as a feature column (pointer reinterpreted as int8_t*).
409 for (
auto& one_hot_encoded_col : one_hot_encoded_cols_) {
410 cat_feature_keys_.emplace_back(one_hot_encoded_col.cat_features);
411 for (
auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
412 col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
420 const std::vector<std::vector<std::string>>& cat_feature_keys)
421 : num_rows_(numeric_features.size()), cat_feature_keys_(cat_feature_keys) {
// Constructor body for categorical + numeric features with caller-supplied category
// keys. The keys are copied into cat_feature_keys_ by the initializer list, so this
// body does not append to that member.
422 const size_t num_cat_features =
static_cast<size_t>(cat_features.
numCols());
// The supplied keys must cover exactly the categorical columns provided; a mismatch
// is reported by throwing std::runtime_error.
423 if (num_cat_features != cat_feature_keys_.size()) {
424 throw std::runtime_error(
425 "Number of provided categorical features does not match number of categorical "
426 "features in the model.");
// One encoding configuration per column, each constructed from that column's keys.
428 std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
429 one_hot_encoding_infos;
430 for (
size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
431 one_hot_encoding_infos.emplace_back(cat_feature_keys_[cat_idx]);
// The encoded columns are stored in a member; col_ptrs_ below holds pointers into
// these buffers, so they must live as long as this object does.
433 one_hot_encoded_cols_ =
434 TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
435 cat_features, one_hot_encoding_infos);
// Register each encoded buffer as a feature column (pointer reinterpreted as int8_t*).
436 for (
auto& one_hot_encoded_col : one_hot_encoded_cols_) {
437 for (
auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
438 col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
// Numeric columns are appended after all one-hot buffers, so the final column order
// is: encoded categorical buffers first, then numeric features.
441 const int64_t num_numeric_features = numeric_features.
numCols();
442 for (int64_t numeric_feature_idx = 0; numeric_feature_idx < num_numeric_features;
443 ++numeric_feature_idx) {
444 col_ptrs_.emplace_back(numeric_features.
ptrs_[numeric_feature_idx]);
450 const std::vector<std::vector<std::string>>& cat_feature_keys)
451 : num_rows_(cat_features.size()), cat_feature_keys_(cat_feature_keys) {
// Constructor body for categorical-only features with caller-supplied category keys.
// num_rows_ comes from the categorical list, and the keys are copied into
// cat_feature_keys_ by the initializer list.
452 const size_t num_cat_features =
static_cast<size_t>(cat_features.
numCols());
// The supplied keys must cover exactly the categorical columns provided; a mismatch
// is reported by throwing std::runtime_error.
453 if (num_cat_features != cat_feature_keys_.size()) {
454 throw std::runtime_error(
455 "Number of provided categorical features does not match number of categorical "
456 "features in the model.");
// One encoding configuration per column, each constructed from that column's keys.
458 std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
459 one_hot_encoding_infos;
460 for (
size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
461 one_hot_encoding_infos.emplace_back(cat_feature_keys_[cat_idx]);
// The encoded columns are stored in a member; col_ptrs_ below holds pointers into
// these buffers, so they must live as long as this object does.
463 one_hot_encoded_cols_ =
464 TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
465 cat_features, one_hot_encoding_infos);
// Register each encoded buffer as a feature column (pointer reinterpreted as int8_t*).
466 for (
auto& one_hot_encoded_col : one_hot_encoded_cols_) {
467 for (
auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
468 col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
475 col_ptrs_.data(),
static_cast<int64_t
>(col_ptrs_.size()), num_rows_);
479 return cat_feature_keys_;
484 std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodedCol<T>>
504 template <
typename T>
511 const int32_t cat_top_k,
512 const float cat_min_fraction,
517 input_numeric_features,
527 preferred_ml_framework_str,
545 template <
typename T>
551 const int32_t cat_top_k,
552 const float cat_min_fraction,
557 input_cat_features, cat_top_k, cat_min_fraction,
false );
564 preferred_ml_framework_str,
569 template <
typename T>
571 Column<T> wrapper_col(col_vec.data(),
static_cast<int64_t
>(col_vec.size()));
// Decision tree regression training entry point (most of the signature lies on lines
// missing from this excerpt).
// Visible flow: reject empty input, resolve and validate the requested ML framework,
// run denull_data over labels + features, then hand off to
// onedal_decision_tree_reg_fit_impl<T>. Every failure path shown here returns
// mgr.ERROR_MESSAGE(...) instead of letting an exception escape.
613 template <
typename T>
619 const std::vector<std::vector<std::string>>& cat_feature_keys,
620 const int64_t max_tree_depth,
621 const int64_t min_observations_per_leaf_node,
// Guard: a training set with zero rows is rejected before any work is done.
625 if (input_labels.
size() == 0) {
626 return mgr.ERROR_MESSAGE(
627 "No rows exist in training data. Training data must at least contain 1 row.");
629 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
631 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
// Two separate rejections share the same message; the conditions that select them
// are on lines not present in this excerpt.
635 return mgr.ERROR_MESSAGE(
636 "Only OneDAL framework supported for decision tree regression.");
639 return mgr.ERROR_MESSAGE(
640 "Only OneDAL framework supported for decision tree regression.");
// Labels are passed first to denull_data, and column range [0, 1) of the result is
// then plucked as the label pointer.
// NOTE(review): denull_data presumably drops rows containing nulls - confirm against
// its definition.
643 const auto denulled_data =
denull_data(input_labels, input_features);
644 const auto labels_ptrs =
pluck_ptrs(denulled_data.data, 0L, 1L);
645 const auto features_ptrs =
649 bool did_execute =
false;
// The fit is given masked_num_rows, the row count reported by denull_data, rather
// than the raw input size.
653 onedal_decision_tree_reg_fit_impl<T>(model_name,
658 denulled_data.masked_num_rows,
660 min_observations_per_leaf_node);
663 output_model_name[0] = model_name_str_id;
668 return mgr.ERROR_MESSAGE(
669 "Cannot find " + preferred_ml_framework_str.
getString() +
670 " ML library to support decision tree regression implementation.");
// std::runtime_error thrown inside the try block is turned into an error message.
672 }
catch (std::runtime_error& e) {
673 return mgr.ERROR_MESSAGE(e.what());
691 template <
typename T>
697 const int64_t max_tree_depth,
698 const int64_t min_observations_per_leaf_node,
702 std::vector<std::vector<std::string>> empty_cat_feature_keys;
707 empty_cat_feature_keys,
709 min_observations_per_leaf_node,
710 preferred_ml_framework_str,
730 template <
typename T>
737 const int64_t max_tree_depth,
738 const int64_t min_observations_per_leaf_node,
739 const int32_t cat_top_k,
740 const float cat_min_fraction,
744 std::vector<std::vector<std::string>> empty_cat_feature_keys;
746 input_numeric_features,
756 min_observations_per_leaf_node,
757 preferred_ml_framework_str,
777 template <
typename T>
783 const int64_t max_tree_depth,
784 const int64_t min_observations_per_leaf_node,
785 const int32_t cat_top_k,
786 const float cat_min_fraction,
790 std::vector<std::vector<std::string>> empty_cat_feature_keys;
792 input_cat_features, cat_top_k, cat_min_fraction,
false );
799 min_observations_per_leaf_node,
800 preferred_ml_framework_str,
805 template <
typename T>
811 const std::vector<std::vector<std::string>>& cat_feature_keys,
812 const int64_t max_iterations,
813 const int64_t max_tree_depth,
814 const double shrinkage,
815 const double min_split_loss,
817 const double obs_per_tree_fraction,
818 const int64_t features_per_node,
819 const int64_t min_observations_per_leaf_node,
820 const int64_t max_bins,
821 const int64_t min_bin_size,
825 if (input_labels.
size() == 0) {
826 return mgr.ERROR_MESSAGE(
827 "No rows exist in training data. Training data must at least contain 1 row.");
829 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
831 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
835 return mgr.ERROR_MESSAGE(
"Only OneDAL framework supported for GBT regression.");
838 return mgr.ERROR_MESSAGE(
"Only OneDAL framework supported for GBT regression.");
841 const auto denulled_data =
denull_data(input_labels, input_features);
842 const auto labels_ptrs =
pluck_ptrs(denulled_data.data, 0L, 1L);
843 const auto features_ptrs =
847 bool did_execute =
false;
851 onedal_gbt_reg_fit_impl<T>(model_name,
856 denulled_data.masked_num_rows,
862 obs_per_tree_fraction,
864 min_observations_per_leaf_node,
869 output_model_name[0] = model_name_str_id;
874 return mgr.ERROR_MESSAGE(
"Cannot find " + preferred_ml_framework_str.
getString() +
875 " ML library to support GBT regression implementation.");
877 }
catch (std::runtime_error& e) {
878 return mgr.ERROR_MESSAGE(e.what());
904 template <
typename T>
910 const int64_t max_iterations,
911 const int64_t max_tree_depth,
912 const double shrinkage,
913 const double min_split_loss,
915 const double obs_per_tree_fraction,
916 const int64_t features_per_node,
917 const int64_t min_observations_per_leaf_node,
918 const int64_t max_bins,
919 const int64_t min_bin_size,
923 std::vector<std::vector<std::string>> empty_cat_feature_keys;
928 empty_cat_feature_keys,
934 obs_per_tree_fraction,
936 min_observations_per_leaf_node,
939 preferred_ml_framework_str,
967 template <
typename T>
974 const int64_t max_iterations,
975 const int64_t max_tree_depth,
976 const double shrinkage,
977 const double min_split_loss,
979 const double obs_per_tree_fraction,
980 const int64_t features_per_node,
981 const int64_t min_observations_per_leaf_node,
982 const int64_t max_bins,
983 const int64_t min_bin_size,
984 const int32_t cat_top_k,
985 const float cat_min_fraction,
990 input_numeric_features,
1004 obs_per_tree_fraction,
1006 min_observations_per_leaf_node,
1009 preferred_ml_framework_str,
1037 template <
typename T>
1043 const int64_t max_iterations,
1044 const int64_t max_tree_depth,
1045 const double shrinkage,
1046 const double min_split_loss,
1047 const double lambda,
1048 const double obs_per_tree_fraction,
1049 const int64_t features_per_node,
1050 const int64_t min_observations_per_leaf_node,
1051 const int64_t max_bins,
1052 const int64_t min_bin_size,
1053 const int32_t cat_top_k,
1054 const float cat_min_fraction,
1059 input_cat_features, cat_top_k, cat_min_fraction,
false );
1070 obs_per_tree_fraction,
1072 min_observations_per_leaf_node,
1075 preferred_ml_framework_str,
1080 template <
typename T>
1086 const std::vector<std::vector<std::string>>& cat_feature_keys,
1087 const int64_t num_trees,
1088 const double obs_per_tree_fraction,
1089 const int64_t max_tree_depth,
1090 const int64_t features_per_node,
1091 const double impurity_threshold,
1092 const bool bootstrap,
1093 const int64_t min_obs_per_leaf_node,
1094 const int64_t min_obs_per_split_node,
1095 const double min_weight_fraction_in_leaf_node,
1096 const double min_impurity_decrease_in_split_node,
1097 const int64_t max_leaf_nodes,
1098 const bool use_histogram,
1103 if (input_labels.
size() == 0) {
1104 return mgr.ERROR_MESSAGE(
1105 "No rows exist in training data. Training data must at least contain 1 row.");
1107 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
1109 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
1110 preferred_ml_framework_str.
getString());
1113 return mgr.ERROR_MESSAGE(
1114 "Only OneDAL framework supported for random forest regression.");
1117 return mgr.ERROR_MESSAGE(
1118 "Only OneDAL framework supported for random forest regression.");
1121 const auto denulled_data =
denull_data(input_labels, input_features);
1122 const auto labels_ptrs =
pluck_ptrs(denulled_data.data, 0L, 1L);
1123 const auto features_ptrs =
1127 bool did_execute =
false;
1128 const auto var_importance_metric =
1131 return mgr.ERROR_MESSAGE(
"Invalid variable importance metric: " +
1137 if (use_histogram) {
1138 onedal_random_forest_reg_fit_impl<T, decision_forest::regression::training::hist>(
1144 denulled_data.masked_num_rows,
1146 obs_per_tree_fraction,
1151 min_obs_per_leaf_node,
1152 min_obs_per_split_node,
1153 min_weight_fraction_in_leaf_node,
1154 min_impurity_decrease_in_split_node,
1156 var_importance_metric);
1158 onedal_random_forest_reg_fit_impl<
1160 decision_forest::regression::training::defaultDense>(
1166 denulled_data.masked_num_rows,
1168 obs_per_tree_fraction,
1173 min_obs_per_leaf_node,
1174 min_obs_per_split_node,
1175 min_weight_fraction_in_leaf_node,
1176 min_impurity_decrease_in_split_node,
1178 var_importance_metric);
1182 output_model_name[0] = model_name_str_id;
1187 return mgr.ERROR_MESSAGE(
1188 "Cannot find " + preferred_ml_framework_str.
getString() +
1189 " ML library to support random forest regression implementation.");
1191 }
catch (std::runtime_error& e) {
1192 return mgr.ERROR_MESSAGE(e.what());
1221 template <
typename T>
1227 const int64_t num_trees,
1228 const double obs_per_tree_fraction,
1229 const int64_t max_tree_depth,
1230 const int64_t features_per_node,
1231 const double impurity_threshold,
1232 const bool bootstrap,
1233 const int64_t min_obs_per_leaf_node,
1234 const int64_t min_obs_per_split_node,
1235 const double min_weight_fraction_in_leaf_node,
1236 const double min_impurity_decrease_in_split_node,
1237 const int64_t max_leaf_nodes,
1238 const bool use_histogram,
1243 std::vector<std::vector<std::string>> empty_cat_feature_keys;
1248 empty_cat_feature_keys,
1250 obs_per_tree_fraction,
1255 min_obs_per_leaf_node,
1256 min_obs_per_split_node,
1257 min_weight_fraction_in_leaf_node,
1258 min_impurity_decrease_in_split_node,
1261 var_importance_metric_str,
1262 preferred_ml_framework_str,
1293 template <
typename T>
1300 const int64_t num_trees,
1301 const double obs_per_tree_fraction,
1302 const int64_t max_tree_depth,
1303 const int64_t features_per_node,
1304 const double impurity_threshold,
1305 const bool bootstrap,
1306 const int64_t min_obs_per_leaf_node,
1307 const int64_t min_obs_per_split_node,
1308 const double min_weight_fraction_in_leaf_node,
1309 const double min_impurity_decrease_in_split_node,
1310 const int64_t max_leaf_nodes,
1311 const bool use_histogram,
1313 const int32_t cat_top_k,
1314 const float cat_min_fraction,
1319 input_numeric_features,
1329 obs_per_tree_fraction,
1334 min_obs_per_leaf_node,
1335 min_obs_per_split_node,
1336 min_weight_fraction_in_leaf_node,
1337 min_impurity_decrease_in_split_node,
1340 var_importance_metric_str,
1341 preferred_ml_framework_str,
1372 template <
typename T>
1378 const int64_t num_trees,
1379 const double obs_per_tree_fraction,
1380 const int64_t max_tree_depth,
1381 const int64_t features_per_node,
1382 const double impurity_threshold,
1383 const bool bootstrap,
1384 const int64_t min_obs_per_leaf_node,
1385 const int64_t min_obs_per_split_node,
1386 const double min_weight_fraction_in_leaf_node,
1387 const double min_impurity_decrease_in_split_node,
1388 const int64_t max_leaf_nodes,
1389 const bool use_histogram,
1391 const int32_t cat_top_k,
1392 const float cat_min_fraction,
1397 input_cat_features, cat_top_k, cat_min_fraction,
false );
1404 obs_per_tree_fraction,
1409 min_obs_per_leaf_node,
1410 min_obs_per_split_node,
1411 min_weight_fraction_in_leaf_node,
1412 min_impurity_decrease_in_split_node,
1415 var_importance_metric_str,
1416 preferred_ml_framework_str,
// PCA training entry point (most of the signature lies on lines missing from this
// excerpt).
// Visible flow: reject empty input, validate the requested ML framework, run
// denull_data, reject the case where no rows remain, normalize the features, call
// onedal_pca_impl, and build a PcaModel carrying the normalization statistics.
// Failures shown here are returned through mgr.ERROR_MESSAGE(...).
1421 template <
typename T>
1426 const std::vector<std::vector<std::string>>& cat_feature_keys,
// Guard: a training set with zero rows is rejected before any work is done.
1430 if (input_features.
size() == 0) {
1431 return mgr.ERROR_MESSAGE(
1432 "No rows exist in training data. Training data must at least contain 1 row.");
1434 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
1436 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
1437 preferred_ml_framework_str.
getString());
1440 const auto denulled_data =
denull_data(input_features);
// Second guard: the input was non-empty, but masked_num_rows can still be zero after
// denull_data, which gets its own error message.
1441 const int64_t num_rows = denulled_data.masked_num_rows;
1442 if (num_rows == 0) {
1443 return mgr.ERROR_MESSAGE(
1444 "No non-null rows exist in training data. Training data must at least contain "
1448 const auto features_ptrs =
// NOTE(review): the name suggests z-score standardization; the producing call is on a
// line not present in this excerpt. The result exposes normalized_data, means and
// std_devs, all used below.
1451 const auto z_std_norm_summary_stats =
// PCA is run on the normalized columns, using every column of normalized_data.
1453 const auto normalized_ptrs =
1454 pluck_ptrs(z_std_norm_summary_stats.normalized_data,
1456 z_std_norm_summary_stats.normalized_data.size());
1457 bool did_execute =
false;
1461 const auto [eigenvectors, eigenvalues] =
1462 onedal_pca_impl(normalized_ptrs, denulled_data.masked_num_rows);
// The model is constructed with the means and std_devs from the normalization step.
1463 auto model = std::make_shared<PcaModel>(z_std_norm_summary_stats.means,
1464 z_std_norm_summary_stats.std_devs,
1474 return mgr.ERROR_MESSAGE(
"Cannot find " + preferred_ml_framework_str.
getString() +
1475 " ML library to support PCA implementation.");
1480 output_model_name[0] = model_name_str_id;
// std::runtime_error thrown inside the try block is turned into an error message.
1482 }
catch (std::runtime_error& e) {
1483 return mgr.ERROR_MESSAGE(e.what());
1498 template <
typename T>
1506 std::vector<std::vector<std::string>> empty_cat_feature_keys;
1510 empty_cat_feature_keys,
1511 preferred_ml_framework_str,
1529 template <
typename T>
1535 const int32_t cat_top_k,
1536 const float cat_min_fraction,
1541 input_numeric_features,
1549 preferred_ml_framework_str,
1571 const int32_t cat_top_k,
1572 const float cat_min_fraction,
1577 template <
typename T,
typename K>
1580 const std::shared_ptr<AbstractMLModel>& model,
1586 const auto preferred_ml_framework =
get_ml_framework(preferred_ml_framework_str);
1588 return mgr.ERROR_MESSAGE(
"Invalid ML Framework: " +
1589 preferred_ml_framework_str.
getString());
1591 const auto denulled_data =
denull_data(input_features);
1592 const int64_t num_rows = denulled_data.masked_num_rows;
1593 const bool data_is_masked =
1594 denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
1595 std::vector<T> denulled_output_allocation(data_is_masked ? num_rows : 0);
1597 T* denulled_output =
1598 data_is_masked ? denulled_output_allocation.data() : output_predictions.
ptr_;
1599 const auto features_ptrs =
pluck_ptrs(denulled_data.data, 0L, input_features.
numCols());
1602 bool did_execute =
false;
1603 const auto model_type = model->getModelType();
1604 switch (model_type) {
1606 const auto linear_reg_model =
1608 CHECK(linear_reg_model);
1612 onedal_linear_reg_predict_impl(
1613 linear_reg_model, features_ptrs, denulled_output, num_rows);
1620 mlpack_linear_reg_predict_impl(
1621 linear_reg_model, features_ptrs, denulled_output, num_rows);
1629 const auto decision_tree_reg_model =
1630 std::dynamic_pointer_cast<DecisionTreeRegressionModel>(model);
1631 CHECK(decision_tree_reg_model);
1634 onedal_decision_tree_reg_predict_impl(
1635 decision_tree_reg_model, features_ptrs, denulled_output, num_rows);
1643 const auto gbt_reg_model = std::dynamic_pointer_cast<GbtRegressionModel>(model);
1644 CHECK(gbt_reg_model);
1647 onedal_gbt_reg_predict_impl(
1648 gbt_reg_model, features_ptrs, denulled_output, num_rows);
1656 const auto random_forest_reg_model =
1657 std::dynamic_pointer_cast<RandomForestRegressionModel>(model);
1658 CHECK(random_forest_reg_model);
1661 onedal_random_forest_reg_predict_impl(
1662 random_forest_reg_model, features_ptrs, denulled_output, num_rows);
1669 throw std::runtime_error(
"Unsupported model type");
1673 return mgr.ERROR_MESSAGE(
"Cannot find " + preferred_ml_framework_str.
getString() +
1674 " ML library to support model implementation.");
1676 }
catch (std::runtime_error& e) {
1677 const std::string error_str(e.what());
1678 return mgr.ERROR_MESSAGE(error_str);
1680 output_ids = input_ids;
1681 if (data_is_masked) {
1683 denulled_data.reverse_index_map,
1684 output_predictions.
ptr_,
1685 denulled_data.unmasked_num_rows,
1686 inline_null_value<T>());
1688 return input_ids.
size();
1702 template <
typename T,
typename K>
1718 preferred_ml_framework_str,
1720 output_predictions);
1721 }
catch (std::runtime_error& e) {
1722 const std::string error_str(e.what());
1723 return mgr.ERROR_MESSAGE(error_str);
1738 template <
typename T,
typename K>
1751 model, input_cat_features.
numCols(), input_numeric_features.
numCols());
1753 input_cat_features, input_numeric_features, model->getCatFeatureKeys());
1757 cat_features_builder.getFeatures(),
1758 preferred_ml_framework_str,
1760 output_predictions);
1761 }
catch (std::runtime_error& e) {
1762 const std::string error_str(e.what());
1763 return mgr.ERROR_MESSAGE(error_str);
1778 template <
typename T,
typename K>
1791 model->getCatFeatureKeys());
1795 cat_features_builder.getFeatures(),
1796 preferred_ml_framework_str,
1798 output_predictions);
1799 }
catch (std::runtime_error& e) {
1800 const std::string error_str(e.what());
1801 return mgr.ERROR_MESSAGE(error_str);
1816 template <
typename T,
typename K>
1825 if (model_name.
size() != 1) {
1826 return mgr.ERROR_MESSAGE(
"Expected only one row in model CURSOR.");
1830 model_name_text_enc_none,
1833 preferred_ml_framework_str,
1835 output_predictions);
1849 template <
typename T,
typename K>
1859 if (model_name.
size() != 1) {
1860 return mgr.ERROR_MESSAGE(
"Expected only one row in model CURSOR.");
1864 model_name_text_enc_none,
1867 input_numeric_features,
1868 preferred_ml_framework_str,
1870 output_predictions);
1884 template <
typename T,
typename K>
1893 if (model_name.
size() != 1) {
1894 return mgr.ERROR_MESSAGE(
"Expected only one row in model CURSOR.");
1898 model_name_text_enc_none,
1901 preferred_ml_framework_str,
1903 output_predictions);
// Computes the R^2 score of a model's predictions against input_labels and writes
// the single result to output_r2[0].
// Visible flow: allocate scratch prediction/id buffers of num_rows, run a prediction
// call (mostly on lines missing from this excerpt), then accumulate two sums in
// parallel: squared residuals (label - prediction) and squared deviations of the
// labels from labels_mean.
1906 template <
typename T>
1908 const std::shared_ptr<AbstractMLModel>& model,
1912 const int64_t num_rows = input_labels.
size();
// Guard: scoring requires at least one row.
1913 if (num_rows == 0) {
1914 return mgr.ERROR_MESSAGE(
1915 "No rows exist in evaluation data. Evaluation data must at least contain 1 row.");
// Scratch storage for the prediction call; output_predictions is a Column wrapper
// constructed from output_predictions_vec.
1917 std::vector<T> output_predictions_vec(num_rows);
1918 Column<T> output_predictions(output_predictions_vec);
1919 std::vector<int64_t> input_ids_vec(num_rows);
1920 std::vector<int64_t> output_ids_vec(num_rows);
1931 ml_framework_encoding_none,
1933 output_predictions);
// std::runtime_error thrown inside the try block is turned into an error message.
1939 }
catch (std::runtime_error& e) {
1941 return mgr.ERROR_MESSAGE(e.what());
// NOTE(review): std::thread::hardware_concurrency() is permitted to return 0. With
// std::min below that would make num_threads 0, leaving the per-thread accumulators
// empty and the arena with zero concurrency. Confirm a lower bound is enforced on
// the lines not shown.
1948 const size_t max_thread_count = std::thread::hardware_concurrency();
// Thread count is the ceiling of num_rows / max_inputs_per_thread, capped by the
// hardware thread count.
1950 const size_t num_threads = std::min(
1951 max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
// One accumulator slot per thread, indexed by the arena thread index further down.
1953 std::vector<double> local_sum_squared_regressions(num_threads, 0.0);
1954 std::vector<double> local_sum_squares(num_threads, 0.0);
// NOTE(review): the arena is created with num_threads, presumably so that
// current_thread_index() stays below the accumulator size - confirm against TBB docs.
1956 tbb::task_arena limited_arena(num_threads);
1958 limited_arena.execute([&] {
1960 tbb::blocked_range<int64_t>(0, num_rows),
1961 [&](
const tbb::blocked_range<int64_t>& r) {
1962 const int64_t start_idx = r.begin();
1963 const int64_t end_idx = r.end();
// Sums are kept in locals for the sub-range and added to the per-thread slot once at
// the end.
1964 double local_sum_squared_regression{0.0};
1965 double local_sum_square{0.0};
1966 for (int64_t row_idx = start_idx; row_idx < end_idx; ++row_idx) {
// Rows whose prediction equals the null sentinel contribute to neither sum.
1967 if (output_predictions[row_idx] != inline_null_value<T>()) {
1968 local_sum_squared_regression +=
1969 (input_labels[row_idx] - output_predictions[row_idx]) *
1970 (input_labels[row_idx] - output_predictions[row_idx]);
// labels_mean is computed on lines not present in this excerpt.
1971 local_sum_square += (input_labels[row_idx] - labels_mean) *
1972 (input_labels[row_idx] - labels_mean);
// += rather than = because one thread may process several sub-ranges.
1975 const size_t thread_idx = tbb::this_task_arena::current_thread_index();
1976 local_sum_squared_regressions[thread_idx] += local_sum_squared_regression;
1977 local_sum_squares[thread_idx] += local_sum_square;
// Serial reduction of the per-thread partial sums.
1980 double sum_squared_regression{0.0};
1981 double sum_squares{0.0};
1982 for (
size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
1983 sum_squared_regression += local_sum_squared_regressions[thread_idx];
1984 sum_squares += local_sum_squares[thread_idx];
// R^2 = 1 - SS_res / SS_tot. A zero SS_tot is reported as 1.0 instead of dividing by
// zero.
1986 output_r2[0] = sum_squares == 0.0 ? 1.0 : 1.0 - (sum_squared_regression / sum_squares);
1999 template <
typename T>
2008 return r2_score_impl(mgr, model, input_labels, input_features, output_r2);
2009 }
catch (std::runtime_error& e) {
2010 const std::string error_str(e.what());
2011 return mgr.ERROR_MESSAGE(error_str);
2024 template <
typename T>
2031 if (model_name.
size() != 1) {
2032 return mgr.ERROR_MESSAGE(
"Expected only one row in model name CURSOR.");
2036 mgr, model_name_text_enc_none, input_labels, input_features, output_r2);
2047 template <
typename T>
2058 model, input_cat_features.
numCols(), input_numeric_features.
numCols());
2060 input_cat_features, input_numeric_features, model->getCatFeatureKeys());
2062 mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2063 }
catch (std::runtime_error& e) {
2064 const std::string error_str(e.what());
2065 return mgr.ERROR_MESSAGE(error_str);
2077 template <
typename T>
2088 model->getCatFeatureKeys());
2090 mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2091 }
catch (std::runtime_error& e) {
2092 const std::string error_str(e.what());
2093 return mgr.ERROR_MESSAGE(error_str);
2105 template <
typename T>
2113 if (model_name.
size() != 1) {
2114 return mgr.ERROR_MESSAGE(
"Expected only one row in model name CURSOR.");
2116 const std::string model_name_str{model_name.
getString(0)};
2120 model, input_cat_features.
numCols(), input_numeric_features.
numCols());
2122 input_cat_features, input_numeric_features, model->getCatFeatureKeys());
2124 mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2125 }
catch (std::runtime_error& e) {
2126 const std::string error_str(e.what());
2127 return mgr.ERROR_MESSAGE(error_str);
2217 #endif // #ifndef __CUDACC__
DEVICE const std::string getString(int64_t index) const
ColumnList< T > getFeatures()
NEVER_INLINE HOST int32_t pca_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
void set_output_row_size(int64_t num_rows)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const ColumnList< T > &numeric_features, const int32_t cat_top_k, const float cat_min_fraction, const bool cat_include_others)
KMeansInitStrategy get_kmeans_init_type(const std::string &init_type_str)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const int32_t cat_top_k, const float cat_min_fraction, const bool cat_include_others)
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
NEVER_INLINE HOST int32_t r2_score_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
std::string getString() const
NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
DEVICE int64_t size() const
NEVER_INLINE HOST int32_t decision_tree_reg_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
DEVICE int64_t numCols() const
EXTENSION_NOINLINE_HOST int32_t pca_fit__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< TextEncodingDict > &input_cat_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
EXTENSION_NOINLINE_HOST int32_t supported_ml_frameworks__cpu_(TableFunctionManager &mgr, Column< TextEncodingDict > &output_ml_frameworks, Column< bool > &output_availability, Column< bool > &output_default)
MaskedData< T > denull_data(const ColumnList< T > &features)
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t kmeans__cpu_template(TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const int num_clusters, const int num_iterations, const TextEncodingNone &init_type_str, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
NEVER_INLINE HOST int32_t pca_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
MLFramework get_ml_framework(const std::string &ml_framework_str)
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
const size_t max_inputs_per_thread
NEVER_INLINE HOST int32_t random_forest_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const ColumnList< T > &numeric_features, const std::vector< std::vector< std::string >> &cat_feature_keys)
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
void addModel(const std::string &model_name, std::shared_ptr< AbstractMLModel > model)
VarImportanceMetric get_var_importance_metric(const std::string &var_importance_metric_str)
NEVER_INLINE HOST int32_t gbt_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
#define EXTENSION_NOINLINE_HOST
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
void disable_output_allocations()
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
DEVICE int64_t numCols() const
NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
std::vector< int8_t * > col_ptrs_
std::vector< TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodedCol< T > > one_hot_encoded_cols_
void unmask_data(const T *masked_input, const std::vector< int32_t > &reverse_index_map, T *unmasked_output, const int64_t num_unmasked_rows, const T null_val)
NEVER_INLINE HOST int32_t linear_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t dbscan__cpu_template(TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const double epsilon, const int32_t min_observations, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const std::vector< std::vector< std::string >> &cat_feature_keys)
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
std::vector< std::vector< T > > z_std_normalize_data(const std::vector< T * > &input_data, const int64_t num_rows)
NEVER_INLINE HOST int32_t ml_reg_predict_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
std::vector< std::vector< std::string > > cat_feature_keys_
DEVICE int64_t size() const
DEVICE int64_t size() const
NEVER_INLINE HOST int32_t r2_score__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
ZStdNormalizationSummaryStats< T > z_std_normalize_data_with_summary_stats(const std::vector< T * > &input_data, const int64_t num_rows)
void enable_output_allocations()
Column< T > create_wrapper_col(std::vector< T > &col_vec)
NEVER_INLINE HOST double get_column_mean(const T *data, const int64_t num_rows)
const std::vector< std::vector< std::string > > & getCatFeatureKeys() const