27 using namespace daal::algorithms;
28 using namespace daal::data_management;
31 const NumericTablePtr prepare_data_table(
const T* data,
const int64_t num_rows) {
34 data_table->setArray<
T>(
const_cast<T*
>(data), 0);
39 const NumericTablePtr prepare_data_table(
const std::vector<const T*>& data,
40 const int64_t num_rows) {
42 const size_t num_columns = data.size();
46 for (
size_t i = 0; i < num_columns; ++i) {
47 data_table->setArray<
T>(
const_cast<T*
>(data[i]), i);
53 const NumericTablePtr prepare_pivoted_data_table(
const T* data,
const int64_t num_elems) {
57 for (
size_t c = 0; c < static_cast<size_t>(num_elems); ++c) {
58 data_table->setArray<
T>(
const_cast<T*
>(data) + c, c);
64 const static std::map<KMeansInitStrategy, kmeans::init::Method> kmeans_init_type_map = {
70 const auto itr = kmeans_init_type_map.find(init_type);
71 if (itr == kmeans_init_type_map.end()) {
72 std::ostringstream oss;
73 oss <<
"Invalid Kmeans cluster centroid initialization type. "
74 <<
"Was expecting one of DETERMINISTIC, RANDOM, or PLUS_PLUS.";
75 throw std::runtime_error(oss.str());
80 template <
typename T, kmeans::init::Method M>
81 const NumericTablePtr init_centroids_for_type(
const NumericTablePtr& input_features_table,
82 const int32_t num_clusters) {
83 kmeans::init::Batch<T, M>
init(num_clusters);
84 init.input.set(kmeans::init::data, input_features_table);
86 return init.getResult()->get(kmeans::init::centroids);
90 const NumericTablePtr init_centroids(
const NumericTablePtr& input_features_table,
91 const kmeans::init::Method& init_type,
92 const int32_t num_clusters) {
94 case kmeans::init::Method::deterministicDense:
95 return init_centroids_for_type<T, kmeans::init::Method::deterministicDense>(
96 input_features_table, num_clusters);
97 case kmeans::init::Method::randomDense:
98 return init_centroids_for_type<T, kmeans::init::Method::randomDense>(
99 input_features_table, num_clusters);
100 case kmeans::init::Method::plusPlusDense:
101 return init_centroids_for_type<T, kmeans::init::Method::plusPlusDense>(
102 input_features_table, num_clusters);
103 case kmeans::init::Method::parallelPlusDense:
104 return init_centroids_for_type<T, kmeans::init::Method::parallelPlusDense>(
105 input_features_table, num_clusters);
108 return init_centroids_for_type<T, kmeans::init::Method::deterministicDense>(
109 input_features_table, num_clusters);
114 template <
typename T>
115 NEVER_INLINE HOST int32_t onedal_kmeans_impl(
const std::vector<const T*>& input_features,
116 int32_t* output_clusters,
117 const int64_t num_rows,
118 const int num_clusters,
119 const int num_iterations,
122 const auto features_table = prepare_data_table(input_features, num_rows);
124 const auto centroids =
125 init_centroids<T>(features_table, onedal_kmeans_init_type, num_clusters);
126 const auto assignments_table =
128 const kmeans::ResultPtr
result(
new kmeans::Result);
129 result->set(kmeans::assignments, assignments_table);
130 result->set(kmeans::objectiveFunction,
132 result->set(kmeans::nIterations,
134 kmeans::Batch<> algorithm(num_clusters, num_iterations);
135 algorithm.input.set(kmeans::data, features_table);
136 algorithm.input.set(kmeans::inputCentroids, centroids);
137 algorithm.parameter().resultsToEvaluate = kmeans::computeAssignments;
138 algorithm.setResult(
result);
140 }
catch (std::exception& e) {
141 throw std::runtime_error(e.what());
146 template <
typename T>
147 NEVER_INLINE HOST int32_t onedal_dbscan_impl(
const std::vector<const T*>& input_features,
148 int32_t* output_clusters,
149 const int64_t num_rows,
150 const double epsilon,
151 const int32_t min_observations) {
153 const auto features_table = prepare_data_table(input_features, num_rows);
154 const auto assignments_table =
156 const dbscan::ResultPtr
result(
new dbscan::Result);
157 result->set(dbscan::assignments, assignments_table);
158 result->set(dbscan::nClusters,
160 dbscan::Batch<> algorithm(epsilon, min_observations);
161 algorithm.input.set(dbscan::data, features_table);
162 algorithm.parameter().resultsToCompute = dbscan::assignments;
163 algorithm.setResult(
result);
165 }
catch (std::exception& e) {
166 throw std::runtime_error(e.what());
171 template <
typename T>
172 NEVER_INLINE HOST std::pair<std::vector<std::vector<T>>, std::vector<T>> onedal_pca_impl(
173 const std::vector<const T*>& input_features,
174 const int64_t num_rows) {
176 const auto features_table = prepare_data_table(input_features, num_rows);
177 pca::Batch<> algorithm;
178 algorithm.input.set(pca::data, features_table);
179 algorithm.parameter.resultsToCompute = pca::mean | pca::variance | pca::eigenvalue;
180 algorithm.parameter.isDeterministic =
true;
183 pca::ResultPtr
result = algorithm.getResult();
184 const auto eigenvectors_table = result->get(pca::eigenvectors);
185 const int64_t num_dims = eigenvectors_table->getNumberOfRows();
186 CHECK_EQ(num_dims, static_cast<int64_t>(eigenvectors_table->getNumberOfColumns()));
187 std::vector<std::vector<T>> eigenvectors(num_dims, std::vector<T>(num_dims));
188 for (int64_t row_idx = 0; row_idx < num_dims; ++row_idx) {
189 for (int64_t col_idx = 0; col_idx < num_dims; ++col_idx) {
191 eigenvectors[row_idx][col_idx] =
192 eigenvectors_table->getValue<
T>(col_idx, row_idx);
195 const auto eigenvalues_table = result->get(pca::eigenvalues);
196 std::vector<T> eigenvalues(num_dims);
197 for (int64_t dim_idx = 0; dim_idx < num_dims; ++dim_idx) {
198 eigenvalues[dim_idx] = eigenvalues_table->getValue<
T>(dim_idx, 0);
200 return std::make_pair(eigenvectors, eigenvalues);
201 }
catch (std::exception& e) {
202 throw std::runtime_error(e.what());
206 template <
typename T>
207 int32_t extract_model_coefs(
const NumericTablePtr& coefs_table,
210 const int64_t num_coefs = coefs_table->getNumberOfColumns();
211 for (int64_t coef_idx = 0; coef_idx < num_coefs; ++coef_idx) {
212 coef_idxs[coef_idx] = coef_idx;
214 coefs_table->NumericTable::getValue<
T>(coef_idx,
static_cast<size_t>(0));
219 template <
typename T>
221 onedal_linear_reg_fit_impl(
const T* input_labels,
222 const std::vector<const T*>& input_features,
223 int64_t* output_coef_idxs,
224 double* output_coefs,
225 const int64_t num_rows) {
227 const auto labels_table = prepare_data_table(input_labels, num_rows);
228 const auto features_table = prepare_data_table(input_features, num_rows);
230 linear_regression::training::Batch<T, linear_regression::training::Method::qrDense>
233 algorithm.input.set(linear_regression::training::data, features_table);
234 algorithm.input.set(linear_regression::training::dependentVariables, labels_table);
237 const auto training_result = algorithm.getResult();
238 const auto coefs_table =
239 training_result->get(linear_regression::training::model)->getBeta();
240 return extract_model_coefs<T>(coefs_table, output_coef_idxs, output_coefs);
241 }
catch (std::exception& e) {
242 throw std::runtime_error(e.what());
246 template <
typename T>
248 const double* model_coefs,
249 const int64_t num_coefs) {
253 std::vector<T> casted_model_coefs(num_coefs);
254 for (int64_t coef_idx = 0; coef_idx < num_coefs; ++coef_idx) {
255 casted_model_coefs[coef_idx] = model_coefs[coef_idx];
257 const auto betas_table =
258 prepare_pivoted_data_table(casted_model_coefs.data(), num_coefs);
259 CHECK_EQ(betas_table->getNumberOfColumns(), num_coefs);
262 linear_regression::ModelBuilder<T> model_builder(num_coefs - 1,
266 BlockDescriptor<T> block_result;
270 betas_table->getBlockOfRows(0, betas_table->getNumberOfRows(), readOnly, block_result);
272 (betas_table->getNumberOfRows()) * (betas_table->getNumberOfColumns());
275 T* first_itr = block_result.getBlockPtr();
276 T* last_itr = first_itr + num_betas;
277 model_builder.setBeta(first_itr, last_itr);
278 betas_table->releaseBlockOfRows(block_result);
280 return model_builder.getModel();
283 template <
typename T>
285 onedal_linear_reg_predict_impl(
const std::shared_ptr<LinearRegressionModel>& model,
286 const std::vector<const T*>& input_features,
287 T* output_predictions,
288 const int64_t num_rows) {
291 if (model->getNumFeatures() !=
static_cast<int64_t
>(input_features.size())) {
292 throw std::runtime_error(
293 "Number of model coefficients does not match number of input features.");
295 const auto features_table = prepare_data_table(input_features, num_rows);
296 const auto model_ptr =
297 build_linear_reg_model<T>(model->getCoefs().data(), input_features.size() + 1);
299 linear_regression::prediction::Batch<> algorithm;
300 algorithm.input.set(linear_regression::prediction::data, features_table);
301 algorithm.input.set(linear_regression::prediction::model, model_ptr);
303 const auto predictions_table =
306 const linear_regression::prediction::ResultPtr
result(
307 new linear_regression::prediction::Result);
308 result->set(linear_regression::prediction::prediction, predictions_table);
309 algorithm.setResult(result);
312 }
catch (std::exception& e) {
313 throw std::runtime_error(e.what());
317 template <
typename T>
319 const std::string& model_name,
320 const T* input_labels,
321 const std::vector<const T*>& input_features,
322 const std::string& model_metadata,
323 const std::vector<std::vector<std::string>>& cat_feature_keys,
324 const int64_t num_rows,
325 const int64_t max_tree_depth,
326 const int64_t min_observations_per_leaf_node) {
328 const auto labels_table = prepare_data_table(input_labels, num_rows);
329 const auto features_table = prepare_data_table(input_features, num_rows);
330 decision_tree::regression::training::Batch<T> algorithm;
331 algorithm.input.set(decision_tree::regression::training::data, features_table);
332 algorithm.input.set(decision_tree::regression::training::dependentVariables,
335 algorithm.parameter.pruning = decision_tree::Pruning::none;
336 algorithm.parameter.maxTreeDepth = max_tree_depth;
337 algorithm.parameter.minObservationsInLeafNodes = min_observations_per_leaf_node;
340 decision_tree::regression::training::ResultPtr training_result =
341 algorithm.getResult();
343 auto model_ptr = training_result->get(decision_tree::regression::training::model);
344 auto model = std::make_shared<DecisionTreeRegressionModel>(
345 model_ptr, model_metadata, cat_feature_keys);
347 }
catch (std::exception& e) {
348 throw std::runtime_error(e.what());
352 template <
typename T>
354 const std::string& model_name,
355 const T* input_labels,
356 const std::vector<const T*>& input_features,
357 const std::string& model_metadata,
358 const std::vector<std::vector<std::string>>& cat_feature_keys,
359 const int64_t num_rows,
360 const int64_t max_iterations,
361 const int64_t max_tree_depth,
362 const double shrinkage,
363 const double min_split_loss,
365 const double obs_per_tree_fraction,
366 const int64_t features_per_node,
367 const int64_t min_observations_per_leaf_node,
368 const int64_t max_bins,
369 const int64_t min_bin_size) {
371 const auto labels_table = prepare_data_table(input_labels, num_rows);
372 const auto features_table = prepare_data_table(input_features, num_rows);
373 gbt::regression::training::Batch<T> algorithm;
374 algorithm.input.set(gbt::regression::training::data, features_table);
375 algorithm.input.set(gbt::regression::training::dependentVariable, labels_table);
377 algorithm.parameter().maxIterations = max_iterations;
378 algorithm.parameter().maxTreeDepth = max_tree_depth;
379 algorithm.parameter().shrinkage = shrinkage;
380 algorithm.parameter().minSplitLoss = min_split_loss;
381 algorithm.parameter().lambda = lambda;
382 algorithm.parameter().observationsPerTreeFraction = obs_per_tree_fraction;
383 algorithm.parameter().featuresPerNode = features_per_node;
384 algorithm.parameter().minObservationsInLeafNode = min_observations_per_leaf_node;
385 algorithm.parameter().maxBins = max_bins;
386 algorithm.parameter().minBinSize = min_bin_size;
389 gbt::regression::training::ResultPtr training_result = algorithm.getResult();
391 auto model_ptr = training_result->get(gbt::regression::training::model);
393 std::make_shared<GbtRegressionModel>(model_ptr, model_metadata, cat_feature_keys);
395 }
catch (std::exception& e) {
396 throw std::runtime_error(e.what());
400 inline decision_forest::training::VariableImportanceMode get_var_importance_metric_type(
403 decision_forest::training::VariableImportanceMode>
404 var_importance_mode_type_map = {
406 decision_forest::training::VariableImportanceMode::MDI},
408 decision_forest::training::VariableImportanceMode::none},
410 decision_forest::training::VariableImportanceMode::MDI},
412 decision_forest::training::VariableImportanceMode::MDA_Raw},
414 decision_forest::training::VariableImportanceMode::MDA_Scaled}};
416 const auto itr = var_importance_mode_type_map.find(var_importance_metric);
417 if (itr == var_importance_mode_type_map.end()) {
418 std::ostringstream oss;
419 oss <<
"Invalid variable importance mode type. "
420 <<
"Was expecting one of DEFAULT, NONE, MDI, MDA, or MDA_SCALED.";
421 throw std::runtime_error(oss.str());
426 template <
typename T, decision_forest::regression::training::Method M>
428 const std::string& model_name,
429 const T* input_labels,
430 const std::vector<const T*>& input_features,
431 const std::string& model_metadata,
432 const std::vector<std::vector<std::string>>& cat_feature_keys,
433 const int64_t num_rows,
434 const int64_t num_trees,
435 const double obs_per_tree_fraction,
436 const int64_t max_tree_depth,
437 const int64_t features_per_node,
438 const double impurity_threshold,
439 const bool bootstrap,
440 const int64_t min_obs_per_leaf_node,
441 const int64_t min_obs_per_split_node,
442 const double min_weight_fraction_in_leaf_node,
443 const double min_impurity_decrease_in_split_node,
444 const int64_t max_leaf_nodes,
445 const VarImportanceMetric var_importance_metric) {
446 constexpr
bool compute_out_of_bag_error{
false};
448 const auto labels_table = prepare_data_table(input_labels, num_rows);
449 const auto features_table = prepare_data_table(input_features, num_rows);
450 decision_forest::regression::training::Batch<T, M> algorithm;
451 algorithm.input.set(decision_forest::regression::training::data, features_table);
452 algorithm.input.set(decision_forest::regression::training::dependentVariable,
455 algorithm.parameter().nTrees = num_trees;
456 algorithm.parameter().observationsPerTreeFraction = obs_per_tree_fraction;
457 algorithm.parameter().maxTreeDepth = max_tree_depth;
458 algorithm.parameter().featuresPerNode = features_per_node;
459 algorithm.parameter().impurityThreshold = impurity_threshold;
460 algorithm.parameter().bootstrap = bootstrap;
461 algorithm.parameter().minObservationsInLeafNode = min_obs_per_leaf_node;
462 algorithm.parameter().minObservationsInSplitNode = min_obs_per_split_node;
463 algorithm.parameter().minWeightFractionInLeafNode = min_weight_fraction_in_leaf_node;
464 algorithm.parameter().minImpurityDecreaseInSplitNode =
465 min_impurity_decrease_in_split_node;
466 algorithm.parameter().varImportance =
467 get_var_importance_metric_type(var_importance_metric);
468 algorithm.parameter().resultsToCompute =
469 compute_out_of_bag_error ? decision_forest::training::computeOutOfBagError : 0;
472 decision_forest::regression::training::ResultPtr training_result =
473 algorithm.getResult();
475 auto model_ptr = training_result->get(decision_forest::regression::training::model);
476 auto variable_importance_table =
477 training_result->get(decision_forest::regression::training::variableImportance);
478 const size_t num_features = input_features.size();
479 std::vector<double> variable_importance(
482 for (
size_t feature_idx = 0; feature_idx < num_features; ++feature_idx) {
483 variable_importance[feature_idx] =
484 variable_importance_table->NumericTable::getValue<
T>(feature_idx, size_t(0));
487 double out_of_bag_error{0};
488 if (compute_out_of_bag_error) {
489 auto out_of_bag_error_table =
490 training_result->get(decision_forest::regression::training::outOfBagError);
492 out_of_bag_error_table->NumericTable::getValue<
T>(0,
static_cast<size_t>(0));
494 auto model = std::make_shared<RandomForestRegressionModel>(model_ptr,
500 }
catch (std::exception& e) {
501 throw std::runtime_error(e.what());
505 template <
typename T>
507 const std::shared_ptr<DecisionTreeRegressionModel>& model,
508 const std::vector<const T*>& input_features,
509 T* output_predictions,
510 const int64_t num_rows) {
513 if (model->getNumFeatures() !=
static_cast<int64_t
>(input_features.size())) {
514 throw std::runtime_error(
"Number of provided features does not match model.");
516 const auto features_table = prepare_data_table(input_features, num_rows);
517 decision_tree::regression::prediction::Batch<T> algorithm;
518 algorithm.input.set(decision_tree::regression::prediction::data, features_table);
519 algorithm.input.set(decision_tree::regression::prediction::model,
520 model->getModelPtr());
522 const auto predictions_table =
525 const decision_tree::regression::prediction::ResultPtr
result(
526 new decision_tree::regression::prediction::Result);
527 result->set(decision_tree::regression::prediction::prediction, predictions_table);
528 algorithm.setResult(result);
531 }
catch (std::exception& e) {
532 throw std::runtime_error(e.what());
536 template <
typename T>
538 onedal_gbt_reg_predict_impl(
const std::shared_ptr<GbtRegressionModel>& model,
539 const std::vector<const T*>& input_features,
540 T* output_predictions,
541 const int64_t num_rows) {
544 if (model->getNumFeatures() !=
static_cast<int64_t
>(input_features.size())) {
545 throw std::runtime_error(
"Number of provided features does not match model.");
547 const auto features_table = prepare_data_table(input_features, num_rows);
548 gbt::regression::prediction::Batch<T> algorithm;
549 algorithm.input.set(gbt::regression::prediction::data, features_table);
550 algorithm.input.set(gbt::regression::prediction::model, model->getModelPtr());
552 const auto predictions_table =
555 const gbt::regression::prediction::ResultPtr
result(
556 new gbt::regression::prediction::Result);
557 result->set(gbt::regression::prediction::prediction, predictions_table);
558 algorithm.setResult(result);
561 }
catch (std::exception& e) {
562 throw std::runtime_error(e.what());
566 template <
typename T>
568 const std::shared_ptr<RandomForestRegressionModel>& model,
569 const std::vector<const T*>& input_features,
570 T* output_predictions,
571 const int64_t num_rows) {
574 if (model->getNumFeatures() !=
static_cast<int64_t
>(input_features.size())) {
575 throw std::runtime_error(
"Number of provided features does not match model.");
577 const auto features_table = prepare_data_table(input_features, num_rows);
578 decision_forest::regression::prediction::Batch<T> algorithm;
579 algorithm.input.set(decision_forest::regression::prediction::data, features_table);
580 algorithm.input.set(decision_forest::regression::prediction::model,
581 model->getModelPtr());
583 const auto predictions_table =
586 const decision_forest::regression::prediction::ResultPtr
result(
587 new decision_forest::regression::prediction::Result);
588 result->set(decision_forest::regression::prediction::prediction, predictions_table);
589 algorithm.setResult(result);
592 }
catch (std::exception& e) {
593 throw std::runtime_error(e.what());
597 inline const std::vector<double>& onedal_random_forest_reg_var_importance_impl(
598 const std::shared_ptr<RandomForestRegressionModel>& rand_forest_model) {
599 return rand_forest_model->getVariableImportanceScores();
602 #endif // #ifdef HAVE_ONEDAL
603 #endif // #ifdef __CUDACC__
// NOTE(review): the following four lines appear to be documentation-index
// residue (signatures from an API listing), not part of this source file.
// Preserved as comments pending confirmation against the upstream file:
//   KMeansInitStrategy get_kmeans_init_type(const std::string& init_type_str)
//   std::pair<FILE*, std::string> create(const std::string& basePath, const int fileId,
//                                        const size_t pageSize, const size_t numPages)
//   void init(LogOptions const& log_opts)
//   void addModel(const std::string& model_name, std::shared_ptr<AbstractMLModel> model)