OmniSciDB  a5dc49c757
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MLTableFunctions.hpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #ifndef __CUDACC__
20 
24 
27 
28 #ifdef HAVE_ONEDAL
31 #endif
32 
33 #ifdef HAVE_MLPACK
35 #endif
36 
37 #include <tbb/parallel_for.h>
38 #include <tbb/task_arena.h>
39 
40 using namespace TableFunctions_Namespace;
41 
42 template <typename T>
43 std::vector<const T*> pluck_ptrs(const std::vector<std::vector<T>>& data,
44  const int64_t start_idx,
45  const int64_t end_idx) {
46  std::vector<const T*> raw_ptrs;
47  CHECK_GE(start_idx, 0L);
48  CHECK_GT(end_idx, start_idx);
49  CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
50  for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
51  raw_ptrs.emplace_back(data[col_idx].data());
52  }
53  return raw_ptrs;
54 }
55 
56 template <typename T>
57 std::vector<const T*> pluck_ptrs(const std::vector<T*>& data,
58  const int64_t start_idx,
59  const int64_t end_idx) {
60  std::vector<const T*> raw_ptrs;
61  CHECK_GE(start_idx, 0L);
62  CHECK_GT(end_idx, start_idx);
63  CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
64  for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
65  raw_ptrs.emplace_back(data[col_idx]);
66  }
67  return raw_ptrs;
68 }
69 
70 // clang-format off
71 /*
72  UDTF: supported_ml_frameworks__cpu_(TableFunctionManager) ->
73  Column<TextEncodingDict> ml_framework | input_id=args<>, Column<bool> is_available, Column<bool> is_default
74 */
75 // clang-format on
76 
// Output columns of the table function described by the UDTF annotation above
// (supported_ml_frameworks__cpu_): a framework name column plus two boolean columns.
// NOTE(review): the listing omits the line(s) that open this declaration (return
// type, function name and leading parameter) - restore them from the original
// header before compiling.
79  Column<TextEncodingDict>& output_ml_frameworks,
80  Column<bool>& output_availability,
81  Column<bool>& output_default);
// Declaration only; the definition is not part of this header listing.
// Takes a model handle together with a categorical and a numeric feature count.
// NOTE(review): presumably validates those counts against what the model expects -
// confirm against the definition.
83 void check_model_params(const std::shared_ptr<AbstractMLModel>& model,
84  const int64_t num_cat_features,
85  const int64_t num_numeric_features);
86 
87 // clang-format off
88 /*
89  UDTF: kmeans__cpu_template(TableFunctionManager,
90  Cursor<Column<K> input_ids, ColumnList<T> input_features> data,
91  int32_t num_clusters | require="num_clusters > 0" | require="num_clusters <= input_ids.size()",
92  int32_t num_iterations | require="num_iterations > 0" | default=10,
93  TextEncodingNone init_type | default="DEFAULT",
94  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
95  Column<K> id | input_id=args<0>,
96  Column<int32_t> cluster_id,
97  K=[int64_t, TextEncodingDict], T=[double]
98 */
99 // clang-format on
100 
// K-means clustering table function (SQL-facing signature: see the UDTF annotation
// above). Sizes the output to input_ids.size(), assigns input_ids to output_ids and
// fills output_clusters with one cluster id per input row.
//
// Steps: validate init_type_str and preferred_ml_framework_str; run denull_data on
// the features (presumably drops rows containing nulls - confirm); normalize with
// z_std_normalize_data; run the first compiled-in backend that matches the requested
// framework; when rows were masked out, expand the result back to the full row count
// via unmask_data using inline_null_value<int32_t>() as the fill value.
//
// Returns input_ids.size() on success, otherwise the result of mgr.ERROR_MESSAGE(...)
// for an invalid argument, a missing backend, or a std::runtime_error raised inside
// the try block.
//
// NOTE(review): the listing is missing the line between `NEVER_INLINE HOST int32_t`
// and the first visible parameter. It must supply the function name and the `mgr`
// parameter used below (presumably `kmeans__cpu_template(TableFunctionManager& mgr,`)
// - restore from the original header.
101 template <typename K, typename T>
102 NEVER_INLINE HOST int32_t
104  const Column<K>& input_ids,
105  const ColumnList<T>& input_features,
106  const int num_clusters,
107  const int num_iterations,
108  const TextEncodingNone& init_type_str,
109  const TextEncodingNone& preferred_ml_framework_str,
110  Column<K>& output_ids,
111  Column<int32_t>& output_clusters) {
112  mgr.set_output_row_size(input_ids.size());
113  output_ids = input_ids;
114  const auto kmeans_init_strategy = get_kmeans_init_type(init_type_str);
115  if (kmeans_init_strategy == KMeansInitStrategy::INVALID) {
116  return mgr.ERROR_MESSAGE("Invalid KMeans initialization strategy: " +
117  init_type_str.getString());
118  }
119 
120  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
121  if (preferred_ml_framework == MLFramework::INVALID) {
122  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
123  preferred_ml_framework_str.getString());
124  }
125 
126  try {
127  const auto denulled_data = denull_data(input_features);
128  const int64_t num_rows = denulled_data.masked_num_rows;
129  const bool data_is_masked =
130  denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
// A scratch buffer is only allocated when rows were masked out; otherwise the
// backend writes directly into output_clusters.
131  std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
132  int32_t* denulled_output =
133  data_is_masked ? denulled_output_allocation.data() : output_clusters.ptr_;
134 
135  // z_std_normalize_data can throw if std dev is 0
136  const auto normalized_data = z_std_normalize_data(denulled_data.data, num_rows);
137  const auto normalized_ptrs = pluck_ptrs(normalized_data, 0L, normalized_data.size());
138 
// Backend selection: at most one implementation runs, tracked by did_execute.
// With HAVE_ONEDAL, both ONEAPI and DEFAULT select the oneAPI implementation;
// onedal_kmeans_impl runs only when ONEDAL is requested explicitly.
139  bool did_execute = false;
140 #ifdef HAVE_ONEDAL
141  if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI ||
142  preferred_ml_framework == MLFramework::DEFAULT)) {
143  onedal_oneapi_kmeans_impl(normalized_ptrs,
144  denulled_output,
145  num_rows,
146  num_clusters,
147  num_iterations,
148  kmeans_init_strategy);
149  did_execute = true;
150  } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL)) {
151  onedal_kmeans_impl(normalized_ptrs,
152  denulled_output,
153  num_rows,
154  num_clusters,
155  num_iterations,
156  kmeans_init_strategy);
157  did_execute = true;
158  }
159 #endif
// mlpack handles explicit MLPACK requests, and DEFAULT when nothing ran above.
160 #ifdef HAVE_MLPACK
161  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
162  preferred_ml_framework == MLFramework::DEFAULT)) {
163  mlpack_kmeans_impl(normalized_ptrs,
164  denulled_output,
165  num_rows,
166  num_clusters,
167  num_iterations,
168  kmeans_init_strategy);
169  did_execute = true;
170  }
171 #endif
172  if (!did_execute) {
173  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
174  " ML library to support kmeans implementation.");
175  }
176 
// Results were produced for the masked row set only; expand them to the full
// output length.
177  if (data_is_masked) {
178  unmask_data(denulled_output,
179  denulled_data.reverse_index_map,
180  output_clusters.ptr_,
181  denulled_data.unmasked_num_rows,
182  inline_null_value<int32_t>());
183  }
// NOTE(review): the exception is caught by non-const reference;
// `const std::runtime_error&` is the conventional form.
184  } catch (std::runtime_error& e) {
185  return mgr.ERROR_MESSAGE(e.what());
186  }
187  return input_ids.size();
188 }
189 
190 // clang-format off
191 /*
192  UDTF: dbscan__cpu_template(TableFunctionManager,
193  Cursor<Column<K> input_ids, ColumnList<T> input_features> data,
194  double epsilon | require="epsilon > 0.0",
195  int32_t min_observations | require="min_observations > 0",
196  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
197  Column<K> id | input_id=args<0>, Column<int32_t> cluster_id,
198  K=[int64_t, TextEncodingDict], T=[double]
199  */
200 // clang-format on
201 
// DBSCAN clustering table function (SQL-facing signature: see the UDTF annotation
// above). Sizes the output to input_ids.size(), assigns input_ids to output_ids and
// fills output_clusters with one cluster id per input row.
//
// Steps: validate preferred_ml_framework_str; run denull_data on the features
// (presumably drops rows containing nulls - confirm); normalize with
// z_std_normalize_data; run the first compiled-in backend matching the requested
// framework with (epsilon, min_observations); when rows were masked out, expand the
// result back to the full row count via unmask_data.
//
// Returns input_ids.size() on success, otherwise the result of mgr.ERROR_MESSAGE(...)
// for an invalid framework name, a missing backend, or a std::runtime_error raised
// inside the try block.
//
// NOTE(review): the listing is missing the line between `NEVER_INLINE HOST int32_t`
// and the first visible parameter. It must supply the function name and the `mgr`
// parameter used below (presumably `dbscan__cpu_template(TableFunctionManager& mgr,`)
// - restore from the original header.
202 template <typename K, typename T>
203 NEVER_INLINE HOST int32_t
205  const Column<K>& input_ids,
206  const ColumnList<T>& input_features,
207  const double epsilon,
208  const int32_t min_observations,
209  const TextEncodingNone& preferred_ml_framework_str,
210  Column<K>& output_ids,
211  Column<int32_t>& output_clusters) {
212  mgr.set_output_row_size(input_ids.size());
213  output_ids = input_ids;
214 
215  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
216  if (preferred_ml_framework == MLFramework::INVALID) {
217  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
218  preferred_ml_framework_str.getString());
219  }
220 
221  try {
222  const auto denulled_data = denull_data(input_features);
223  const int64_t num_rows = denulled_data.masked_num_rows;
224  const bool data_is_masked =
225  denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
// A scratch buffer is only allocated when rows were masked out; otherwise the
// backend writes directly into output_clusters.
226  std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
227  int32_t* denulled_output =
228  data_is_masked ? denulled_output_allocation.data() : output_clusters.ptr_;
229 
230  // z_std_normalize_data can throw if std dev is 0
231  const auto normalized_data = z_std_normalize_data(denulled_data.data, num_rows);
232  const auto normalized_ptrs = pluck_ptrs(normalized_data, 0L, normalized_data.size());
233 
// Backend selection: at most one implementation runs, tracked by did_execute.
// With HAVE_ONEDAL, both ONEAPI and DEFAULT select the oneAPI implementation;
// onedal_dbscan_impl runs only when ONEDAL is requested explicitly.
234  bool did_execute = false;
235 #ifdef HAVE_ONEDAL
236  if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI ||
237  preferred_ml_framework == MLFramework::DEFAULT)) {
238  onedal_oneapi_dbscan_impl(
239  normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
240  did_execute = true;
241  } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL)) {
242  onedal_dbscan_impl(
243  normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
244  did_execute = true;
245  }
246 #endif
// mlpack handles explicit MLPACK requests, and DEFAULT when nothing ran above.
247 #ifdef HAVE_MLPACK
248  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
249  preferred_ml_framework == MLFramework::DEFAULT)) {
250  mlpack_dbscan_impl(
251  normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
252  did_execute = true;
253  }
254 #endif
255  if (!did_execute) {
256  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
257  " ML library to support dbscan implementation.");
258  }
259 
// Results were produced for the masked row set only; expand them to the full
// output length.
260  if (data_is_masked) {
261  unmask_data(denulled_output,
262  denulled_data.reverse_index_map,
263  output_clusters.ptr_,
264  denulled_data.unmasked_num_rows,
265  inline_null_value<int32_t>());
266  }
// NOTE(review): the exception is caught by non-const reference;
// `const std::runtime_error&` is the conventional form.
267  } catch (std::runtime_error& e) {
268  return mgr.ERROR_MESSAGE(e.what());
269  }
270  return input_ids.size();
271 }
272 
// Shared implementation behind the linear_reg_fit__cpu_template overloads in this
// file (they call it as `linear_reg_fit_impl(mgr, ...)`).
//
// Rejects empty training data and unknown framework names, runs denull_data on
// labels + features, fits input_features.numCols() + 1 coefficients with the first
// compiled-in backend that matches the requested framework, registers a
// LinearRegressionModel built from those coefficients under model_name in
// g_ml_models, and writes the model name into row 0 of output_model_name.
//
// cat_feature_keys is forwarded unchanged to the LinearRegressionModel constructor.
//
// Returns 1 on success, otherwise the result of mgr.ERROR_MESSAGE(...).
//
// NOTE(review): the listing is missing the line between `NEVER_INLINE HOST int32_t`
// and the first visible parameter; it must supply the function name and the `mgr`
// parameter - restore from the original header.
273 template <typename T>
274 NEVER_INLINE HOST int32_t
276  const TextEncodingNone& model_name,
277  const Column<T>& input_labels,
278  const ColumnList<T>& input_features,
279  const std::vector<std::vector<std::string>>& cat_feature_keys,
280  const TextEncodingNone& preferred_ml_framework_str,
281  const TextEncodingNone& model_metadata,
282  Column<TextEncodingDict>& output_model_name) {
283  if (input_labels.size() == 0) {
284  return mgr.ERROR_MESSAGE(
285  "No rows exist in training data. Training data must at least contain 1 row.");
286  }
287  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
288  if (preferred_ml_framework == MLFramework::INVALID) {
289  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
290  preferred_ml_framework_str.getString());
291  }
// denulled_data.data holds the label column at index 0 followed by the feature
// columns, which is why labels are plucked from [0, 1) and features from
// [1, numCols + 1).
292  const auto denulled_data = denull_data(input_labels, input_features);
293  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
294  const auto features_ptrs =
295  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
// One coefficient per feature plus one extra (presumably the intercept - confirm).
296  const int64_t num_coefs = input_features.numCols() + 1;
// NOTE(review): the output is sized to num_coefs rows although only row 0 is written
// and the function returns 1 - confirm this is intended.
297  mgr.set_output_row_size(num_coefs);
298  std::vector<int64_t> coef_idxs(num_coefs);
299  std::vector<double> coefs(num_coefs);
300  try {
301  bool did_execute = false;
302 #ifdef HAVE_ONEDAL
303  // FIXME: We default to legacy DAAL Linear Regression, as the oneAPI implementation
304  // seems to be experimental. It crashes on a few small toy models (such as datasets
305  // with 1 datapoint) and finds different coefficients for large models, when compared
306  // with the DAAL implementation. This should be revisited when oneDAL is updated.
307  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
308  preferred_ml_framework == MLFramework::DEFAULT)) {
309  onedal_linear_reg_fit_impl(labels_ptrs[0],
310  features_ptrs,
311  coef_idxs.data(),
312  coefs.data(),
313  denulled_data.masked_num_rows);
314  did_execute = true;
315  } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI)) {
316  onedal_oneapi_linear_reg_fit_impl(labels_ptrs[0],
317  features_ptrs,
318  coef_idxs.data(),
319  coefs.data(),
320  denulled_data.masked_num_rows);
321  did_execute = true;
322  }
323 #endif
// mlpack handles explicit MLPACK requests, and DEFAULT when nothing ran above.
324 #ifdef HAVE_MLPACK
325  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
326  preferred_ml_framework == MLFramework::DEFAULT)) {
327  mlpack_linear_reg_fit_impl(labels_ptrs[0],
328  features_ptrs,
329  coef_idxs.data(),
330  coefs.data(),
331  denulled_data.masked_num_rows);
332  did_execute = true;
333  }
334 #endif
335  if (!did_execute) {
336  return mgr.ERROR_MESSAGE(
337  "Cannot find " + preferred_ml_framework_str.getString() +
338  " ML library to support linear regression implementation.");
339  }
340  } catch (std::runtime_error& e) {
341  return mgr.ERROR_MESSAGE(e.what());
342  }
343  auto model =
344  std::make_shared<LinearRegressionModel>(coefs, model_metadata, cat_feature_keys);
345  g_ml_models.addModel(model_name, model);
// NOTE(review): model_name_str is never read in the remainder of this function.
346  const std::string model_name_str = model_name.getString();
347  const TextEncodingDict model_name_str_id =
348  output_model_name.getOrAddTransient(model_name);
349  output_model_name[0] = model_name_str_id;
350  return 1;
351 }
352 
353 // clang-format off
354 /*
355  UDTF: linear_reg_fit__cpu_template(TableFunctionManager,
356  TextEncodingNone model_name,
357  Cursor<Column<T> labels, ColumnList<T> features> data,
358  TextEncodingNone preferred_ml_framework | default="DEFAULT",
359  TextEncodingNone model_metadata | default="DEFAULT") ->
360  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
361  */
362 // clang-format on
363 
// Linear regression fit for numeric-only features (SQL-facing signature: see the UDTF
// annotation above). Forwards every argument to linear_reg_fit_impl together with an
// empty categorical-key list, and returns whatever that call returns.
//
// NOTE(review): the listing is missing the line between `NEVER_INLINE HOST int32_t`
// and the first visible parameter; it must supply the function name and the `mgr`
// parameter (presumably `linear_reg_fit__cpu_template(TableFunctionManager& mgr,`)
// - restore from the original header.
364 template <typename T>
365 NEVER_INLINE HOST int32_t
367  const TextEncodingNone& model_name,
368  const Column<T>& input_labels,
369  const ColumnList<T>& input_features,
370  const TextEncodingNone& preferred_ml_framework_str,
371  const TextEncodingNone& model_metadata,
372  Column<TextEncodingDict>& output_model_name) {
373  std::vector<std::vector<std::string>> empty_cat_feature_keys;
374  return linear_reg_fit_impl(mgr,
375  model_name,
376  input_labels,
377  input_features,
378  empty_cat_feature_keys,
379  preferred_ml_framework_str,
380  model_metadata,
381  output_model_name);
382 }
383 
// Helper used in this file as CategoricalFeaturesBuilder<T>: one-hot encodes
// categorical (TextEncodingDict) feature columns and exposes them, followed by any
// numeric feature columns, as a single ColumnList<T>.
//
// The pointers gathered in col_ptrs_ refer to the encoded buffers stored in this
// object and to the caller's numeric_features columns, so a ColumnList built from
// them must not be used after either has gone away.
//
// NOTE(review): this listing has dropped several lines of the class: the class header
// itself, the opening line of each constructor, the declaration line for
// `one_hot_encoding_info`, the signature of the accessor returning ColumnList<T>
// (callers in this file invoke it as getFeatures()), and the name of the member
// declared after num_rows_ (used below as one_hot_encoded_cols_). Restore them from
// the original header before compiling.
384 template <typename T>
386  public:
// Constructor taking categorical + numeric features with encoding settings. Every
// categorical column is encoded with the same (cat_top_k, cat_min_fraction,
// cat_include_others) settings, and the category keys reported by the encoder are
// recorded in cat_feature_keys_.
388  const ColumnList<T>& numeric_features,
389  const int32_t cat_top_k,
390  const float cat_min_fraction,
391  const bool cat_include_others)
392  : num_rows_(numeric_features.size()) {
394  one_hot_encoding_info(cat_top_k, cat_min_fraction, cat_include_others);
395  const size_t num_cat_features = static_cast<size_t>(cat_features.numCols());
396  std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
397  one_hot_encoding_infos;
398  for (size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
399  one_hot_encoding_infos.emplace_back(one_hot_encoding_info);
400  }
401  one_hot_encoded_cols_ =
402  TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
403  cat_features, one_hot_encoding_infos);
// Encoded buffers are appended first, so they precede the numeric columns in the
// resulting ColumnList. Pointers are stored as int8_t* to match col_ptrs_.
404  for (auto& one_hot_encoded_col : one_hot_encoded_cols_) {
405  cat_feature_keys_.emplace_back(one_hot_encoded_col.cat_features);
406  for (auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
407  col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
408  }
409  }
410  const int64_t num_numeric_features = numeric_features.numCols();
411  for (int64_t numeric_feature_idx = 0; numeric_feature_idx < num_numeric_features;
412  ++numeric_feature_idx) {
413  col_ptrs_.emplace_back(numeric_features.ptrs_[numeric_feature_idx]);
414  }
415  }
416 
// Categorical-only variant of the constructor above: same encoding settings, no
// numeric columns appended; the row count comes from cat_features.
418  const int32_t cat_top_k,
419  const float cat_min_fraction,
420  const bool cat_include_others)
421  : num_rows_(cat_features.size()) {
423  one_hot_encoding_info(cat_top_k, cat_min_fraction, cat_include_others);
424  const size_t num_cat_features = static_cast<size_t>(cat_features.numCols());
425  std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
426  one_hot_encoding_infos;
427  for (size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
428  one_hot_encoding_infos.emplace_back(one_hot_encoding_info);
429  }
430  one_hot_encoded_cols_ =
431  TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
432  cat_features, one_hot_encoding_infos);
433  for (auto& one_hot_encoded_col : one_hot_encoded_cols_) {
434  cat_feature_keys_.emplace_back(one_hot_encoded_col.cat_features);
435  for (auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
436  col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
437  }
438  }
439  }
440 
// Constructor taking categorical + numeric features and a caller-supplied key list
// per categorical column (stored as cat_feature_keys_). Throws std::runtime_error
// when the number of categorical columns differs from the number of key lists.
442  const ColumnList<TextEncodingDict>& cat_features,
443  const ColumnList<T>& numeric_features,
444  const std::vector<std::vector<std::string>>& cat_feature_keys)
445  : num_rows_(numeric_features.size()), cat_feature_keys_(cat_feature_keys) {
446  const size_t num_cat_features = static_cast<size_t>(cat_features.numCols());
447  if (num_cat_features != cat_feature_keys_.size()) {
448  throw std::runtime_error(
449  "Number of provided categorical features does not match number of categorical "
450  "features in the model.");
451  }
452  std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
453  one_hot_encoding_infos;
454  for (size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
455  one_hot_encoding_infos.emplace_back(cat_feature_keys_[cat_idx]);
456  }
457  one_hot_encoded_cols_ =
458  TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
459  cat_features, one_hot_encoding_infos);
460  for (auto& one_hot_encoded_col : one_hot_encoded_cols_) {
461  for (auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
462  col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
463  }
464  }
465  const int64_t num_numeric_features = numeric_features.numCols();
466  for (int64_t numeric_feature_idx = 0; numeric_feature_idx < num_numeric_features;
467  ++numeric_feature_idx) {
468  col_ptrs_.emplace_back(numeric_features.ptrs_[numeric_feature_idx]);
469  }
470  }
471 
// Categorical-only variant of the caller-supplied-keys constructor above; throws the
// same std::runtime_error on a count mismatch.
473  const ColumnList<TextEncodingDict>& cat_features,
474  const std::vector<std::vector<std::string>>& cat_feature_keys)
475  : num_rows_(cat_features.size()), cat_feature_keys_(cat_feature_keys) {
476  const size_t num_cat_features = static_cast<size_t>(cat_features.numCols());
477  if (num_cat_features != cat_feature_keys_.size()) {
478  throw std::runtime_error(
479  "Number of provided categorical features does not match number of categorical "
480  "features in the model.");
481  }
482  std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
483  one_hot_encoding_infos;
484  for (size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
485  one_hot_encoding_infos.emplace_back(cat_feature_keys_[cat_idx]);
486  }
487  one_hot_encoded_cols_ =
488  TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
489  cat_features, one_hot_encoding_infos);
490  for (auto& one_hot_encoded_col : one_hot_encoded_cols_) {
491  for (auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
492  col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
493  }
494  }
495  }
496 
// Body of the accessor that builds a ColumnList<T> over the collected column
// pointers (col_ptrs_.size() columns of num_rows_ rows each).
498  return ColumnList<T>(
499  col_ptrs_.data(), static_cast<int64_t>(col_ptrs_.size()), num_rows_);
500  }
501 
// Returns the per-categorical-column key lists: either recorded from the encoder or
// supplied by the caller, depending on which constructor was used.
502  const std::vector<std::vector<std::string>>& getCatFeatureKeys() const {
503  return cat_feature_keys_;
504  }
505 
506  private:
// Row count passed to the ColumnList built by the accessor above.
507  int64_t num_rows_;
// Encoder output; owns the encoded buffers that col_ptrs_ points into.
508  std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodedCol<T>>
510  std::vector<std::vector<std::string>> cat_feature_keys_;
// Column pointers in output order: encoded categorical buffers, then numeric columns.
511  std::vector<int8_t*> col_ptrs_;
512 };
513 
514 // clang-format off
515 /*
516  UDTF: linear_reg_fit__cpu_template(TableFunctionManager,
517  TextEncodingNone model_name,
518  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features,
519  ColumnList<T> numeric_features> data,
520  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
521  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
522  TextEncodingNone preferred_ml_framework | default="DEFAULT",
523  TextEncodingNone model_metadata | default="DEFAULT") ->
524  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
525  */
526 // clang-format on
527 
// Linear regression fit for categorical + numeric features (SQL-facing signature: see
// the UDTF annotation above). One-hot encodes the categorical columns with
// CategoricalFeaturesBuilder (cat_include_others fixed to false) and delegates to
// linear_reg_fit_impl with the combined feature list and the recorded category keys.
//
// The builder is a local that stays alive for the whole linear_reg_fit_impl call, so
// the buffers behind the ColumnList passed to it remain valid for that call.
//
// NOTE(review): the listing is missing the line between `NEVER_INLINE HOST int32_t`
// and the first visible parameter; it must supply the function name and the `mgr`
// parameter (presumably `linear_reg_fit__cpu_template(TableFunctionManager& mgr,`)
// - restore from the original header.
528 template <typename T>
529 NEVER_INLINE HOST int32_t
531  const TextEncodingNone& model_name,
532  const Column<T>& input_labels,
533  const ColumnList<TextEncodingDict>& input_cat_features,
534  const ColumnList<T>& input_numeric_features,
535  const int32_t cat_top_k,
536  const float cat_min_fraction,
537  const TextEncodingNone& preferred_ml_framework_str,
538  const TextEncodingNone& model_metadata,
539  Column<TextEncodingDict>& output_model_name) {
540  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
541  input_numeric_features,
542  cat_top_k,
543  cat_min_fraction,
544  false /* cat_include_others */);
545 
546  return linear_reg_fit_impl(mgr,
547  model_name,
548  input_labels,
549  cat_features_builder.getFeatures(),
550  cat_features_builder.getCatFeatureKeys(),
551  preferred_ml_framework_str,
552  model_metadata,
553  output_model_name);
554 }
555 
556 // clang-format off
557 /*
558  UDTF: linear_reg_fit__cpu_template(TableFunctionManager,
559  TextEncodingNone model_name,
560  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features> data,
561  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
562  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
563  TextEncodingNone preferred_ml_framework | default="DEFAULT",
564  TextEncodingNone model_metadata | default="DEFAULT") ->
565  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
566  */
567 // clang-format on
568 
// Linear regression fit for categorical-only features (SQL-facing signature: see the
// UDTF annotation above). One-hot encodes the categorical columns with
// CategoricalFeaturesBuilder (cat_include_others fixed to false) and delegates to
// linear_reg_fit_impl with the encoded features and the recorded category keys.
//
// The builder is a local that stays alive for the whole linear_reg_fit_impl call, so
// the buffers behind the ColumnList passed to it remain valid for that call.
//
// NOTE(review): the listing is missing the line between `NEVER_INLINE HOST int32_t`
// and the first visible parameter; it must supply the function name and the `mgr`
// parameter (presumably `linear_reg_fit__cpu_template(TableFunctionManager& mgr,`)
// - restore from the original header.
569 template <typename T>
570 NEVER_INLINE HOST int32_t
572  const TextEncodingNone& model_name,
573  const Column<T>& input_labels,
574  const ColumnList<TextEncodingDict>& input_cat_features,
575  const int32_t cat_top_k,
576  const float cat_min_fraction,
577  const TextEncodingNone& preferred_ml_framework_str,
578  const TextEncodingNone& model_metadata,
579  Column<TextEncodingDict>& output_model_name) {
580  CategoricalFeaturesBuilder<T> cat_features_builder(
581  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
582 
583  return linear_reg_fit_impl(mgr,
584  model_name,
585  input_labels,
586  cat_features_builder.getFeatures(),
587  cat_features_builder.getCatFeatureKeys(),
588  preferred_ml_framework_str,
589  model_metadata,
590  output_model_name);
591 }
592 
593 template <typename T>
594 Column<T> create_wrapper_col(std::vector<T>& col_vec) {
595  Column<T> wrapper_col(col_vec.data(), static_cast<int64_t>(col_vec.size()));
596  return wrapper_col;
597 }
598 
599 // clang-format off
600 /*
601  UDTF: linear_reg_coefs__cpu_1(TableFunctionManager,
602  TextEncodingNone model_name) ->
603  Column<int64_t> coef_idx, Column<TextEncodingDict> feature | input_id=args<>,
604  Column<int64_t> sub_coef_idx, Column<TextEncodingDict> sub_feature | input_id=args<>,
605  Column<double> coef
606  */
607 // clang-format on
608 
// Parameters of the table function described by the UDTF annotation above
// (linear_reg_coefs__cpu_1): the model name as a string literal argument, followed by
// the five output columns listed in that annotation.
// NOTE(review): the listing omits the line(s) that open this declaration (return
// type, function name and leading parameter) - restore them from the original
// header before compiling.
611  const TextEncodingNone& model_name,
612  Column<int64_t>& output_coef_idx,
613  Column<TextEncodingDict>& output_feature,
614  Column<int64_t>& output_sub_coef_idx,
615  Column<TextEncodingDict>& output_sub_feature,
616  Column<double>& output_coef);
617 
618 // clang-format off
619 /*
620  UDTF: linear_reg_coefs__cpu_2(TableFunctionManager,
621  Cursor<Column<TextEncodingDict> name> model_name) ->
622  Column<int64_t> coef_idx, Column<TextEncodingDict> feature | input_id=args<>,
623  Column<int64_t> sub_coef_idx, Column<TextEncodingDict> sub_feature | input_id=args<>,
624  Column<double> coef
625  */
626 // clang-format on
627 
// Parameters of the table function described by the UDTF annotation above
// (linear_reg_coefs__cpu_2): same outputs as the string-argument variant, but the
// model name arrives as a dictionary-encoded text column.
// NOTE(review): the listing omits the line(s) that open this declaration (return
// type, function name and leading parameter) - restore them from the original
// header before compiling.
630  const Column<TextEncodingDict>& model_name,
631  Column<int64_t>& output_coef_idx,
632  Column<TextEncodingDict>& output_feature,
633  Column<int64_t>& output_sub_coef_idx,
634  Column<TextEncodingDict>& output_sub_feature,
635  Column<double>& output_coef);
636 
// Shared implementation behind the decision_tree_reg_fit__cpu_template overloads in
// this file (they call it as `decision_tree_reg_impl(mgr, ...)`).
//
// Rejects empty training data, unknown framework names and explicit MLPACK requests,
// runs denull_data on labels + features, and - when oneDAL is compiled in and the
// framework is ONEDAL or DEFAULT - calls onedal_decision_tree_reg_fit_impl<T> and
// writes the model name into row 0 of output_model_name.
//
// Returns 1 on success, otherwise the result of mgr.ERROR_MESSAGE(...).
//
// NOTE(review): the listing is missing the line between `NEVER_INLINE HOST int32_t`
// and the first visible parameter; it must supply the function name and the `mgr`
// parameter - restore from the original header.
637 template <typename T>
638 NEVER_INLINE HOST int32_t
640  const TextEncodingNone& model_name,
641  const Column<T>& input_labels,
642  const ColumnList<T>& input_features,
643  const std::vector<std::vector<std::string>>& cat_feature_keys,
644  const int64_t max_tree_depth,
645  const int64_t min_observations_per_leaf_node,
646  const TextEncodingNone& preferred_ml_framework_str,
647  const TextEncodingNone& model_metadata,
648  Column<TextEncodingDict>& output_model_name) {
649  if (input_labels.size() == 0) {
650  return mgr.ERROR_MESSAGE(
651  "No rows exist in training data. Training data must at least contain 1 row.");
652  }
653  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
654  if (preferred_ml_framework == MLFramework::INVALID) {
655  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
656  preferred_ml_framework_str.getString());
657  }
658  if (preferred_ml_framework == MLFramework::MLPACK) {
659  return mgr.ERROR_MESSAGE(
660  "Only OneDAL framework supported for decision tree regression.");
661  }
// Without oneDAL compiled in, every call returns here and the code below is
// unreachable.
662 #ifndef HAVE_ONEDAL
663  return mgr.ERROR_MESSAGE(
664  "Only OneDAL framework supported for decision tree regression.");
665 #endif
666 
// denulled_data.data holds the label column at index 0 followed by the feature
// columns.
667  const auto denulled_data = denull_data(input_labels, input_features);
668  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
669  const auto features_ptrs =
670  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
671  mgr.set_output_row_size(1);
672  try {
673  bool did_execute = false;
// Only ONEDAL and DEFAULT are handled; any other framework value that reaches this
// point (e.g. ONEAPI) falls through to the "Cannot find" error below.
674 #ifdef HAVE_ONEDAL
675  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
676  preferred_ml_framework == MLFramework::DEFAULT)) {
677  onedal_decision_tree_reg_fit_impl<T>(model_name,
678  labels_ptrs[0],
679  features_ptrs,
680  model_metadata,
681  cat_feature_keys,
682  denulled_data.masked_num_rows,
683  max_tree_depth,
684  min_observations_per_leaf_node);
685  const TextEncodingDict model_name_str_id =
686  output_model_name.getOrAddTransient(model_name);
687  output_model_name[0] = model_name_str_id;
688  did_execute = true;
689  }
690 #endif
691  if (!did_execute) {
692  return mgr.ERROR_MESSAGE(
693  "Cannot find " + preferred_ml_framework_str.getString() +
694  " ML library to support decision tree regression implementation.");
695  }
696  } catch (std::runtime_error& e) {
697  return mgr.ERROR_MESSAGE(e.what());
698  }
699  return 1;
700 }
701 
702 // clang-format off
703 /*
704  UDTF: decision_tree_reg_fit__cpu_template(TableFunctionManager,
705  TextEncodingNone model_name,
706  Cursor<Column<T> labels, ColumnList<T> features> data,
707  int64_t max_tree_depth | require="max_tree_depth >= 0" | default=0,
708  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node >= 0" | default=5,
709  TextEncodingNone preferred_ml_framework | default="DEFAULT",
710  TextEncodingNone model_metadata | default="DEFAULT") ->
711  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
712  */
713 // clang-format on
714 
// Decision tree regression fit for numeric-only features (SQL-facing signature: see
// the UDTF annotation above). Forwards every argument to decision_tree_reg_impl
// together with an empty categorical-key list, and returns whatever that call returns.
//
// NOTE(review): the listing is missing the line between `NEVER_INLINE HOST int32_t`
// and the first visible parameter; it must supply the function name and the `mgr`
// parameter (presumably
// `decision_tree_reg_fit__cpu_template(TableFunctionManager& mgr,`) - restore from
// the original header.
715 template <typename T>
716 NEVER_INLINE HOST int32_t
718  const TextEncodingNone& model_name,
719  const Column<T>& input_labels,
720  const ColumnList<T>& input_features,
721  const int64_t max_tree_depth,
722  const int64_t min_observations_per_leaf_node,
723  const TextEncodingNone& preferred_ml_framework_str,
724  const TextEncodingNone& model_metadata,
725  Column<TextEncodingDict>& output_model_name) {
726  std::vector<std::vector<std::string>> empty_cat_feature_keys;
727  return decision_tree_reg_impl(mgr,
728  model_name,
729  input_labels,
730  input_features,
731  empty_cat_feature_keys,
732  max_tree_depth,
733  min_observations_per_leaf_node,
734  preferred_ml_framework_str,
735  model_metadata,
736  output_model_name);
737 }
738 
739 // clang-format off
740 /*
741  UDTF: decision_tree_reg_fit__cpu_template(TableFunctionManager,
742  TextEncodingNone model_name,
743  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features, ColumnList<T> numeric_features> data,
744  int64_t max_tree_depth | require="max_tree_depth >= 0" | default=0,
745  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node >= 0" | default=5,
746  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
747  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
748  TextEncodingNone preferred_ml_framework | default="DEFAULT",
749  TextEncodingNone model_metadata | default="DEFAULT") ->
750  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
751  */
752 // clang-format on
753 
// Decision tree regression fit for categorical + numeric features (SQL-facing
// signature: see the UDTF annotation above). One-hot encodes the categorical columns
// with CategoricalFeaturesBuilder (cat_include_others fixed to false) and delegates to
// decision_tree_reg_impl with the combined feature list and the recorded category
// keys.
//
// NOTE(review): the listing is missing the lines between `template <typename T>` and
// the first visible parameter; they must supply the return type, the function name
// and the `mgr` parameter - restore from the original header.
754 template <typename T>
757  const TextEncodingNone& model_name,
758  const Column<T>& input_labels,
759  const ColumnList<TextEncodingDict>& input_cat_features,
760  const ColumnList<T>& input_numeric_features,
761  const int64_t max_tree_depth,
762  const int64_t min_observations_per_leaf_node,
763  const int32_t cat_top_k,
764  const float cat_min_fraction,
765  const TextEncodingNone& preferred_ml_framework_str,
766  const TextEncodingNone& model_metadata,
767  Column<TextEncodingDict>& output_model_name) {
// NOTE(review): empty_cat_feature_keys is never read in this function; the keys
// passed on come from the builder.
768  std::vector<std::vector<std::string>> empty_cat_feature_keys;
769  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
770  input_numeric_features,
771  cat_top_k,
772  cat_min_fraction,
773  false /* cat_include_others */);
774  return decision_tree_reg_impl(mgr,
775  model_name,
776  input_labels,
777  cat_features_builder.getFeatures(),
778  cat_features_builder.getCatFeatureKeys(),
779  max_tree_depth,
780  min_observations_per_leaf_node,
781  preferred_ml_framework_str,
782  model_metadata,
783  output_model_name);
784 }
785 
786 // clang-format off
787 /*
788  UDTF: decision_tree_reg_fit__cpu_template(TableFunctionManager,
789  TextEncodingNone model_name,
790  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features> data,
791  int64_t max_tree_depth | require="max_tree_depth >= 0" | default=0,
792  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node >= 0" | default=5,
793  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
794  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
795  TextEncodingNone preferred_ml_framework | default="DEFAULT",
796  TextEncodingNone model_metadata | default="DEFAULT") ->
797  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
798  */
799 // clang-format on
800 
// Decision tree regression fit for categorical-only features (SQL-facing signature:
// see the UDTF annotation above). One-hot encodes the categorical columns with
// CategoricalFeaturesBuilder (cat_include_others fixed to false) and delegates to
// decision_tree_reg_impl with the encoded features and the recorded category keys.
//
// NOTE(review): the listing is missing the lines between `template <typename T>` and
// the first visible parameter; they must supply the return type, the function name
// and the `mgr` parameter - restore from the original header.
801 template <typename T>
804  const TextEncodingNone& model_name,
805  const Column<T>& input_labels,
806  const ColumnList<TextEncodingDict>& input_cat_features,
807  const int64_t max_tree_depth,
808  const int64_t min_observations_per_leaf_node,
809  const int32_t cat_top_k,
810  const float cat_min_fraction,
811  const TextEncodingNone& preferred_ml_framework_str,
812  const TextEncodingNone& model_metadata,
813  Column<TextEncodingDict>& output_model_name) {
// NOTE(review): empty_cat_feature_keys is never read in this function; the keys
// passed on come from the builder.
814  std::vector<std::vector<std::string>> empty_cat_feature_keys;
815  CategoricalFeaturesBuilder<T> cat_features_builder(
816  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
817  return decision_tree_reg_impl(mgr,
818  model_name,
819  input_labels,
820  cat_features_builder.getFeatures(),
821  cat_features_builder.getCatFeatureKeys(),
822  max_tree_depth,
823  min_observations_per_leaf_node,
824  preferred_ml_framework_str,
825  model_metadata,
826  output_model_name);
827 }
828 
// Shared implementation for fitting a gradient-boosted-tree (GBT) regression model.
//
// Rejects empty training data, unknown framework names and explicit MLPACK requests,
// runs denull_data on labels + features, and - when oneDAL is compiled in and the
// framework is ONEDAL or DEFAULT - calls onedal_gbt_reg_fit_impl<T> with the tuning
// parameters passed through unchanged, then writes the model name into row 0 of
// output_model_name.
//
// Returns 1 on success, otherwise the result of mgr.ERROR_MESSAGE(...).
//
// NOTE(review): the listing is missing the line between `NEVER_INLINE HOST int32_t`
// and the first visible parameter; it must supply the function name and the `mgr`
// parameter. The name is not recoverable from the visible code - restore from the
// original header.
829 template <typename T>
830 NEVER_INLINE HOST int32_t
832  const TextEncodingNone& model_name,
833  const Column<T>& input_labels,
834  const ColumnList<T>& input_features,
835  const std::vector<std::vector<std::string>>& cat_feature_keys,
836  const int64_t max_iterations,
837  const int64_t max_tree_depth,
838  const double shrinkage,
839  const double min_split_loss,
840  const double lambda,
841  const double obs_per_tree_fraction,
842  const int64_t features_per_node,
843  const int64_t min_observations_per_leaf_node,
844  const int64_t max_bins,
845  const int64_t min_bin_size,
846  const TextEncodingNone& preferred_ml_framework_str,
847  const TextEncodingNone& model_metadata,
848  Column<TextEncodingDict>& output_model_name) {
849  if (input_labels.size() == 0) {
850  return mgr.ERROR_MESSAGE(
851  "No rows exist in training data. Training data must at least contain 1 row.");
852  }
853  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
854  if (preferred_ml_framework == MLFramework::INVALID) {
855  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
856  preferred_ml_framework_str.getString());
857  }
858  if (preferred_ml_framework == MLFramework::MLPACK) {
859  return mgr.ERROR_MESSAGE("Only OneDAL framework supported for GBT regression.");
860  }
// Without oneDAL compiled in, every call returns here and the code below is
// unreachable.
861 #ifndef HAVE_ONEDAL
862  return mgr.ERROR_MESSAGE("Only OneDAL framework supported for GBT regression.");
863 #endif
864 
// denulled_data.data holds the label column at index 0 followed by the feature
// columns.
865  const auto denulled_data = denull_data(input_labels, input_features);
866  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
867  const auto features_ptrs =
868  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
869  mgr.set_output_row_size(1);
870  try {
871  bool did_execute = false;
// Only ONEDAL and DEFAULT are handled; any other framework value that reaches this
// point (e.g. ONEAPI) falls through to the "Cannot find" error below.
872 #ifdef HAVE_ONEDAL
873  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
874  preferred_ml_framework == MLFramework::DEFAULT)) {
875  onedal_gbt_reg_fit_impl<T>(model_name,
876  labels_ptrs[0],
877  features_ptrs,
878  model_metadata,
879  cat_feature_keys,
880  denulled_data.masked_num_rows,
881  max_iterations,
882  max_tree_depth,
883  shrinkage,
884  min_split_loss,
885  lambda,
886  obs_per_tree_fraction,
887  features_per_node,
888  min_observations_per_leaf_node,
889  max_bins,
890  min_bin_size);
891  const TextEncodingDict model_name_str_id =
892  output_model_name.getOrAddTransient(model_name);
893  output_model_name[0] = model_name_str_id;
894  did_execute = true;
895  }
896 #endif
897  if (!did_execute) {
898  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
899  " ML library to support GBT regression implementation.");
900  }
901  } catch (std::runtime_error& e) {
902  return mgr.ERROR_MESSAGE(e.what());
903  }
904  return 1;
905 }
906 
907 // clang-format off
908 /*
909  UDTF: gbt_reg_fit__cpu_template(TableFunctionManager,
910  TextEncodingNone model_name,
911  Cursor<Column<T> labels, ColumnList<T> features> data,
912  int64_t max_iterations | require="max_iterations > 0" | default=50,
913  int64_t max_tree_depth | require="max_tree_depth > 0" | default=6,
914  double shrinkage | require="shrinkage > 0.0" | require="shrinkage <= 1.0" | default=0.3,
915  double min_split_loss | require="min_split_loss >= 0.0" | default=0.0,
916  double lambda | require="lambda >= 0.0" | default=1.0,
917  double obs_per_tree_fraction | require="obs_per_tree_fraction > 0.0" | require="obs_per_tree_fraction <= 1.0" | default=1.0,
918  int64_t features_per_node | require="features_per_node >= 0" | default=0,
919  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node > 0" | default=5,
920  int64_t max_bins | require="max_bins > 0" | default=256,
921  int64_t min_bin_size | require="min_bin_size >= 0" | default=5,
922  TextEncodingNone preferred_ml_framework | default="DEFAULT",
923  TextEncodingNone model_metadata | default="DEFAULT") ->
924  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
925  */
926 // clang-format on
927 
928 template <typename T>
929 NEVER_INLINE HOST int32_t
931  const TextEncodingNone& model_name,
932  const Column<T>& input_labels,
933  const ColumnList<T>& input_features,
934  const int64_t max_iterations,
935  const int64_t max_tree_depth,
936  const double shrinkage,
937  const double min_split_loss,
938  const double lambda,
939  const double obs_per_tree_fraction,
940  const int64_t features_per_node,
941  const int64_t min_observations_per_leaf_node,
942  const int64_t max_bins,
943  const int64_t min_bin_size,
944  const TextEncodingNone& preferred_ml_framework_str,
945  const TextEncodingNone& model_metadata,
946  Column<TextEncodingDict>& output_model_name) {
947  std::vector<std::vector<std::string>> empty_cat_feature_keys;
948  return gbt_reg_fit_impl(mgr,
949  model_name,
950  input_labels,
951  input_features,
952  empty_cat_feature_keys,
953  max_iterations,
954  max_tree_depth,
955  shrinkage,
956  min_split_loss,
957  lambda,
958  obs_per_tree_fraction,
959  features_per_node,
960  min_observations_per_leaf_node,
961  max_bins,
962  min_bin_size,
963  preferred_ml_framework_str,
964  model_metadata,
965  output_model_name);
966 }
967 
968 // clang-format off
969 /*
970  UDTF: gbt_reg_fit__cpu_template(TableFunctionManager,
971  TextEncodingNone model_name,
972  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features, ColumnList<T> numeric_features> data,
973  int64_t max_iterations | require="max_iterations > 0" | default=50,
974  int64_t max_tree_depth | require="max_tree_depth > 0" | default=6,
975  double shrinkage | require="shrinkage > 0.0" | require="shrinkage <= 1.0" | default=0.3,
976  double min_split_loss | require="min_split_loss >= 0.0" | default=0.0,
977  double lambda | require="lambda >= 0.0" | default=1.0,
978  double obs_per_tree_fraction | require="obs_per_tree_fraction > 0.0" | require="obs_per_tree_fraction <= 1.0" | default=1.0,
979  int64_t features_per_node | require="features_per_node >= 0" | default=0,
980  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node > 0" | default=5,
981  int64_t max_bins | require="max_bins > 0" | default=256,
982  int64_t min_bin_size | require="min_bin_size >= 0" | default=5,
983  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
984  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
985  TextEncodingNone preferred_ml_framework | default="DEFAULT",
986  TextEncodingNone model_metadata | default="DEFAULT") ->
987  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
988  */
989 // clang-format on
990 
991 template <typename T>
992 NEVER_INLINE HOST int32_t
994  const TextEncodingNone& model_name,
995  const Column<T>& input_labels,
996  const ColumnList<TextEncodingDict>& input_cat_features,
997  const ColumnList<T>& input_numeric_features,
998  const int64_t max_iterations,
999  const int64_t max_tree_depth,
1000  const double shrinkage,
1001  const double min_split_loss,
1002  const double lambda,
1003  const double obs_per_tree_fraction,
1004  const int64_t features_per_node,
1005  const int64_t min_observations_per_leaf_node,
1006  const int64_t max_bins,
1007  const int64_t min_bin_size,
1008  const int32_t cat_top_k,
1009  const float cat_min_fraction,
1010  const TextEncodingNone& preferred_ml_framework_str,
1011  const TextEncodingNone& model_metadata,
1012  Column<TextEncodingDict>& output_model_name) {
1013  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
1014  input_numeric_features,
1015  cat_top_k,
1016  cat_min_fraction,
1017  false /* cat_include_others */);
1018  return gbt_reg_fit_impl(mgr,
1019  model_name,
1020  input_labels,
1021  cat_features_builder.getFeatures(),
1022  cat_features_builder.getCatFeatureKeys(),
1023  max_iterations,
1024  max_tree_depth,
1025  shrinkage,
1026  min_split_loss,
1027  lambda,
1028  obs_per_tree_fraction,
1029  features_per_node,
1030  min_observations_per_leaf_node,
1031  max_bins,
1032  min_bin_size,
1033  preferred_ml_framework_str,
1034  model_metadata,
1035  output_model_name);
1036 }
1037 
1038 // clang-format off
1039 /*
1040  UDTF: gbt_reg_fit__cpu_template(TableFunctionManager,
1041  TextEncodingNone model_name,
1042  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features> data,
1043  int64_t max_iterations | require="max_iterations > 0" | default=50,
1044  int64_t max_tree_depth | require="max_tree_depth > 0" | default=6,
1045  double shrinkage | require="shrinkage > 0.0" | require="shrinkage <= 1.0" | default=0.3,
1046  double min_split_loss | require="min_split_loss >= 0.0" | default=0.0,
1047  double lambda | require="lambda >= 0.0" | default=1.0,
1048  double obs_per_tree_fraction | require="obs_per_tree_fraction > 0.0" | require="obs_per_tree_fraction <= 1.0" | default=1.0,
1049  int64_t features_per_node | require="features_per_node >= 0" | default=0,
1050  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node > 0" | default=5,
1051  int64_t max_bins | require="max_bins > 0" | default=256,
1052  int64_t min_bin_size | require="min_bin_size >= 0" | default=5,
1053  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
1054  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
1055  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1056  TextEncodingNone model_metadata | default="DEFAULT") ->
1057  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
1058  */
1059 // clang-format on
1060 
1061 template <typename T>
1062 NEVER_INLINE HOST int32_t
1064  const TextEncodingNone& model_name,
1065  const Column<T>& input_labels,
1066  const ColumnList<TextEncodingDict>& input_cat_features,
1067  const int64_t max_iterations,
1068  const int64_t max_tree_depth,
1069  const double shrinkage,
1070  const double min_split_loss,
1071  const double lambda,
1072  const double obs_per_tree_fraction,
1073  const int64_t features_per_node,
1074  const int64_t min_observations_per_leaf_node,
1075  const int64_t max_bins,
1076  const int64_t min_bin_size,
1077  const int32_t cat_top_k,
1078  const float cat_min_fraction,
1079  const TextEncodingNone& preferred_ml_framework_str,
1080  const TextEncodingNone& model_metadata,
1081  Column<TextEncodingDict>& output_model_name) {
1082  CategoricalFeaturesBuilder<T> cat_features_builder(
1083  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
1084  return gbt_reg_fit_impl(mgr,
1085  model_name,
1086  input_labels,
1087  cat_features_builder.getFeatures(),
1088  cat_features_builder.getCatFeatureKeys(),
1089  max_iterations,
1090  max_tree_depth,
1091  shrinkage,
1092  min_split_loss,
1093  lambda,
1094  obs_per_tree_fraction,
1095  features_per_node,
1096  min_observations_per_leaf_node,
1097  max_bins,
1098  min_bin_size,
1099  preferred_ml_framework_str,
1100  model_metadata,
1101  output_model_name);
1102 }
1103 
1104 template <typename T>
1105 NEVER_INLINE HOST int32_t
1107  const TextEncodingNone& model_name,
1108  const Column<T>& input_labels,
1109  const ColumnList<T>& input_features,
1110  const std::vector<std::vector<std::string>>& cat_feature_keys,
1111  const int64_t num_trees,
1112  const double obs_per_tree_fraction,
1113  const int64_t max_tree_depth,
1114  const int64_t features_per_node,
1115  const double impurity_threshold,
1116  const bool bootstrap,
1117  const int64_t min_obs_per_leaf_node,
1118  const int64_t min_obs_per_split_node,
1119  const double min_weight_fraction_in_leaf_node,
1120  const double min_impurity_decrease_in_split_node,
1121  const int64_t max_leaf_nodes,
1122  const bool use_histogram,
1123  const TextEncodingNone& var_importance_metric_str,
1124  const TextEncodingNone& preferred_ml_framework_str,
1125  const TextEncodingNone& model_metadata,
1126  Column<TextEncodingDict>& output_model_name) {
1127  if (input_labels.size() == 0) {
1128  return mgr.ERROR_MESSAGE(
1129  "No rows exist in training data. Training data must at least contain 1 row.");
1130  }
1131  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
1132  if (preferred_ml_framework == MLFramework::INVALID) {
1133  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
1134  preferred_ml_framework_str.getString());
1135  }
1136  if (preferred_ml_framework == MLFramework::MLPACK) {
1137  return mgr.ERROR_MESSAGE(
1138  "Only OneDAL framework supported for random forest regression.");
1139  }
1140 #ifndef HAVE_ONEDAL
1141  return mgr.ERROR_MESSAGE(
1142  "Only OneDAL framework supported for random forest regression.");
1143 #endif
1144 
1145  const auto denulled_data = denull_data(input_labels, input_features);
1146  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
1147  const auto features_ptrs =
1148  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
1149  mgr.set_output_row_size(1);
1150  try {
1151  bool did_execute = false;
1152  const auto var_importance_metric =
1153  get_var_importance_metric(var_importance_metric_str);
1154  if (var_importance_metric == VarImportanceMetric::INVALID) {
1155  return mgr.ERROR_MESSAGE("Invalid variable importance metric: " +
1156  var_importance_metric_str.getString());
1157  }
1158 #ifdef HAVE_ONEDAL
1159  if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI ||
1160  preferred_ml_framework == MLFramework::DEFAULT)) {
1161  if (use_histogram) {
1162  onedal_oneapi_random_forest_reg_fit_impl<
1163  T,
1164  oneapi::dal::decision_forest::method::hist>(
1165  model_name,
1166  labels_ptrs[0],
1167  features_ptrs,
1168  model_metadata,
1169  cat_feature_keys,
1170  denulled_data.masked_num_rows,
1171  num_trees,
1172  obs_per_tree_fraction,
1173  max_tree_depth,
1174  features_per_node,
1175  impurity_threshold,
1176  bootstrap,
1177  min_obs_per_leaf_node,
1178  min_obs_per_split_node,
1179  min_weight_fraction_in_leaf_node,
1180  min_impurity_decrease_in_split_node,
1181  max_leaf_nodes,
1182  var_importance_metric);
1183  } else {
1184  onedal_oneapi_random_forest_reg_fit_impl<
1185  T,
1186  oneapi::dal::decision_forest::method::dense>(
1187  model_name,
1188  labels_ptrs[0],
1189  features_ptrs,
1190  model_metadata,
1191  cat_feature_keys,
1192  denulled_data.masked_num_rows,
1193  num_trees,
1194  obs_per_tree_fraction,
1195  max_tree_depth,
1196  features_per_node,
1197  impurity_threshold,
1198  bootstrap,
1199  min_obs_per_leaf_node,
1200  min_obs_per_split_node,
1201  min_weight_fraction_in_leaf_node,
1202  min_impurity_decrease_in_split_node,
1203  max_leaf_nodes,
1204  var_importance_metric);
1205  }
1206  const TextEncodingDict model_name_str_id =
1207  output_model_name.getOrAddTransient(model_name);
1208  output_model_name[0] = model_name_str_id;
1209  did_execute = true;
1210  } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL)) {
1211  if (use_histogram) {
1212  onedal_random_forest_reg_fit_impl<T, decision_forest::regression::training::hist>(
1213  model_name,
1214  labels_ptrs[0],
1215  features_ptrs,
1216  model_metadata,
1217  cat_feature_keys,
1218  denulled_data.masked_num_rows,
1219  num_trees,
1220  obs_per_tree_fraction,
1221  max_tree_depth,
1222  features_per_node,
1223  impurity_threshold,
1224  bootstrap,
1225  min_obs_per_leaf_node,
1226  min_obs_per_split_node,
1227  min_weight_fraction_in_leaf_node,
1228  min_impurity_decrease_in_split_node,
1229  max_leaf_nodes,
1230  var_importance_metric);
1231  } else {
1232  onedal_random_forest_reg_fit_impl<
1233  T,
1234  decision_forest::regression::training::defaultDense>(
1235  model_name,
1236  labels_ptrs[0],
1237  features_ptrs,
1238  model_metadata,
1239  cat_feature_keys,
1240  denulled_data.masked_num_rows,
1241  num_trees,
1242  obs_per_tree_fraction,
1243  max_tree_depth,
1244  features_per_node,
1245  impurity_threshold,
1246  bootstrap,
1247  min_obs_per_leaf_node,
1248  min_obs_per_split_node,
1249  min_weight_fraction_in_leaf_node,
1250  min_impurity_decrease_in_split_node,
1251  max_leaf_nodes,
1252  var_importance_metric);
1253  }
1254  const TextEncodingDict model_name_str_id =
1255  output_model_name.getOrAddTransient(model_name);
1256  output_model_name[0] = model_name_str_id;
1257  did_execute = true;
1258  }
1259 #endif
1260  if (!did_execute) {
1261  return mgr.ERROR_MESSAGE(
1262  "Cannot find " + preferred_ml_framework_str.getString() +
1263  " ML library to support random forest regression implementation.");
1264  }
1265  } catch (std::runtime_error& e) {
1266  return mgr.ERROR_MESSAGE(e.what());
1267  }
1268  return 1;
1269 }
1270 
1271 // clang-format off
1272 /*
1273  UDTF: random_forest_reg_fit__cpu_template(TableFunctionManager,
1274  TextEncodingNone model_name,
1275  Cursor<Column<T> labels, ColumnList<T> features> data,
1276  int64_t num_trees | require="num_trees > 0" | default=10,
1277  double obs_per_tree_fraction | require="obs_per_tree_fraction > 0.0" | require="obs_per_tree_fraction <= 1.0" | default=1.0,
1278  int64_t max_tree_depth | require="max_tree_depth >= 0" | default=0,
1279  int64_t features_per_node | require="features_per_node >= 0" | default=0,
1280  double impurity_threshold | require="impurity_threshold >= 0.0" | default=0.0,
1281  bool bootstrap | default=true,
1282  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node > 0" | default=5,
1283  int64_t min_obs_per_split_node | require="min_obs_per_split_node > 0" | default=2,
1284  double min_weight_fraction_in_leaf_node | require="min_weight_fraction_in_leaf_node >= 0.0" | default=0.0,
1285  double min_impurity_decrease_in_split_node | require="min_impurity_decrease_in_split_node >= 0.0" | default=0.0,
1286  int64_t max_leaf_nodes | require="max_leaf_nodes >=0" | default=0,
1287  bool use_histogram | default=false,
1288  TextEncodingNone var_importance_metric | default="MDI",
1289  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1290  TextEncodingNone model_metadata | default="DEFAULT") ->
1291  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
1292  */
1293 // clang-format on
1294 
1295 template <typename T>
1296 NEVER_INLINE HOST int32_t
1298  const TextEncodingNone& model_name,
1299  const Column<T>& input_labels,
1300  const ColumnList<T>& input_features,
1301  const int64_t num_trees,
1302  const double obs_per_tree_fraction,
1303  const int64_t max_tree_depth,
1304  const int64_t features_per_node,
1305  const double impurity_threshold,
1306  const bool bootstrap,
1307  const int64_t min_obs_per_leaf_node,
1308  const int64_t min_obs_per_split_node,
1309  const double min_weight_fraction_in_leaf_node,
1310  const double min_impurity_decrease_in_split_node,
1311  const int64_t max_leaf_nodes,
1312  const bool use_histogram,
1313  const TextEncodingNone& var_importance_metric_str,
1314  const TextEncodingNone& preferred_ml_framework_str,
1315  const TextEncodingNone& model_metadata,
1316  Column<TextEncodingDict>& output_model_name) {
1317  std::vector<std::vector<std::string>> empty_cat_feature_keys;
1318  return random_forest_reg_fit_impl(mgr,
1319  model_name,
1320  input_labels,
1321  input_features,
1322  empty_cat_feature_keys,
1323  num_trees,
1324  obs_per_tree_fraction,
1325  max_tree_depth,
1326  features_per_node,
1327  impurity_threshold,
1328  bootstrap,
1329  min_obs_per_leaf_node,
1330  min_obs_per_split_node,
1331  min_weight_fraction_in_leaf_node,
1332  min_impurity_decrease_in_split_node,
1333  max_leaf_nodes,
1334  use_histogram,
1335  var_importance_metric_str,
1336  preferred_ml_framework_str,
1337  model_metadata,
1338  output_model_name);
1339 }
1340 
1341 // clang-format off
1342 /*
1343  UDTF: random_forest_reg_fit__cpu_template(TableFunctionManager,
1344  TextEncodingNone model_name,
1345  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features, ColumnList<T> numeric_features> data,
1346  int64_t num_trees | require="num_trees > 0" | default=10,
1347  double obs_per_tree_fraction | require="obs_per_tree_fraction > 0.0" | require="obs_per_tree_fraction <= 1.0" | default=1.0,
1348  int64_t max_tree_depth | require="max_tree_depth >= 0" | default=0,
1349  int64_t features_per_node | require="features_per_node >= 0" | default=0,
1350  double impurity_threshold | require="impurity_threshold >= 0.0" | default=0.0,
1351  bool bootstrap | default=true,
1352  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node > 0" | default=5,
1353  int64_t min_obs_per_split_node | require="min_obs_per_split_node > 0" | default=2,
1354  double min_weight_fraction_in_leaf_node | require="min_weight_fraction_in_leaf_node >= 0.0" | default=0.0,
1355  double min_impurity_decrease_in_split_node | require="min_impurity_decrease_in_split_node >= 0.0" | default=0.0,
1356  int64_t max_leaf_nodes | require="max_leaf_nodes >=0" | default=0,
1357  bool use_histogram | default=false,
1358  TextEncodingNone var_importance_metric | default="MDI",
1359  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
1360  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
1361  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1362  TextEncodingNone model_metadata | default="DEFAULT") ->
1363  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
1364  */
1365 // clang-format on
1366 
1367 template <typename T>
1369  TableFunctionManager& mgr,
1370  const TextEncodingNone& model_name,
1371  const Column<T>& input_labels,
1372  const ColumnList<TextEncodingDict>& input_cat_features,
1373  const ColumnList<T>& input_numeric_features,
1374  const int64_t num_trees,
1375  const double obs_per_tree_fraction,
1376  const int64_t max_tree_depth,
1377  const int64_t features_per_node,
1378  const double impurity_threshold,
1379  const bool bootstrap,
1380  const int64_t min_obs_per_leaf_node,
1381  const int64_t min_obs_per_split_node,
1382  const double min_weight_fraction_in_leaf_node,
1383  const double min_impurity_decrease_in_split_node,
1384  const int64_t max_leaf_nodes,
1385  const bool use_histogram,
1386  const TextEncodingNone& var_importance_metric_str,
1387  const int32_t cat_top_k,
1388  const float cat_min_fraction,
1389  const TextEncodingNone& preferred_ml_framework_str,
1390  const TextEncodingNone& model_metadata,
1391  Column<TextEncodingDict>& output_model_name) {
1392  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
1393  input_numeric_features,
1394  cat_top_k,
1395  cat_min_fraction,
1396  false /* cat_include_others */);
1397  return random_forest_reg_fit_impl(mgr,
1398  model_name,
1399  input_labels,
1400  cat_features_builder.getFeatures(),
1401  cat_features_builder.getCatFeatureKeys(),
1402  num_trees,
1403  obs_per_tree_fraction,
1404  max_tree_depth,
1405  features_per_node,
1406  impurity_threshold,
1407  bootstrap,
1408  min_obs_per_leaf_node,
1409  min_obs_per_split_node,
1410  min_weight_fraction_in_leaf_node,
1411  min_impurity_decrease_in_split_node,
1412  max_leaf_nodes,
1413  use_histogram,
1414  var_importance_metric_str,
1415  preferred_ml_framework_str,
1416  model_metadata,
1417  output_model_name);
1418 }
1419 
1420 // clang-format off
1421 /*
1422  UDTF: random_forest_reg_fit__cpu_template(TableFunctionManager,
1423  TextEncodingNone model_name,
1424  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features> data,
1425  int64_t num_trees | require="num_trees > 0" | default=10,
1426  double obs_per_tree_fraction | require="obs_per_tree_fraction > 0.0" | require="obs_per_tree_fraction <= 1.0" | default=1.0,
1427  int64_t max_tree_depth | require="max_tree_depth >= 0" | default=0,
1428  int64_t features_per_node | require="features_per_node >= 0" | default=0,
1429  double impurity_threshold | require="impurity_threshold >= 0.0" | default=0.0,
1430  bool bootstrap | default=true,
1431  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node > 0" | default=5,
1432  int64_t min_obs_per_split_node | require="min_obs_per_split_node > 0" | default=2,
1433  double min_weight_fraction_in_leaf_node | require="min_weight_fraction_in_leaf_node >= 0.0" | default=0.0,
1434  double min_impurity_decrease_in_split_node | require="min_impurity_decrease_in_split_node >= 0.0" | default=0.0,
1435  int64_t max_leaf_nodes | require="max_leaf_nodes >=0" | default=0,
1436  bool use_histogram | default=false,
1437  TextEncodingNone var_importance_metric | default="MDI",
1438  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
1439  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
1440  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1441  TextEncodingNone model_metadata | default="DEFAULT") ->
1442  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
1443  */
1444 // clang-format on
1445 
1446 template <typename T>
1448  TableFunctionManager& mgr,
1449  const TextEncodingNone& model_name,
1450  const Column<T>& input_labels,
1451  const ColumnList<TextEncodingDict>& input_cat_features,
1452  const int64_t num_trees,
1453  const double obs_per_tree_fraction,
1454  const int64_t max_tree_depth,
1455  const int64_t features_per_node,
1456  const double impurity_threshold,
1457  const bool bootstrap,
1458  const int64_t min_obs_per_leaf_node,
1459  const int64_t min_obs_per_split_node,
1460  const double min_weight_fraction_in_leaf_node,
1461  const double min_impurity_decrease_in_split_node,
1462  const int64_t max_leaf_nodes,
1463  const bool use_histogram,
1464  const TextEncodingNone& var_importance_metric_str,
1465  const int32_t cat_top_k,
1466  const float cat_min_fraction,
1467  const TextEncodingNone& preferred_ml_framework_str,
1468  const TextEncodingNone& model_metadata,
1469  Column<TextEncodingDict>& output_model_name) {
1470  CategoricalFeaturesBuilder<T> cat_features_builder(
1471  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
1472  return random_forest_reg_fit_impl(mgr,
1473  model_name,
1474  input_labels,
1475  cat_features_builder.getFeatures(),
1476  cat_features_builder.getCatFeatureKeys(),
1477  num_trees,
1478  obs_per_tree_fraction,
1479  max_tree_depth,
1480  features_per_node,
1481  impurity_threshold,
1482  bootstrap,
1483  min_obs_per_leaf_node,
1484  min_obs_per_split_node,
1485  min_weight_fraction_in_leaf_node,
1486  min_impurity_decrease_in_split_node,
1487  max_leaf_nodes,
1488  use_histogram,
1489  var_importance_metric_str,
1490  preferred_ml_framework_str,
1491  model_metadata,
1492  output_model_name);
1493 }
1494 
1495 template <typename T>
1496 NEVER_INLINE HOST int32_t
1498  const TextEncodingNone& model_name,
1499  const ColumnList<T>& input_features,
1500  const std::vector<std::vector<std::string>>& cat_feature_keys,
1501  const TextEncodingNone& preferred_ml_framework_str,
1502  const TextEncodingNone& model_metadata,
1503  Column<TextEncodingDict>& output_model_name) {
1504  if (input_features.size() == 0) {
1505  return mgr.ERROR_MESSAGE(
1506  "No rows exist in training data. Training data must at least contain 1 row.");
1507  }
1508  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
1509  if (preferred_ml_framework == MLFramework::INVALID) {
1510  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
1511  preferred_ml_framework_str.getString());
1512  }
1513  try {
1514  const auto denulled_data = denull_data(input_features);
1515  const int64_t num_rows = denulled_data.masked_num_rows;
1516  if (num_rows == 0) {
1517  return mgr.ERROR_MESSAGE(
1518  "No non-null rows exist in training data. Training data must at least contain "
1519  "1 "
1520  "non-null row.");
1521  }
1522  const auto features_ptrs =
1523  pluck_ptrs(denulled_data.data, 0L, input_features.numCols());
1524  // z_std_normalize_data_with_summary_stats can throw if std dev is 0
1525  const auto z_std_norm_summary_stats =
1526  z_std_normalize_data_with_summary_stats(denulled_data.data, num_rows);
1527  const auto normalized_ptrs =
1528  pluck_ptrs(z_std_norm_summary_stats.normalized_data,
1529  0L,
1530  z_std_norm_summary_stats.normalized_data.size());
1531  bool did_execute = false;
1532 #ifdef HAVE_ONEDAL
1533  if (preferred_ml_framework == MLFramework::ONEAPI ||
1534  preferred_ml_framework == MLFramework::DEFAULT) {
1535  const auto [eigenvectors, eigenvalues] =
1536  onedal_oneapi_pca_impl(normalized_ptrs, denulled_data.masked_num_rows);
1537  auto model = std::make_shared<PcaModel>(z_std_norm_summary_stats.means,
1538  z_std_norm_summary_stats.std_devs,
1539  eigenvectors,
1540  eigenvalues,
1541  model_metadata,
1542  cat_feature_keys);
1543  g_ml_models.addModel(model_name, model);
1544  did_execute = true;
1545  } else if (preferred_ml_framework == MLFramework::ONEDAL) {
1546  const auto [eigenvectors, eigenvalues] =
1547  onedal_pca_impl(normalized_ptrs, denulled_data.masked_num_rows);
1548  auto model = std::make_shared<PcaModel>(z_std_norm_summary_stats.means,
1549  z_std_norm_summary_stats.std_devs,
1550  eigenvectors,
1551  eigenvalues,
1552  model_metadata,
1553  cat_feature_keys);
1554  g_ml_models.addModel(model_name, model);
1555  did_execute = true;
1556  }
1557 #endif
1558  if (!did_execute) {
1559  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
1560  " ML library to support PCA implementation.");
1561  }
1562  mgr.set_output_row_size(1);
1563  const TextEncodingDict model_name_str_id =
1564  output_model_name.getOrAddTransient(model_name);
1565  output_model_name[0] = model_name_str_id;
1566  return 1;
1567  } catch (std::runtime_error& e) {
1568  return mgr.ERROR_MESSAGE(e.what());
1569  }
1570 }
1571 
1572 // clang-format off
1573 /*
1574  UDTF: pca_fit__cpu_template(TableFunctionManager,
1575  TextEncodingNone model_name,
1576  Cursor<ColumnList<T> features> data,
1577  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1578  TextEncodingNone model_metadata | default="DEFAULT") ->
1579  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
1580  */
1581 // clang-format on
1582 
1583 template <typename T>
1584 NEVER_INLINE HOST int32_t
1586  const TextEncodingNone& model_name,
1587  const ColumnList<T>& input_features,
1588  const TextEncodingNone& preferred_ml_framework_str,
1589  const TextEncodingNone& model_metadata,
1590  Column<TextEncodingDict>& output_model_name) {
1591  std::vector<std::vector<std::string>> empty_cat_feature_keys;
1592  return pca_fit_impl(mgr,
1593  model_name,
1594  input_features,
1595  empty_cat_feature_keys,
1596  preferred_ml_framework_str,
1597  model_metadata,
1598  output_model_name);
1599 }
1600 
1601 // clang-format off
1602 /*
1603  UDTF: pca_fit__cpu_template(TableFunctionManager,
1604  TextEncodingNone model_name,
1605  Cursor<ColumnList<TextEncodingDict> cat_features, ColumnList<T> features> data,
1606  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
1607  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
1608  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1609  TextEncodingNone model_metadata | default="DEFAULT") ->
1610  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
1611  */
1612 // clang-format on
1613 
1614 template <typename T>
1615 NEVER_INLINE HOST int32_t
1617  const TextEncodingNone& model_name,
1618  const ColumnList<TextEncodingDict>& input_cat_features,
1619  const ColumnList<T>& input_numeric_features,
1620  const int32_t cat_top_k,
1621  const float cat_min_fraction,
1622  const TextEncodingNone& preferred_ml_framework_str,
1623  const TextEncodingNone& model_metadata,
1624  Column<TextEncodingDict>& output_model_name) {
1625  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
1626  input_numeric_features,
1627  cat_top_k,
1628  cat_min_fraction,
1629  false /* cat_include_others */);
1630  return pca_fit_impl(mgr,
1631  model_name,
1632  cat_features_builder.getFeatures(),
1633  cat_features_builder.getCatFeatureKeys(),
1634  preferred_ml_framework_str,
1635  model_metadata,
1636  output_model_name);
1637 }
1638 
1639 // clang-format off
1640 /*
1641  UDTF: pca_fit__cpu_1(TableFunctionManager,
1642  TextEncodingNone model_name,
1643  Cursor<ColumnList<TextEncodingDict> cat_features> data,
1644  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
1645  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
1646  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1647  TextEncodingNone model_metadata | default="DEFAULT") ->
1648  Column<TextEncodingDict> model_name | input_id=args<>
1649 */
1650 // clang-format on
1651 
1654  const TextEncodingNone& model_name,
1655  const ColumnList<TextEncodingDict>& input_cat_features,
1656  const int32_t cat_top_k,
1657  const float cat_min_fraction,
1658  const TextEncodingNone& preferred_ml_framework_str,
1659  const TextEncodingNone& model_metadata,
1660  Column<TextEncodingDict>& output_model_name);
1661 
// Shared prediction routine used by the ml_reg_predict__cpu_template overloads and by
// r2_score_impl in this file.
//
// Resolves the requested framework, runs the features through denull_data, sizes the
// output to input_ids.size(), dispatches on model->getModelType() to whichever
// framework implementation is compiled in (HAVE_ONEDAL / HAVE_MLPACK), copies the ids
// to output_ids and returns input_ids.size(). Failures are reported through
// mgr.ERROR_MESSAGE.
1662 template <typename T, typename K>
1663 NEVER_INLINE HOST int32_t
// NOTE(review): listing line 1664 (function name and first parameter) is absent from
// this listing; call sites in this file invoke ml_reg_predict_impl(mgr, model, ...).
1665  const std::shared_ptr<AbstractMLModel>& model,
1666  const Column<K>& input_ids,
1667  const ColumnList<T>& input_features,
1668  const TextEncodingNone& preferred_ml_framework_str,
1669  Column<K>& output_ids,
1670  Column<T>& output_predictions) {
1671  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
1672  if (preferred_ml_framework == MLFramework::INVALID) {
1673  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
1674  preferred_ml_framework_str.getString());
1675  }
// denull_data presumably removes rows that contain null feature values -- confirm.
// What the code shows: if fewer rows remain (masked_num_rows) than were supplied
// (unmasked_num_rows), predictions go to a temporary buffer of masked_num_rows
// elements; otherwise they are written straight into output_predictions.
1676  const auto denulled_data = denull_data(input_features);
1677  const int64_t num_rows = denulled_data.masked_num_rows;
1678  const bool data_is_masked =
1679  denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
1680  std::vector<T> denulled_output_allocation(data_is_masked ? num_rows : 0);
1681  mgr.set_output_row_size(input_ids.size());
1682  T* denulled_output =
1683  data_is_masked ? denulled_output_allocation.data() : output_predictions.ptr_;
1684  const auto features_ptrs = pluck_ptrs(denulled_data.data, 0L, input_features.numCols());
1685 
1686  try {
// did_execute stays false when no compiled-in framework matched the preference for
// the model type; that case is reported after the switch.
1687  bool did_execute = false;
1688  const auto model_type = model->getModelType();
1689  switch (model_type) {
1690  case MLModelType::LINEAR_REG: {
1691  const auto linear_reg_model =
1692  std::dynamic_pointer_cast<LinearRegressionModel>(model);
1693  CHECK(linear_reg_model);
1694 #ifdef HAVE_ONEDAL
1695  if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI ||
1696  preferred_ml_framework == MLFramework::DEFAULT)) {
1697  onedal_oneapi_linear_reg_predict_impl(
1698  linear_reg_model, features_ptrs, denulled_output, num_rows);
1699  did_execute = true;
1700  } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL)) {
1701  onedal_linear_reg_predict_impl(
1702  linear_reg_model, features_ptrs, denulled_output, num_rows);
1703  did_execute = true;
1704  }
1705 #endif
1706 #ifdef HAVE_MLPACK
1707  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
1708  preferred_ml_framework == MLFramework::DEFAULT)) {
1709  mlpack_linear_reg_predict_impl(
1710  linear_reg_model, features_ptrs, denulled_output, num_rows);
1711  did_execute = true;
1712  }
1713 #endif
1714  break;
1715  }
// NOTE(review): listing line 1716 is absent; presumably the case label for the
// decision-tree regression model type -- confirm against the original header.
1717 #ifdef HAVE_ONEDAL
1718  const auto decision_tree_reg_model =
1719  std::dynamic_pointer_cast<DecisionTreeRegressionModel>(model);
1720  CHECK(decision_tree_reg_model);
1721  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1722  preferred_ml_framework == MLFramework::DEFAULT)) {
1723  onedal_decision_tree_reg_predict_impl(
1724  decision_tree_reg_model, features_ptrs, denulled_output, num_rows);
1725  did_execute = true;
1726  }
1727 #endif
1728  break;
1729  }
1730  case MLModelType::GBT_REG: {
1731 #ifdef HAVE_ONEDAL
1732  const auto gbt_reg_model = std::dynamic_pointer_cast<GbtRegressionModel>(model);
1733  CHECK(gbt_reg_model);
1734  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1735  preferred_ml_framework == MLFramework::DEFAULT)) {
1736  onedal_gbt_reg_predict_impl(
1737  gbt_reg_model, features_ptrs, denulled_output, num_rows);
1738  did_execute = true;
1739  }
1740 #endif
1741  break;
1742  }
// NOTE(review): listing line 1743 is absent; presumably the case label for the
// random-forest regression model type -- confirm against the original header.
1744 #ifdef HAVE_ONEDAL
// The model is cast to both random-forest model classes; whichever cast succeeds
// selects the predict implementation below.
1745  const auto random_forest_reg_model =
1746  std::dynamic_pointer_cast<RandomForestRegressionModel>(model);
1747  const auto oneapi_random_forest_reg_model =
1748  std::dynamic_pointer_cast<OneAPIRandomForestRegressionModel>(model);
1749  CHECK(random_forest_reg_model || oneapi_random_forest_reg_model);
1750  if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI ||
1751  preferred_ml_framework == MLFramework::ONEDAL ||
1752  preferred_ml_framework == MLFramework::DEFAULT)) {
1753  if (random_forest_reg_model) {
1754  onedal_random_forest_reg_predict_impl(
1755  random_forest_reg_model, features_ptrs, denulled_output, num_rows);
1756  } else {
1757  onedal_oneapi_random_forest_reg_predict_impl(
1758  oneapi_random_forest_reg_model, features_ptrs, denulled_output, num_rows);
1759  }
1760  did_execute = true;
1761  }
1762 #endif
1763  break;
1764  }
1765  default: {
1766  throw std::runtime_error("Unsupported model type");
1767  }
1768  }
1769  if (!did_execute) {
1770  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
1771  " ML library to support model implementation.");
1772  }
1773  } catch (std::runtime_error& e) {
1774  const std::string error_str(e.what());
1775  return mgr.ERROR_MESSAGE(error_str);
1776  }
1777  output_ids = input_ids;
// When rows were dropped, unmask_data writes the compact predictions back into the
// full-length output using reverse_index_map, with inline_null_value<T>() passed as
// the null value.
1778  if (data_is_masked) {
1779  unmask_data(denulled_output,
1780  denulled_data.reverse_index_map,
1781  output_predictions.ptr_,
1782  denulled_data.unmasked_num_rows,
1783  inline_null_value<T>());
1784  }
1785  return input_ids.size();
1786 }
1787 
1788 // clang-format off
1789 /*
1790  UDTF: ml_reg_predict__cpu_template(TableFunctionManager,
1791  TextEncodingNone model_name,
1792  Cursor<Column<K> id, ColumnList<T> features> data,
1793  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
1794  Column<K> id | input_id=args<0>, Column<T> prediction,
1795  K=[int32_t, int64_t, TextEncodingDict], T=[double]
1796  */
1797 // clang-format on
1798 
1799 template <typename T, typename K>
1800 NEVER_INLINE HOST int32_t
1802  const TextEncodingNone& model_name,
1803  const Column<K>& input_ids,
1804  const ColumnList<T>& input_features,
1805  const TextEncodingNone& preferred_ml_framework_str,
1806  Column<K>& output_ids,
1807  Column<T>& output_predictions) {
1808  try {
1809  const auto model = g_ml_models.getModel(model_name);
1810  check_model_params(model, 0, input_features.numCols());
1811  return ml_reg_predict_impl(mgr,
1812  model,
1813  input_ids,
1814  input_features,
1815  preferred_ml_framework_str,
1816  output_ids,
1817  output_predictions);
1818  } catch (std::runtime_error& e) {
1819  const std::string error_str(e.what());
1820  return mgr.ERROR_MESSAGE(error_str);
1821  }
1822 }
1823 
1824 // clang-format off
1825 /*
1826  UDTF: ml_reg_predict__cpu_template(TableFunctionManager,
1827  TextEncodingNone model_name,
1828  Cursor<Column<K> id, ColumnList<TextEncodingDict> cat_features, ColumnList<T> features> data,
1829  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
1830  Column<K> id | input_id=args<0>, Column<T> prediction,
1831  K=[int32_t, int64_t, TextEncodingDict], T=[double]
1832  */
1833 // clang-format on
1834 
1835 template <typename T, typename K>
1836 NEVER_INLINE HOST int32_t
1838  const TextEncodingNone& model_name,
1839  const Column<K>& input_ids,
1840  const ColumnList<TextEncodingDict>& input_cat_features,
1841  const ColumnList<T>& input_numeric_features,
1842  const TextEncodingNone& preferred_ml_framework_str,
1843  Column<K>& output_ids,
1844  Column<T>& output_predictions) {
1845  try {
1846  const auto model = g_ml_models.getModel(model_name);
1848  model, input_cat_features.numCols(), input_numeric_features.numCols());
1849  CategoricalFeaturesBuilder<T> cat_features_builder(
1850  input_cat_features, input_numeric_features, model->getCatFeatureKeys());
1851  return ml_reg_predict_impl(mgr,
1852  model,
1853  input_ids,
1854  cat_features_builder.getFeatures(),
1855  preferred_ml_framework_str,
1856  output_ids,
1857  output_predictions);
1858  } catch (std::runtime_error& e) {
1859  const std::string error_str(e.what());
1860  return mgr.ERROR_MESSAGE(error_str);
1861  }
1862 }
1863 
1864 // clang-format off
1865 /*
1866  UDTF: ml_reg_predict__cpu_template(TableFunctionManager,
1867  TextEncodingNone model_name,
1868  Cursor<Column<K> id, ColumnList<TextEncodingDict> cat_features> data,
1869  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
1870  Column<K> id | input_id=args<0>, Column<T> prediction,
1871  K=[int32_t, int64_t, TextEncodingDict], T=[double]
1872  */
1873 // clang-format on
1874 
1875 template <typename T, typename K>
1876 NEVER_INLINE HOST int32_t
1878  const TextEncodingNone& model_name,
1879  const Column<K>& input_ids,
1880  const ColumnList<TextEncodingDict>& input_cat_features,
1881  const TextEncodingNone& preferred_ml_framework_str,
1882  Column<K>& output_ids,
1883  Column<T>& output_predictions) {
1884  try {
1885  const auto model = g_ml_models.getModel(model_name);
1886  check_model_params(model, input_cat_features.numCols(), 0);
1887  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
1888  model->getCatFeatureKeys());
1889  return ml_reg_predict_impl(mgr,
1890  model,
1891  input_ids,
1892  cat_features_builder.getFeatures(),
1893  preferred_ml_framework_str,
1894  output_ids,
1895  output_predictions);
1896  } catch (std::runtime_error& e) {
1897  const std::string error_str(e.what());
1898  return mgr.ERROR_MESSAGE(error_str);
1899  }
1900 }
1901 
1902 // clang-format off
1903 /*
1904  UDTF: ml_reg_predict__cpu_template(TableFunctionManager,
1905  Cursor<Column<TextEncodingDict> name> model_name,
1906  Cursor<Column<K> id, ColumnList<T> features> data,
1907  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
1908  Column<K> id | input_id=args<0>, Column<T> prediction,
1909  K=[int64_t, TextEncodingDict], T=[double]
1910  */
1911 // clang-format on
1912 
1913 template <typename T, typename K>
1914 NEVER_INLINE HOST int32_t
1916  const Column<TextEncodingDict>& model_name,
1917  const Column<K>& input_ids,
1918  const ColumnList<T>& input_features,
1919  const TextEncodingNone& preferred_ml_framework_str,
1920  Column<K>& output_ids,
1921  Column<T>& output_predictions) {
1922  if (model_name.size() != 1) {
1923  return mgr.ERROR_MESSAGE("Expected only one row in model CURSOR.");
1924  }
1925  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
1926  return ml_reg_predict__cpu_template(mgr,
1927  model_name_text_enc_none,
1928  input_ids,
1929  input_features,
1930  preferred_ml_framework_str,
1931  output_ids,
1932  output_predictions);
1933 }
1934 
1935 // clang-format off
1936 /*
1937  UDTF: ml_reg_predict__cpu_template(TableFunctionManager,
1938  Cursor<Column<TextEncodingDict> name> model_name,
1939  Cursor<Column<K> id, ColumnList<TextEncodingDict> cat_features, ColumnList<T> features> data,
1940  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
1941  Column<K> id | input_id=args<0>, Column<T> prediction,
1942  K=[int32_t, int64_t, TextEncodingDict], T=[double]
1943  */
1944 // clang-format on
1945 
1946 template <typename T, typename K>
1947 NEVER_INLINE HOST int32_t
1949  const Column<TextEncodingDict>& model_name,
1950  const Column<K>& input_ids,
1951  const ColumnList<TextEncodingDict>& input_cat_features,
1952  const ColumnList<T>& input_numeric_features,
1953  const TextEncodingNone& preferred_ml_framework_str,
1954  Column<K>& output_ids,
1955  Column<T>& output_predictions) {
1956  if (model_name.size() != 1) {
1957  return mgr.ERROR_MESSAGE("Expected only one row in model CURSOR.");
1958  }
1959  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
1960  return ml_reg_predict__cpu_template(mgr,
1961  model_name_text_enc_none,
1962  input_ids,
1963  input_cat_features,
1964  input_numeric_features,
1965  preferred_ml_framework_str,
1966  output_ids,
1967  output_predictions);
1968 }
1969 
1970 // clang-format off
1971 /*
1972  UDTF: ml_reg_predict__cpu_template(TableFunctionManager,
1973  Cursor<Column<TextEncodingDict> name> model_name,
1974  Cursor<Column<K> id, ColumnList<TextEncodingDict> cat_features> data,
1975  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
1976  Column<K> id | input_id=args<0>, Column<T> prediction,
1977  K=[int32_t, int64_t, TextEncodingDict], T=[double]
1978  */
1979 // clang-format on
1980 
1981 template <typename T, typename K>
1982 NEVER_INLINE HOST int32_t
1984  const Column<TextEncodingDict>& model_name,
1985  const Column<K>& input_ids,
1986  const ColumnList<TextEncodingDict>& input_cat_features,
1987  const TextEncodingNone& preferred_ml_framework_str,
1988  Column<K>& output_ids,
1989  Column<T>& output_predictions) {
1990  if (model_name.size() != 1) {
1991  return mgr.ERROR_MESSAGE("Expected only one row in model CURSOR.");
1992  }
1993  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
1994  return ml_reg_predict__cpu_template(mgr,
1995  model_name_text_enc_none,
1996  input_ids,
1997  input_cat_features,
1998  preferred_ml_framework_str,
1999  output_ids,
2000  output_predictions);
2001 }
2002 
// Computes an R^2 score for a model against labelled evaluation data.
//
// Runs ml_reg_predict_impl over input_features into a local prediction buffer, then
// accumulates (label - prediction)^2 and (label - mean(label))^2 over the rows whose
// prediction is not inline_null_value<T>(). Writes 1 - ratio to output_r2[0] (1.0 when
// the second sum is zero) and returns 1. Returns an error message when there are no
// rows or when prediction fails.
2003 template <typename T>
// NOTE(review): listing line 2004 (return type, function name and first parameter) is
// absent from this listing; call sites in this file invoke
// r2_score_impl(mgr, model, input_labels, features, output_r2).
2005  const std::shared_ptr<AbstractMLModel>& model,
2006  const Column<T>& input_labels,
2007  const ColumnList<T>& input_features,
2008  Column<double>& output_r2) {
2009  const int64_t num_rows = input_labels.size();
2010  if (num_rows == 0) {
2011  return mgr.ERROR_MESSAGE(
2012  "No rows exist in evaluation data. Evaluation data must at least contain 1 row.");
2013  }
// Local vectors back the Column wrappers handed to ml_reg_predict_impl; the id
// columns are value-initialized placeholders required by its signature.
2014  std::vector<T> output_predictions_vec(num_rows);
2015  Column<T> output_predictions(output_predictions_vec);
2016  std::vector<int64_t> input_ids_vec(num_rows);
2017  std::vector<int64_t> output_ids_vec(num_rows);
2018  Column<int64_t> input_ids(input_ids_vec);
2019  Column<int64_t> output_ids(output_ids_vec);
// NOTE(review): listing line 2020 is absent from this listing -- confirm against the
// original header.
2021  TextEncodingNone ml_framework_encoding_none("DEFAULT");
2022 
2023  try {
2024  auto ret = ml_reg_predict_impl(mgr,
2025  model,
2026  input_ids,
2027  input_features,
2028  ml_framework_encoding_none,
2029  output_ids,
2030  output_predictions);
2031 
2032  if (ret < 0) {
2033  // A return of less than 0 symbolizes an error
2034  return ret;
2035  }
2036  } catch (std::runtime_error& e) {
// NOTE(review): listing line 2037 is absent from this listing -- confirm.
2038  return mgr.ERROR_MESSAGE(e.what());
2039  }
2040 
// NOTE(review): listing line 2041 is absent from this listing -- confirm.
2042  mgr.set_output_row_size(1);
2043 
2044  const auto labels_mean = get_column_mean(input_labels);
// Thread count: one thread per 20000 rows (rounded up), capped by the hardware
// thread count.
// NOTE(review): std::thread::hardware_concurrency() is allowed to return 0; in that
// case num_threads would be 0 and the per-thread vectors below would be empty --
// confirm this cannot happen here or clamp to at least 1.
2045  const size_t max_thread_count = std::thread::hardware_concurrency();
2046  const size_t max_inputs_per_thread = 20000;
2047  const size_t num_threads = std::min(
2048  max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
2049 
// One partial-sum slot per arena thread; slots are combined after the parallel loop.
2050  std::vector<double> local_sum_squared_regressions(num_threads, 0.0);
2051  std::vector<double> local_sum_squares(num_threads, 0.0);
2052 
2053  tbb::task_arena limited_arena(num_threads);
2054 
2055  limited_arena.execute([&] {
// NOTE(review): listing line 2056 is absent; the range and lambda below are the
// arguments of a parallel loop call, presumably tbb::parallel_for -- confirm.
2057  tbb::blocked_range<int64_t>(0, num_rows),
2058  [&](const tbb::blocked_range<int64_t>& r) {
2059  const int64_t start_idx = r.begin();
2060  const int64_t end_idx = r.end();
2061  double local_sum_squared_regression{0.0};
2062  double local_sum_square{0.0};
2063  for (int64_t row_idx = start_idx; row_idx < end_idx; ++row_idx) {
// Rows whose prediction is the null sentinel contribute to neither sum.
2064  if (output_predictions[row_idx] != inline_null_value<T>()) {
2065  local_sum_squared_regression +=
2066  (input_labels[row_idx] - output_predictions[row_idx]) *
2067  (input_labels[row_idx] - output_predictions[row_idx]);
2068  local_sum_square += (input_labels[row_idx] - labels_mean) *
2069  (input_labels[row_idx] - labels_mean);
2070  }
2071  }
// The chunk's partial sums are added to the slot of the executing arena thread.
2072  const size_t thread_idx = tbb::this_task_arena::current_thread_index();
2073  local_sum_squared_regressions[thread_idx] += local_sum_squared_regression;
2074  local_sum_squares[thread_idx] += local_sum_square;
2075  });
2076  });
2077  double sum_squared_regression{0.0};
2078  double sum_squares{0.0};
2079  for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
2080  sum_squared_regression += local_sum_squared_regressions[thread_idx];
2081  sum_squares += local_sum_squares[thread_idx];
2082  }
// A zero total sum of squares yields 1.0 rather than dividing by zero.
2083  output_r2[0] = sum_squares == 0.0 ? 1.0 : 1.0 - (sum_squared_regression / sum_squares);
2084  return 1;
2085 }
2086 
2087 // clang-format off
2088 /*
2089  UDTF: r2_score__cpu_template(TableFunctionManager,
2090  TextEncodingNone model_name,
2091  Cursor<Column<T> labels, ColumnList<T> features> data) ->
2092  Column<double> r2, T=[double]
2093  */
2094 // clang-format on
2095 
2096 template <typename T>
2098  const TextEncodingNone& model_name,
2099  const Column<T>& input_labels,
2100  const ColumnList<T>& input_features,
2101  Column<double>& output_r2) {
2102  try {
2103  const auto model = g_ml_models.getModel(model_name);
2104  check_model_params(model, 0, input_features.numCols());
2105  return r2_score_impl(mgr, model, input_labels, input_features, output_r2);
2106  } catch (std::runtime_error& e) {
2107  const std::string error_str(e.what());
2108  return mgr.ERROR_MESSAGE(error_str);
2109  }
2110 }
2111 
2112 // clang-format off
2113 /*
2114  UDTF: r2_score__cpu_template(TableFunctionManager,
2115  Cursor<Column<TextEncodingDict> name> model_name,
2116  Cursor<Column<T> labels, ColumnList<T> features> data) ->
2117  Column<double> r2, T=[double]
2118  */
2119 // clang-format on
2120 
2121 template <typename T>
2122 NEVER_INLINE HOST int32_t
2124  const Column<TextEncodingDict>& model_name,
2125  const Column<T>& input_labels,
2126  const ColumnList<T>& input_features,
2127  Column<double>& output_r2) {
2128  if (model_name.size() != 1) {
2129  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
2130  }
2131  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
2132  return r2_score__cpu_template(
2133  mgr, model_name_text_enc_none, input_labels, input_features, output_r2);
2134 }
2135 
2136 // clang-format off
2137 /*
2138  UDTF: r2_score__cpu_template(TableFunctionManager,
2139  TextEncodingNone model_name,
2140  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features, ColumnList<T> numeric_features> data) -> Column<double> r2, T=[double]
2141  */
2142 // clang-format on
2143 
2144 template <typename T>
2145 NEVER_INLINE HOST int32_t
2147  const TextEncodingNone& model_name,
2148  const Column<T>& input_labels,
2149  const ColumnList<TextEncodingDict>& input_cat_features,
2150  const ColumnList<T>& input_numeric_features,
2151  Column<double>& output_r2) {
2152  try {
2153  const auto model = g_ml_models.getModel(model_name);
2155  model, input_cat_features.numCols(), input_numeric_features.numCols());
2156  CategoricalFeaturesBuilder<T> cat_features_builder(
2157  input_cat_features, input_numeric_features, model->getCatFeatureKeys());
2158  return r2_score_impl(
2159  mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2160  } catch (std::runtime_error& e) {
2161  const std::string error_str(e.what());
2162  return mgr.ERROR_MESSAGE(error_str);
2163  }
2164 }
2165 
2166 // clang-format off
2167 /*
2168  UDTF: r2_score__cpu_template(TableFunctionManager,
2169  TextEncodingNone model_name,
2170  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features> data) -> Column<double> r2, T=[double]
2171  */
2172 // clang-format on
2173 
2174 template <typename T>
2175 NEVER_INLINE HOST int32_t
2177  const TextEncodingNone& model_name,
2178  const Column<T>& input_labels,
2179  const ColumnList<TextEncodingDict>& input_cat_features,
2180  Column<double>& output_r2) {
2181  try {
2182  const auto model = g_ml_models.getModel(model_name);
2183  check_model_params(model, input_cat_features.numCols(), 0);
2184  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
2185  model->getCatFeatureKeys());
2186  return r2_score_impl(
2187  mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2188  } catch (std::runtime_error& e) {
2189  const std::string error_str(e.what());
2190  return mgr.ERROR_MESSAGE(error_str);
2191  }
2192 }
2193 
2194 // clang-format off
2195 /*
2196  UDTF: r2_score__cpu_template(TableFunctionManager,
2197  Cursor<Column<TextEncodingDict> name> model_name,
2198  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features, ColumnList<T> numeric_features> data) -> Column<double> r2, T=[double]
2199  */
2200 // clang-format on
2201 
2202 template <typename T>
2203 NEVER_INLINE HOST int32_t
2205  const Column<TextEncodingDict>& model_name,
2206  const Column<T>& input_labels,
2207  const ColumnList<TextEncodingDict>& input_cat_features,
2208  const ColumnList<T>& input_numeric_features,
2209  Column<double>& output_r2) {
2210  if (model_name.size() != 1) {
2211  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
2212  }
2213  const std::string model_name_str{model_name.getString(0)};
2214  try {
2215  const auto model = g_ml_models.getModel(model_name_str);
2217  model, input_cat_features.numCols(), input_numeric_features.numCols());
2218  CategoricalFeaturesBuilder<T> cat_features_builder(
2219  input_cat_features, input_numeric_features, model->getCatFeatureKeys());
2220  return r2_score_impl(
2221  mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2222  } catch (std::runtime_error& e) {
2223  const std::string error_str(e.what());
2224  return mgr.ERROR_MESSAGE(error_str);
2225  }
2226 }
2227 
2228 // clang-format off
2229 /*
2230  UDTF: random_forest_reg_var_importance__cpu_1(TableFunctionManager,
2231  TextEncodingNone model_name) ->
2232  Column<int64_t> feature_id, Column<TextEncodingDict> feature | input_id=args<>,
2233  Column<int64_t> sub_feature_id, Column<TextEncodingDict> sub_feature | input_id=args<>, Column<double> importance_score
2234  */
2235 // clang-format on
2236 
2239  const TextEncodingNone& model_name,
2240  Column<int64_t>& feature_id,
2241  Column<TextEncodingDict>& feature,
2242  Column<int64_t>& sub_feature_id,
2243  Column<TextEncodingDict>& sub_feature,
2244  Column<double>& importance_score);
2245 
2246 // clang-format off
2247 /*
2248  UDTF: random_forest_reg_var_importance__cpu_2(TableFunctionManager,
2249  Cursor<Column<TextEncodingDict> name> model_name) ->
2250  Column<int64_t> feature_id, Column<TextEncodingDict> feature | input_id=args<>,
2251  Column<int64_t> sub_feature_id, Column<TextEncodingDict> sub_feature | input_id=args<>, Column<double> importance_score
2252  */
2253 // clang-format on
2254 
2257  const Column<TextEncodingDict>& model_name,
2258  Column<int64_t>& feature_id,
2259  Column<TextEncodingDict>& feature,
2260  Column<int64_t>& sub_feature_id,
2261  Column<TextEncodingDict>& sub_feature,
2262  Column<double>& importance_score);
2263 
2264 // clang-format off
2265 /*
2266  UDTF: get_decision_trees__cpu_1(TableFunctionManager,
2267  TextEncodingNone model_name) ->
2268  Column<int64_t> tree_id,
2269  Column<int64_t> entry_id,
2270  Column<bool> is_split_node,
2271  Column<int64_t> feature_id,
2272  Column<int64_t> left_child,
2273  Column<int64_t> right_child,
2274  Column<double> value
2275  */
2276 // clang-format on
2277 
2280  const TextEncodingNone& model_name,
2281  Column<int64_t>& tree_id,
2282  Column<int64_t>& entry_id,
2283  Column<bool>& is_split_node,
2284  Column<int64_t>& feature_id,
2285  Column<int64_t>& left_child,
2286  Column<int64_t>& right_child,
2287  Column<double>& value);
2288 
2289 // clang-format off
2290 /*
2291  UDTF: get_decision_trees__cpu_2(TableFunctionManager,
2292  Cursor<Column<TextEncodingDict> name> model_name) ->
2293  Column<int64_t> tree_id,
2294  Column<int64_t> entry_id,
2295  Column<bool> is_split_node,
2296  Column<int64_t> feature_id,
2297  Column<int64_t> left_child,
2298  Column<int64_t> right_child,
2299  Column<double> value
2300  */
2301 // clang-format on
2302 
2305  const Column<TextEncodingDict>& model_name,
2306  Column<int64_t>& tree_id,
2307  Column<int64_t>& entry_id,
2308  Column<bool>& is_split_node,
2309  Column<int64_t>& feature_id,
2310  Column<int64_t>& left_child,
2311  Column<int64_t>& right_child,
2312  Column<double>& value);
2313 
2314 #endif // #ifndef __CUDACC__
DEVICE const std::string getString(int64_t index) const
NEVER_INLINE HOST int32_t pca_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const ColumnList< T > &numeric_features, const int32_t cat_top_k, const float cat_min_fraction, const bool cat_include_others)
KMeansInitStrategy get_kmeans_init_type(const std::string &init_type_str)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const int32_t cat_top_k, const float cat_min_fraction, const bool cat_include_others)
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
NEVER_INLINE HOST int32_t r2_score_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
std::string getString() const
Definition: heavydbTypes.h:641
NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
DEVICE int64_t size() const
NEVER_INLINE HOST int32_t decision_tree_reg_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
DEVICE int64_t numCols() const
EXTENSION_NOINLINE_HOST int32_t pca_fit__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< TextEncodingDict > &input_cat_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
EXTENSION_NOINLINE_HOST int32_t supported_ml_frameworks__cpu_(TableFunctionManager &mgr, Column< TextEncodingDict > &output_ml_frameworks, Column< bool > &output_availability, Column< bool > &output_default)
#define CHECK_GE(x, y)
Definition: Logger.h:306
MaskedData< T > denull_data(const ColumnList< T > &features)
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t kmeans__cpu_template(TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const int num_clusters, const int num_iterations, const TextEncodingNone &init_type_str, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
NEVER_INLINE HOST int32_t pca_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
#define CHECK_GT(x, y)
Definition: Logger.h:305
NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
MLFramework get_ml_framework(const std::string &ml_framework_str)
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
#define HOST
const size_t max_inputs_per_thread
NEVER_INLINE HOST int32_t random_forest_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const ColumnList< T > &numeric_features, const std::vector< std::vector< std::string >> &cat_feature_keys)
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
void addModel(const std::string &model_name, std::shared_ptr< AbstractMLModel > model)
Definition: MLModel.h:38
VarImportanceMetric get_var_importance_metric(const std::string &var_importance_metric_str)
NEVER_INLINE HOST int32_t gbt_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
#define EXTENSION_NOINLINE_HOST
Definition: heavydbTypes.h:55
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:51
void disable_output_allocations()
Definition: heavydbTypes.h:379
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
DEVICE int64_t numCols() const
NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
int8_t ** ptrs_
MLModelMap g_ml_models
Definition: MLModel.h:125
std::vector< int8_t * > col_ptrs_
#define CHECK_LE(x, y)
Definition: Logger.h:304
std::vector< TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodedCol< T > > one_hot_encoded_cols_
void unmask_data(const T *masked_input, const std::vector< int32_t > &reverse_index_map, T *unmasked_output, const int64_t num_unmasked_rows, const T null_val)
NEVER_INLINE HOST int32_t linear_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t dbscan__cpu_template(TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const double epsilon, const int32_t min_observations, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const std::vector< std::vector< std::string >> &cat_feature_keys)
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
#define NEVER_INLINE
std::vector< std::vector< T > > z_std_normalize_data(const std::vector< T * > &input_data, const int64_t num_rows)
#define CHECK(condition)
Definition: Logger.h:291
NEVER_INLINE HOST int32_t ml_reg_predict_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
std::vector< std::vector< std::string > > cat_feature_keys_
DEVICE int64_t size() const
DEVICE int64_t size() const
NEVER_INLINE HOST int32_t r2_score__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
ZStdNormalizationSummaryStats< T > z_std_normalize_data_with_summary_stats(const std::vector< T * > &input_data, const int64_t num_rows)
void enable_output_allocations()
Definition: heavydbTypes.h:381
Column< T > create_wrapper_col(std::vector< T > &col_vec)
NEVER_INLINE HOST double get_column_mean(const T *data, const int64_t num_rows)
const std::vector< std::vector< std::string > > & getCatFeatureKeys() const