OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MLTableFunctions.hpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #ifndef __CUDACC__
20 
24 
27 
28 #ifdef HAVE_ONEDAL
30 #endif
31 
32 #ifdef HAVE_MLPACK
34 #endif
35 
36 #include <tbb/parallel_for.h>
37 #include <tbb/task_arena.h>
38 
39 using namespace TableFunctions_Namespace;
40 
41 template <typename T>
42 std::vector<const T*> pluck_ptrs(const std::vector<std::vector<T>>& data,
43  const int64_t start_idx,
44  const int64_t end_idx) {
45  std::vector<const T*> raw_ptrs;
46  CHECK_GE(start_idx, 0L);
47  CHECK_GT(end_idx, start_idx);
48  CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
49  for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
50  raw_ptrs.emplace_back(data[col_idx].data());
51  }
52  return raw_ptrs;
53 }
54 
55 template <typename T>
56 std::vector<const T*> pluck_ptrs(const std::vector<T*>& data,
57  const int64_t start_idx,
58  const int64_t end_idx) {
59  std::vector<const T*> raw_ptrs;
60  CHECK_GE(start_idx, 0L);
61  CHECK_GT(end_idx, start_idx);
62  CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
63  for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
64  raw_ptrs.emplace_back(data[col_idx]);
65  }
66  return raw_ptrs;
67 }
68 
69 // clang-format off
70 /*
71  UDTF: supported_ml_frameworks__cpu_(TableFunctionManager) ->
72  Column<TextEncodingDict> ml_framework | input_id=args<>, Column<bool> is_available, Column<bool> is_default
73 */
74 // clang-format on
75 
78  Column<TextEncodingDict>& output_ml_frameworks,
79  Column<bool>& output_availability,
80  Column<bool>& output_default);
// Declaration only; the definition is not part of this header listing.
// Takes a shared handle to a model plus the number of categorical and numeric
// features.
// NOTE(review): presumably validates the feature counts against the model and
// reports a mismatch -- confirm against the definition.
82 void check_model_params(const std::shared_ptr<AbstractMLModel>& model,
83  const int64_t num_cat_features,
84  const int64_t num_numeric_features);
85 
86 // clang-format off
87 /*
88  UDTF: kmeans__cpu_template(TableFunctionManager,
89  Cursor<Column<K> input_ids, ColumnList<T> input_features> data,
90  int32_t num_clusters | require="num_clusters > 0" | require="num_clusters <= input_ids.size()",
91  int32_t num_iterations | require="num_iterations > 0" | default=10,
92  TextEncodingNone init_type | default="DEFAULT",
93  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
94  Column<K> id | input_id=args<0>,
95  Column<int32_t> cluster_id,
96  K=[int64_t, TextEncodingDict], T=[double]
97 */
98 // clang-format on
99 
100 template <typename K, typename T>
101 NEVER_INLINE HOST int32_t
103  const Column<K>& input_ids,
104  const ColumnList<T>& input_features,
105  const int num_clusters,
106  const int num_iterations,
107  const TextEncodingNone& init_type_str,
108  const TextEncodingNone& preferred_ml_framework_str,
109  Column<K>& output_ids,
110  Column<int32_t>& output_clusters) {
111  mgr.set_output_row_size(input_ids.size());
112  output_ids = input_ids;
113  const auto kmeans_init_strategy = get_kmeans_init_type(init_type_str);
114  if (kmeans_init_strategy == KMeansInitStrategy::INVALID) {
115  return mgr.ERROR_MESSAGE("Invalid KMeans initializaiton strategy: " +
116  init_type_str.getString());
117  }
118 
119  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
120  if (preferred_ml_framework == MLFramework::INVALID) {
121  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
122  preferred_ml_framework_str.getString());
123  }
124 
125  try {
126  const auto denulled_data = denull_data(input_features);
127  const int64_t num_rows = denulled_data.masked_num_rows;
128  const bool data_is_masked =
129  denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
130  std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
131  int32_t* denulled_output =
132  data_is_masked ? denulled_output_allocation.data() : output_clusters.ptr_;
133 
134  // z_std_normalize_data can throw if std dev is 0
135  const auto normalized_data = z_std_normalize_data(denulled_data.data, num_rows);
136  const auto normalized_ptrs = pluck_ptrs(normalized_data, 0L, normalized_data.size());
137 
138  bool did_execute = false;
139 #ifdef HAVE_ONEDAL
140  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
141  preferred_ml_framework == MLFramework::DEFAULT)) {
142  onedal_kmeans_impl(normalized_ptrs,
143  denulled_output,
144  num_rows,
145  num_clusters,
146  num_iterations,
147  kmeans_init_strategy);
148  did_execute = true;
149  }
150 #endif
151 #ifdef HAVE_MLPACK
152  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
153  preferred_ml_framework == MLFramework::DEFAULT)) {
154  mlpack_kmeans_impl(normalized_ptrs,
155  denulled_output,
156  num_rows,
157  num_clusters,
158  num_iterations,
159  kmeans_init_strategy);
160  did_execute = true;
161  }
162 #endif
163  if (!did_execute) {
164  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
165  " ML library to support kmeans implementation.");
166  }
167 
168  if (data_is_masked) {
169  unmask_data(denulled_output,
170  denulled_data.reverse_index_map,
171  output_clusters.ptr_,
172  denulled_data.unmasked_num_rows,
173  inline_null_value<int32_t>());
174  }
175  } catch (std::runtime_error& e) {
176  return mgr.ERROR_MESSAGE(e.what());
177  }
178  return input_ids.size();
179 }
180 
181 // clang-format off
182 /*
183  UDTF: dbscan__cpu_template(TableFunctionManager,
184  Cursor<Column<K> input_ids, ColumnList<T> input_features> data,
185  double epsilon | require="epsilon > 0.0",
186  int32_t min_observations | require="min_observations > 0",
187  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
188  Column<K> id | input_id=args<0>, Column<int32_t> cluster_id,
189  K=[int64_t, TextEncodingDict], T=[double]
190  */
191 // clang-format on
192 
193 template <typename K, typename T>
194 NEVER_INLINE HOST int32_t
196  const Column<K>& input_ids,
197  const ColumnList<T>& input_features,
198  const double epsilon,
199  const int32_t min_observations,
200  const TextEncodingNone& preferred_ml_framework_str,
201  Column<K>& output_ids,
202  Column<int32_t>& output_clusters) {
203  mgr.set_output_row_size(input_ids.size());
204  output_ids = input_ids;
205 
206  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
207  if (preferred_ml_framework == MLFramework::INVALID) {
208  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
209  preferred_ml_framework_str.getString());
210  }
211 
212  try {
213  const auto denulled_data = denull_data(input_features);
214  const int64_t num_rows = denulled_data.masked_num_rows;
215  const bool data_is_masked =
216  denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
217  std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
218  int32_t* denulled_output =
219  data_is_masked ? denulled_output_allocation.data() : output_clusters.ptr_;
220 
221  // z_std_normalize_data can throw if std dev is 0
222  const auto normalized_data = z_std_normalize_data(denulled_data.data, num_rows);
223  const auto normalized_ptrs = pluck_ptrs(normalized_data, 0L, normalized_data.size());
224 
225  bool did_execute = false;
226 #ifdef HAVE_ONEDAL
227  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
228  preferred_ml_framework == MLFramework::DEFAULT)) {
229  onedal_dbscan_impl(
230  normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
231  did_execute = true;
232  }
233 #endif
234 #ifdef HAVE_MLPACK
235  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
236  preferred_ml_framework == MLFramework::DEFAULT)) {
237  mlpack_dbscan_impl(
238  normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
239  did_execute = true;
240  }
241 #endif
242  if (!did_execute) {
243  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
244  " ML library to support dbscan implementation.");
245  }
246 
247  if (data_is_masked) {
248  unmask_data(denulled_output,
249  denulled_data.reverse_index_map,
250  output_clusters.ptr_,
251  denulled_data.unmasked_num_rows,
252  inline_null_value<int32_t>());
253  }
254  } catch (std::runtime_error& e) {
255  return mgr.ERROR_MESSAGE(e.what());
256  }
257  return input_ids.size();
258 }
259 
260 template <typename T>
261 NEVER_INLINE HOST int32_t
263  const TextEncodingNone& model_name,
264  const Column<T>& input_labels,
265  const ColumnList<T>& input_features,
266  const std::vector<std::vector<std::string>>& cat_feature_keys,
267  const TextEncodingNone& preferred_ml_framework_str,
268  const TextEncodingNone& model_metadata,
269  Column<TextEncodingDict>& output_model_name) {
270  if (input_labels.size() == 0) {
271  return mgr.ERROR_MESSAGE(
272  "No rows exist in training data. Training data must at least contain 1 row.");
273  }
274  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
275  if (preferred_ml_framework == MLFramework::INVALID) {
276  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
277  preferred_ml_framework_str.getString());
278  }
279  const auto denulled_data = denull_data(input_labels, input_features);
280  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
281  const auto features_ptrs =
282  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
283  const int64_t num_coefs = input_features.numCols() + 1;
284  mgr.set_output_row_size(num_coefs);
285  std::vector<int64_t> coef_idxs(num_coefs);
286  std::vector<double> coefs(num_coefs);
287  try {
288  bool did_execute = false;
289 #ifdef HAVE_ONEDAL
290  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
291  preferred_ml_framework == MLFramework::DEFAULT)) {
292  onedal_linear_reg_fit_impl(labels_ptrs[0],
293  features_ptrs,
294  coef_idxs.data(),
295  coefs.data(),
296  denulled_data.masked_num_rows);
297  did_execute = true;
298  }
299 #endif
300 #ifdef HAVE_MLPACK
301  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
302  preferred_ml_framework == MLFramework::DEFAULT)) {
303  mlpack_linear_reg_fit_impl(labels_ptrs[0],
304  features_ptrs,
305  coef_idxs.data(),
306  coefs.data(),
307  denulled_data.masked_num_rows);
308  did_execute = true;
309  }
310 #endif
311  if (!did_execute) {
312  return mgr.ERROR_MESSAGE(
313  "Cannot find " + preferred_ml_framework_str.getString() +
314  " ML library to support linear regression implementation.");
315  }
316  } catch (std::runtime_error& e) {
317  return mgr.ERROR_MESSAGE(e.what());
318  }
319  auto model =
320  std::make_shared<LinearRegressionModel>(coefs, model_metadata, cat_feature_keys);
321  g_ml_models.addModel(model_name, model);
322  const std::string model_name_str = model_name.getString();
323  const TextEncodingDict model_name_str_id =
324  output_model_name.getOrAddTransient(model_name);
325  output_model_name[0] = model_name_str_id;
326  return 1;
327 }
328 
329 // clang-format off
330 /*
331  UDTF: linear_reg_fit__cpu_template(TableFunctionManager,
332  TextEncodingNone model_name,
333  Cursor<Column<T> labels, ColumnList<T> features> data,
334  TextEncodingNone preferred_ml_framework | default="DEFAULT",
335  TextEncodingNone model_metadata | default="DEFAULT") ->
336  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
337  */
338 // clang-format on
339 
340 template <typename T>
341 NEVER_INLINE HOST int32_t
343  const TextEncodingNone& model_name,
344  const Column<T>& input_labels,
345  const ColumnList<T>& input_features,
346  const TextEncodingNone& preferred_ml_framework_str,
347  const TextEncodingNone& model_metadata,
348  Column<TextEncodingDict>& output_model_name) {
349  std::vector<std::vector<std::string>> empty_cat_feature_keys;
350  return linear_reg_fit_impl(mgr,
351  model_name,
352  input_labels,
353  input_features,
354  empty_cat_feature_keys,
355  preferred_ml_framework_str,
356  model_metadata,
357  output_model_name);
358 }
359 
360 template <typename T>
362  public:
364  const ColumnList<T>& numeric_features,
365  const int32_t cat_top_k,
366  const float cat_min_fraction,
367  const bool cat_include_others)
368  : num_rows_(numeric_features.size()) {
370  one_hot_encoding_info(cat_top_k, cat_min_fraction, cat_include_others);
371  const size_t num_cat_features = static_cast<size_t>(cat_features.numCols());
372  std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
373  one_hot_encoding_infos;
374  for (size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
375  one_hot_encoding_infos.emplace_back(one_hot_encoding_info);
376  }
377  one_hot_encoded_cols_ =
378  TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
379  cat_features, one_hot_encoding_infos);
380  for (auto& one_hot_encoded_col : one_hot_encoded_cols_) {
381  cat_feature_keys_.emplace_back(one_hot_encoded_col.cat_features);
382  for (auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
383  col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
384  }
385  }
386  const int64_t num_numeric_features = numeric_features.numCols();
387  for (int64_t numeric_feature_idx = 0; numeric_feature_idx < num_numeric_features;
388  ++numeric_feature_idx) {
389  col_ptrs_.emplace_back(numeric_features.ptrs_[numeric_feature_idx]);
390  }
391  }
392 
394  const int32_t cat_top_k,
395  const float cat_min_fraction,
396  const bool cat_include_others)
397  : num_rows_(cat_features.size()) {
399  one_hot_encoding_info(cat_top_k, cat_min_fraction, cat_include_others);
400  const size_t num_cat_features = static_cast<size_t>(cat_features.numCols());
401  std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
402  one_hot_encoding_infos;
403  for (size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
404  one_hot_encoding_infos.emplace_back(one_hot_encoding_info);
405  }
406  one_hot_encoded_cols_ =
407  TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
408  cat_features, one_hot_encoding_infos);
409  for (auto& one_hot_encoded_col : one_hot_encoded_cols_) {
410  cat_feature_keys_.emplace_back(one_hot_encoded_col.cat_features);
411  for (auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
412  col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
413  }
414  }
415  }
416 
418  const ColumnList<TextEncodingDict>& cat_features,
419  const ColumnList<T>& numeric_features,
420  const std::vector<std::vector<std::string>>& cat_feature_keys)
421  : num_rows_(numeric_features.size()), cat_feature_keys_(cat_feature_keys) {
422  const size_t num_cat_features = static_cast<size_t>(cat_features.numCols());
423  if (num_cat_features != cat_feature_keys_.size()) {
424  throw std::runtime_error(
425  "Number of provided categorical features does not match number of categorical "
426  "features in the model.");
427  }
428  std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
429  one_hot_encoding_infos;
430  for (size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
431  one_hot_encoding_infos.emplace_back(cat_feature_keys_[cat_idx]);
432  }
433  one_hot_encoded_cols_ =
434  TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
435  cat_features, one_hot_encoding_infos);
436  for (auto& one_hot_encoded_col : one_hot_encoded_cols_) {
437  for (auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
438  col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
439  }
440  }
441  const int64_t num_numeric_features = numeric_features.numCols();
442  for (int64_t numeric_feature_idx = 0; numeric_feature_idx < num_numeric_features;
443  ++numeric_feature_idx) {
444  col_ptrs_.emplace_back(numeric_features.ptrs_[numeric_feature_idx]);
445  }
446  }
447 
449  const ColumnList<TextEncodingDict>& cat_features,
450  const std::vector<std::vector<std::string>>& cat_feature_keys)
451  : num_rows_(cat_features.size()), cat_feature_keys_(cat_feature_keys) {
452  const size_t num_cat_features = static_cast<size_t>(cat_features.numCols());
453  if (num_cat_features != cat_feature_keys_.size()) {
454  throw std::runtime_error(
455  "Number of provided categorical features does not match number of categorical "
456  "features in the model.");
457  }
458  std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
459  one_hot_encoding_infos;
460  for (size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
461  one_hot_encoding_infos.emplace_back(cat_feature_keys_[cat_idx]);
462  }
463  one_hot_encoded_cols_ =
464  TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
465  cat_features, one_hot_encoding_infos);
466  for (auto& one_hot_encoded_col : one_hot_encoded_cols_) {
467  for (auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
468  col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
469  }
470  }
471  }
472 
474  return ColumnList<T>(
475  col_ptrs_.data(), static_cast<int64_t>(col_ptrs_.size()), num_rows_);
476  }
477 
478  const std::vector<std::vector<std::string>>& getCatFeatureKeys() const {
479  return cat_feature_keys_;
480  }
481 
482  private:
483  int64_t num_rows_;
484  std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodedCol<T>>
486  std::vector<std::vector<std::string>> cat_feature_keys_;
487  std::vector<int8_t*> col_ptrs_;
488 };
489 
490 // clang-format off
491 /*
492  UDTF: linear_reg_fit__cpu_template(TableFunctionManager,
493  TextEncodingNone model_name,
494  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features,
495  ColumnList<T> numeric_features> data,
496  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
497  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
498  TextEncodingNone preferred_ml_framework | default="DEFAULT",
499  TextEncodingNone model_metadata | default="DEFAULT") ->
500  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
501  */
502 // clang-format on
503 
504 template <typename T>
505 NEVER_INLINE HOST int32_t
507  const TextEncodingNone& model_name,
508  const Column<T>& input_labels,
509  const ColumnList<TextEncodingDict>& input_cat_features,
510  const ColumnList<T>& input_numeric_features,
511  const int32_t cat_top_k,
512  const float cat_min_fraction,
513  const TextEncodingNone& preferred_ml_framework_str,
514  const TextEncodingNone& model_metadata,
515  Column<TextEncodingDict>& output_model_name) {
516  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
517  input_numeric_features,
518  cat_top_k,
519  cat_min_fraction,
520  false /* cat_include_others */);
521 
522  return linear_reg_fit_impl(mgr,
523  model_name,
524  input_labels,
525  cat_features_builder.getFeatures(),
526  cat_features_builder.getCatFeatureKeys(),
527  preferred_ml_framework_str,
528  model_metadata,
529  output_model_name);
530 }
531 
532 // clang-format off
533 /*
534  UDTF: linear_reg_fit__cpu_template(TableFunctionManager,
535  TextEncodingNone model_name,
536  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features> data,
537  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
538  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
539  TextEncodingNone preferred_ml_framework | default="DEFAULT",
540  TextEncodingNone model_metadata | default="DEFAULT") ->
541  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
542  */
543 // clang-format on
544 
545 template <typename T>
546 NEVER_INLINE HOST int32_t
548  const TextEncodingNone& model_name,
549  const Column<T>& input_labels,
550  const ColumnList<TextEncodingDict>& input_cat_features,
551  const int32_t cat_top_k,
552  const float cat_min_fraction,
553  const TextEncodingNone& preferred_ml_framework_str,
554  const TextEncodingNone& model_metadata,
555  Column<TextEncodingDict>& output_model_name) {
556  CategoricalFeaturesBuilder<T> cat_features_builder(
557  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
558 
559  return linear_reg_fit_impl(mgr,
560  model_name,
561  input_labels,
562  cat_features_builder.getFeatures(),
563  cat_features_builder.getCatFeatureKeys(),
564  preferred_ml_framework_str,
565  model_metadata,
566  output_model_name);
567 }
568 
569 template <typename T>
570 Column<T> create_wrapper_col(std::vector<T>& col_vec) {
571  Column<T> wrapper_col(col_vec.data(), static_cast<int64_t>(col_vec.size()));
572  return wrapper_col;
573 }
574 
575 // clang-format off
576 /*
577  UDTF: linear_reg_coefs__cpu_1(TableFunctionManager,
578  TextEncodingNone model_name) ->
579  Column<int64_t> coef_idx, Column<TextEncodingDict> feature | input_id=args<>,
580  Column<int64_t> sub_coef_idx, Column<TextEncodingDict> sub_feature | input_id=args<>,
581  Column<double> coef
582  */
583 // clang-format on
584 
587  const TextEncodingNone& model_name,
588  Column<int64_t>& output_coef_idx,
589  Column<TextEncodingDict>& output_feature,
590  Column<int64_t>& output_sub_coef_idx,
591  Column<TextEncodingDict>& output_sub_feature,
592  Column<double>& output_coef);
593 
594 // clang-format off
595 /*
596  UDTF: linear_reg_coefs__cpu_2(TableFunctionManager,
597  Cursor<Column<TextEncodingDict> name> model_name) ->
598  Column<int64_t> coef_idx, Column<TextEncodingDict> feature | input_id=args<>,
599  Column<int64_t> sub_coef_idx, Column<TextEncodingDict> sub_feature | input_id=args<>,
600  Column<double> coef
601  */
602 // clang-format on
603 
606  const Column<TextEncodingDict>& model_name,
607  Column<int64_t>& output_coef_idx,
608  Column<TextEncodingDict>& output_feature,
609  Column<int64_t>& output_sub_coef_idx,
610  Column<TextEncodingDict>& output_sub_feature,
611  Column<double>& output_coef);
612 
613 template <typename T>
614 NEVER_INLINE HOST int32_t
616  const TextEncodingNone& model_name,
617  const Column<T>& input_labels,
618  const ColumnList<T>& input_features,
619  const std::vector<std::vector<std::string>>& cat_feature_keys,
620  const int64_t max_tree_depth,
621  const int64_t min_observations_per_leaf_node,
622  const TextEncodingNone& preferred_ml_framework_str,
623  const TextEncodingNone& model_metadata,
624  Column<TextEncodingDict>& output_model_name) {
625  if (input_labels.size() == 0) {
626  return mgr.ERROR_MESSAGE(
627  "No rows exist in training data. Training data must at least contain 1 row.");
628  }
629  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
630  if (preferred_ml_framework == MLFramework::INVALID) {
631  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
632  preferred_ml_framework_str.getString());
633  }
634  if (preferred_ml_framework == MLFramework::MLPACK) {
635  return mgr.ERROR_MESSAGE(
636  "Only OneDAL framework supported for decision tree regression.");
637  }
638 #ifndef HAVE_ONEDAL
639  return mgr.ERROR_MESSAGE(
640  "Only OneDAL framework supported for decision tree regression.");
641 #endif
642 
643  const auto denulled_data = denull_data(input_labels, input_features);
644  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
645  const auto features_ptrs =
646  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
647  mgr.set_output_row_size(1);
648  try {
649  bool did_execute = false;
650 #ifdef HAVE_ONEDAL
651  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
652  preferred_ml_framework == MLFramework::DEFAULT)) {
653  onedal_decision_tree_reg_fit_impl<T>(model_name,
654  labels_ptrs[0],
655  features_ptrs,
656  model_metadata,
657  cat_feature_keys,
658  denulled_data.masked_num_rows,
659  max_tree_depth,
660  min_observations_per_leaf_node);
661  const TextEncodingDict model_name_str_id =
662  output_model_name.getOrAddTransient(model_name);
663  output_model_name[0] = model_name_str_id;
664  did_execute = true;
665  }
666 #endif
667  if (!did_execute) {
668  return mgr.ERROR_MESSAGE(
669  "Cannot find " + preferred_ml_framework_str.getString() +
670  " ML library to support decision tree regression implementation.");
671  }
672  } catch (std::runtime_error& e) {
673  return mgr.ERROR_MESSAGE(e.what());
674  }
675  return 1;
676 }
677 
678 // clang-format off
679 /*
680  UDTF: decision_tree_reg_fit__cpu_template(TableFunctionManager,
681  TextEncodingNone model_name,
682  Cursor<Column<T> labels, ColumnList<T> features> data,
683  int64_t max_tree_depth | require="max_tree_depth >= 0" | default=0,
684  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node >= 0" | default=5,
685  TextEncodingNone preferred_ml_framework | default="DEFAULT",
686  TextEncodingNone model_metadata | default="DEFAULT") ->
687  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
688  */
689 // clang-format on
690 
691 template <typename T>
692 NEVER_INLINE HOST int32_t
694  const TextEncodingNone& model_name,
695  const Column<T>& input_labels,
696  const ColumnList<T>& input_features,
697  const int64_t max_tree_depth,
698  const int64_t min_observations_per_leaf_node,
699  const TextEncodingNone& preferred_ml_framework_str,
700  const TextEncodingNone& model_metadata,
701  Column<TextEncodingDict>& output_model_name) {
702  std::vector<std::vector<std::string>> empty_cat_feature_keys;
703  return decision_tree_reg_impl(mgr,
704  model_name,
705  input_labels,
706  input_features,
707  empty_cat_feature_keys,
708  max_tree_depth,
709  min_observations_per_leaf_node,
710  preferred_ml_framework_str,
711  model_metadata,
712  output_model_name);
713 }
714 
715 // clang-format off
716 /*
717  UDTF: decision_tree_reg_fit__cpu_template(TableFunctionManager,
718  TextEncodingNone model_name,
719  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features, ColumnList<T> numeric_features> data,
720  int64_t max_tree_depth | require="max_tree_depth >= 0" | default=0,
721  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node >= 0" | default=5,
722  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
723  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
724  TextEncodingNone preferred_ml_framework | default="DEFAULT",
725  TextEncodingNone model_metadata | default="DEFAULT") ->
726  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
727  */
728 // clang-format on
729 
730 template <typename T>
733  const TextEncodingNone& model_name,
734  const Column<T>& input_labels,
735  const ColumnList<TextEncodingDict>& input_cat_features,
736  const ColumnList<T>& input_numeric_features,
737  const int64_t max_tree_depth,
738  const int64_t min_observations_per_leaf_node,
739  const int32_t cat_top_k,
740  const float cat_min_fraction,
741  const TextEncodingNone& preferred_ml_framework_str,
742  const TextEncodingNone& model_metadata,
743  Column<TextEncodingDict>& output_model_name) {
744  std::vector<std::vector<std::string>> empty_cat_feature_keys;
745  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
746  input_numeric_features,
747  cat_top_k,
748  cat_min_fraction,
749  false /* cat_include_others */);
750  return decision_tree_reg_impl(mgr,
751  model_name,
752  input_labels,
753  cat_features_builder.getFeatures(),
754  cat_features_builder.getCatFeatureKeys(),
755  max_tree_depth,
756  min_observations_per_leaf_node,
757  preferred_ml_framework_str,
758  model_metadata,
759  output_model_name);
760 }
761 
762 // clang-format off
763 /*
764  UDTF: decision_tree_reg_fit__cpu_template(TableFunctionManager,
765  TextEncodingNone model_name,
766  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features> data,
767  int64_t max_tree_depth | require="max_tree_depth >= 0" | default=0,
768  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node >= 0" | default=5,
769  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
770  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
771  TextEncodingNone preferred_ml_framework | default="DEFAULT",
772  TextEncodingNone model_metadata | default="DEFAULT") ->
773  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
774  */
775 // clang-format on
776 
777 template <typename T>
780  const TextEncodingNone& model_name,
781  const Column<T>& input_labels,
782  const ColumnList<TextEncodingDict>& input_cat_features,
783  const int64_t max_tree_depth,
784  const int64_t min_observations_per_leaf_node,
785  const int32_t cat_top_k,
786  const float cat_min_fraction,
787  const TextEncodingNone& preferred_ml_framework_str,
788  const TextEncodingNone& model_metadata,
789  Column<TextEncodingDict>& output_model_name) {
790  std::vector<std::vector<std::string>> empty_cat_feature_keys;
791  CategoricalFeaturesBuilder<T> cat_features_builder(
792  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
793  return decision_tree_reg_impl(mgr,
794  model_name,
795  input_labels,
796  cat_features_builder.getFeatures(),
797  cat_features_builder.getCatFeatureKeys(),
798  max_tree_depth,
799  min_observations_per_leaf_node,
800  preferred_ml_framework_str,
801  model_metadata,
802  output_model_name);
803 }
804 
805 template <typename T>
806 NEVER_INLINE HOST int32_t
808  const TextEncodingNone& model_name,
809  const Column<T>& input_labels,
810  const ColumnList<T>& input_features,
811  const std::vector<std::vector<std::string>>& cat_feature_keys,
812  const int64_t max_iterations,
813  const int64_t max_tree_depth,
814  const double shrinkage,
815  const double min_split_loss,
816  const double lambda,
817  const double obs_per_tree_fraction,
818  const int64_t features_per_node,
819  const int64_t min_observations_per_leaf_node,
820  const int64_t max_bins,
821  const int64_t min_bin_size,
822  const TextEncodingNone& preferred_ml_framework_str,
823  const TextEncodingNone& model_metadata,
824  Column<TextEncodingDict>& output_model_name) {
825  if (input_labels.size() == 0) {
826  return mgr.ERROR_MESSAGE(
827  "No rows exist in training data. Training data must at least contain 1 row.");
828  }
829  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
830  if (preferred_ml_framework == MLFramework::INVALID) {
831  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
832  preferred_ml_framework_str.getString());
833  }
834  if (preferred_ml_framework == MLFramework::MLPACK) {
835  return mgr.ERROR_MESSAGE("Only OneDAL framework supported for GBT regression.");
836  }
837 #ifndef HAVE_ONEDAL
838  return mgr.ERROR_MESSAGE("Only OneDAL framework supported for GBT regression.");
839 #endif
840 
841  const auto denulled_data = denull_data(input_labels, input_features);
842  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
843  const auto features_ptrs =
844  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
845  mgr.set_output_row_size(1);
846  try {
847  bool did_execute = false;
848 #ifdef HAVE_ONEDAL
849  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
850  preferred_ml_framework == MLFramework::DEFAULT)) {
851  onedal_gbt_reg_fit_impl<T>(model_name,
852  labels_ptrs[0],
853  features_ptrs,
854  model_metadata,
855  cat_feature_keys,
856  denulled_data.masked_num_rows,
857  max_iterations,
858  max_tree_depth,
859  shrinkage,
860  min_split_loss,
861  lambda,
862  obs_per_tree_fraction,
863  features_per_node,
864  min_observations_per_leaf_node,
865  max_bins,
866  min_bin_size);
867  const TextEncodingDict model_name_str_id =
868  output_model_name.getOrAddTransient(model_name);
869  output_model_name[0] = model_name_str_id;
870  did_execute = true;
871  }
872 #endif
873  if (!did_execute) {
874  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
875  " ML library to support GBT regression implementation.");
876  }
877  } catch (std::runtime_error& e) {
878  return mgr.ERROR_MESSAGE(e.what());
879  }
880  return 1;
881 }
882 
883 // clang-format off
884 /*
885  UDTF: gbt_reg_fit__cpu_template(TableFunctionManager,
886  TextEncodingNone model_name,
887  Cursor<Column<T> labels, ColumnList<T> features> data,
888  int64_t max_iterations | require="max_iterations > 0" | default=50,
889  int64_t max_tree_depth | require="max_tree_depth > 0" | default=6,
890  double shrinkage | require="shrinkage > 0.0" | require="shrinkage <= 1.0" | default=0.3,
891  double min_split_loss | require="min_split_loss >= 0.0" | default=0.0,
892  double lambda | require="lambda >= 0.0" | default=1.0,
893  double obs_per_tree_fraction | require="obs_per_tree_fraction > 0.0" | require="obs_per_tree_fraction <= 1.0" | default=1.0,
894  int64_t features_per_node | require="features_per_node >= 0" | default=0,
895  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node > 0" | default=5,
896  int64_t max_bins | require="max_bins > 0" | default=256,
897  int64_t min_bin_size | require="min_bin_size >= 0" | default=5,
898  TextEncodingNone preferred_ml_framework | default="DEFAULT",
899  TextEncodingNone model_metadata | default="DEFAULT") ->
900  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
901  */
902 // clang-format on
903 
904 template <typename T>
905 NEVER_INLINE HOST int32_t
907  const TextEncodingNone& model_name,
908  const Column<T>& input_labels,
909  const ColumnList<T>& input_features,
910  const int64_t max_iterations,
911  const int64_t max_tree_depth,
912  const double shrinkage,
913  const double min_split_loss,
914  const double lambda,
915  const double obs_per_tree_fraction,
916  const int64_t features_per_node,
917  const int64_t min_observations_per_leaf_node,
918  const int64_t max_bins,
919  const int64_t min_bin_size,
920  const TextEncodingNone& preferred_ml_framework_str,
921  const TextEncodingNone& model_metadata,
922  Column<TextEncodingDict>& output_model_name) {
923  std::vector<std::vector<std::string>> empty_cat_feature_keys;
924  return gbt_reg_fit_impl(mgr,
925  model_name,
926  input_labels,
927  input_features,
928  empty_cat_feature_keys,
929  max_iterations,
930  max_tree_depth,
931  shrinkage,
932  min_split_loss,
933  lambda,
934  obs_per_tree_fraction,
935  features_per_node,
936  min_observations_per_leaf_node,
937  max_bins,
938  min_bin_size,
939  preferred_ml_framework_str,
940  model_metadata,
941  output_model_name);
942 }
943 
944 // clang-format off
945 /*
946  UDTF: gbt_reg_fit__cpu_template(TableFunctionManager,
947  TextEncodingNone model_name,
948  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features, ColumnList<T> numeric_features> data,
949  int64_t max_iterations | require="max_iterations > 0" | default=50,
950  int64_t max_tree_depth | require="max_tree_depth > 0" | default=6,
951  double shrinkage | require="shrinkage > 0.0" | require="shrinkage <= 1.0" | default=0.3,
952  double min_split_loss | require="min_split_loss >= 0.0" | default=0.0,
953  double lambda | require="lambda >= 0.0" | default=1.0,
954  double obs_per_tree_fraction | require="obs_per_tree_fraction > 0.0" | require="obs_per_tree_fraction <= 1.0" | default=1.0,
955  int64_t features_per_node | require="features_per_node >= 0" | default=0,
956  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node > 0" | default=5,
957  int64_t max_bins | require="max_bins > 0" | default=256,
958  int64_t min_bin_size | require="min_bin_size >= 0" | default=5,
959  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
960  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
961  TextEncodingNone preferred_ml_framework | default="DEFAULT",
962  TextEncodingNone model_metadata | default="DEFAULT") ->
963  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
964  */
965 // clang-format on
966 
967 template <typename T>
968 NEVER_INLINE HOST int32_t
970  const TextEncodingNone& model_name,
971  const Column<T>& input_labels,
972  const ColumnList<TextEncodingDict>& input_cat_features,
973  const ColumnList<T>& input_numeric_features,
974  const int64_t max_iterations,
975  const int64_t max_tree_depth,
976  const double shrinkage,
977  const double min_split_loss,
978  const double lambda,
979  const double obs_per_tree_fraction,
980  const int64_t features_per_node,
981  const int64_t min_observations_per_leaf_node,
982  const int64_t max_bins,
983  const int64_t min_bin_size,
984  const int32_t cat_top_k,
985  const float cat_min_fraction,
986  const TextEncodingNone& preferred_ml_framework_str,
987  const TextEncodingNone& model_metadata,
988  Column<TextEncodingDict>& output_model_name) {
989  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
990  input_numeric_features,
991  cat_top_k,
992  cat_min_fraction,
993  false /* cat_include_others */);
994  return gbt_reg_fit_impl(mgr,
995  model_name,
996  input_labels,
997  cat_features_builder.getFeatures(),
998  cat_features_builder.getCatFeatureKeys(),
999  max_iterations,
1000  max_tree_depth,
1001  shrinkage,
1002  min_split_loss,
1003  lambda,
1004  obs_per_tree_fraction,
1005  features_per_node,
1006  min_observations_per_leaf_node,
1007  max_bins,
1008  min_bin_size,
1009  preferred_ml_framework_str,
1010  model_metadata,
1011  output_model_name);
1012 }
1013 
1014 // clang-format off
1015 /*
1016  UDTF: gbt_reg_fit__cpu_template(TableFunctionManager,
1017  TextEncodingNone model_name,
1018  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features> data,
1019  int64_t max_iterations | require="max_iterations > 0" | default=50,
1020  int64_t max_tree_depth | require="max_tree_depth > 0" | default=6,
1021  double shrinkage | require="shrinkage > 0.0" | require="shrinkage <= 1.0" | default=0.3,
1022  double min_split_loss | require="min_split_loss >= 0.0" | default=0.0,
1023  double lambda | require="lambda >= 0.0" | default=1.0,
1024  double obs_per_tree_fraction | require="obs_per_tree_fraction > 0.0" | require="obs_per_tree_fraction <= 1.0" | default=1.0,
1025  int64_t features_per_node | require="features_per_node >= 0" | default=0,
1026  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node > 0" | default=5,
1027  int64_t max_bins | require="max_bins > 0" | default=256,
1028  int64_t min_bin_size | require="min_bin_size >= 0" | default=5,
1029  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
1030  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
1031  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1032  TextEncodingNone model_metadata | default="DEFAULT") ->
1033  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
1034  */
1035 // clang-format on
1036 
1037 template <typename T>
1038 NEVER_INLINE HOST int32_t
1040  const TextEncodingNone& model_name,
1041  const Column<T>& input_labels,
1042  const ColumnList<TextEncodingDict>& input_cat_features,
1043  const int64_t max_iterations,
1044  const int64_t max_tree_depth,
1045  const double shrinkage,
1046  const double min_split_loss,
1047  const double lambda,
1048  const double obs_per_tree_fraction,
1049  const int64_t features_per_node,
1050  const int64_t min_observations_per_leaf_node,
1051  const int64_t max_bins,
1052  const int64_t min_bin_size,
1053  const int32_t cat_top_k,
1054  const float cat_min_fraction,
1055  const TextEncodingNone& preferred_ml_framework_str,
1056  const TextEncodingNone& model_metadata,
1057  Column<TextEncodingDict>& output_model_name) {
1058  CategoricalFeaturesBuilder<T> cat_features_builder(
1059  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
1060  return gbt_reg_fit_impl(mgr,
1061  model_name,
1062  input_labels,
1063  cat_features_builder.getFeatures(),
1064  cat_features_builder.getCatFeatureKeys(),
1065  max_iterations,
1066  max_tree_depth,
1067  shrinkage,
1068  min_split_loss,
1069  lambda,
1070  obs_per_tree_fraction,
1071  features_per_node,
1072  min_observations_per_leaf_node,
1073  max_bins,
1074  min_bin_size,
1075  preferred_ml_framework_str,
1076  model_metadata,
1077  output_model_name);
1078 }
1079 
1080 template <typename T>
1081 NEVER_INLINE HOST int32_t
1083  const TextEncodingNone& model_name,
1084  const Column<T>& input_labels,
1085  const ColumnList<T>& input_features,
1086  const std::vector<std::vector<std::string>>& cat_feature_keys,
1087  const int64_t num_trees,
1088  const double obs_per_tree_fraction,
1089  const int64_t max_tree_depth,
1090  const int64_t features_per_node,
1091  const double impurity_threshold,
1092  const bool bootstrap,
1093  const int64_t min_obs_per_leaf_node,
1094  const int64_t min_obs_per_split_node,
1095  const double min_weight_fraction_in_leaf_node,
1096  const double min_impurity_decrease_in_split_node,
1097  const int64_t max_leaf_nodes,
1098  const bool use_histogram,
1099  const TextEncodingNone& var_importance_metric_str,
1100  const TextEncodingNone& preferred_ml_framework_str,
1101  const TextEncodingNone& model_metadata,
1102  Column<TextEncodingDict>& output_model_name) {
1103  if (input_labels.size() == 0) {
1104  return mgr.ERROR_MESSAGE(
1105  "No rows exist in training data. Training data must at least contain 1 row.");
1106  }
1107  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
1108  if (preferred_ml_framework == MLFramework::INVALID) {
1109  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
1110  preferred_ml_framework_str.getString());
1111  }
1112  if (preferred_ml_framework == MLFramework::MLPACK) {
1113  return mgr.ERROR_MESSAGE(
1114  "Only OneDAL framework supported for random forest regression.");
1115  }
1116 #ifndef HAVE_ONEDAL
1117  return mgr.ERROR_MESSAGE(
1118  "Only OneDAL framework supported for random forest regression.");
1119 #endif
1120 
1121  const auto denulled_data = denull_data(input_labels, input_features);
1122  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
1123  const auto features_ptrs =
1124  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
1125  mgr.set_output_row_size(1);
1126  try {
1127  bool did_execute = false;
1128  const auto var_importance_metric =
1129  get_var_importance_metric(var_importance_metric_str);
1130  if (var_importance_metric == VarImportanceMetric::INVALID) {
1131  return mgr.ERROR_MESSAGE("Invalid variable importance metric: " +
1132  var_importance_metric_str.getString());
1133  }
1134 #ifdef HAVE_ONEDAL
1135  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1136  preferred_ml_framework == MLFramework::DEFAULT)) {
1137  if (use_histogram) {
1138  onedal_random_forest_reg_fit_impl<T, decision_forest::regression::training::hist>(
1139  model_name,
1140  labels_ptrs[0],
1141  features_ptrs,
1142  model_metadata,
1143  cat_feature_keys,
1144  denulled_data.masked_num_rows,
1145  num_trees,
1146  obs_per_tree_fraction,
1147  max_tree_depth,
1148  features_per_node,
1149  impurity_threshold,
1150  bootstrap,
1151  min_obs_per_leaf_node,
1152  min_obs_per_split_node,
1153  min_weight_fraction_in_leaf_node,
1154  min_impurity_decrease_in_split_node,
1155  max_leaf_nodes,
1156  var_importance_metric);
1157  } else {
1158  onedal_random_forest_reg_fit_impl<
1159  T,
1160  decision_forest::regression::training::defaultDense>(
1161  model_name,
1162  labels_ptrs[0],
1163  features_ptrs,
1164  model_metadata,
1165  cat_feature_keys,
1166  denulled_data.masked_num_rows,
1167  num_trees,
1168  obs_per_tree_fraction,
1169  max_tree_depth,
1170  features_per_node,
1171  impurity_threshold,
1172  bootstrap,
1173  min_obs_per_leaf_node,
1174  min_obs_per_split_node,
1175  min_weight_fraction_in_leaf_node,
1176  min_impurity_decrease_in_split_node,
1177  max_leaf_nodes,
1178  var_importance_metric);
1179  }
1180  const TextEncodingDict model_name_str_id =
1181  output_model_name.getOrAddTransient(model_name);
1182  output_model_name[0] = model_name_str_id;
1183  did_execute = true;
1184  }
1185 #endif
1186  if (!did_execute) {
1187  return mgr.ERROR_MESSAGE(
1188  "Cannot find " + preferred_ml_framework_str.getString() +
1189  " ML library to support random forest regression implementation.");
1190  }
1191  } catch (std::runtime_error& e) {
1192  return mgr.ERROR_MESSAGE(e.what());
1193  }
1194  return 1;
1195 }
1196 
1197 // clang-format off
1198 /*
1199  UDTF: random_forest_reg_fit__cpu_template(TableFunctionManager,
1200  TextEncodingNone model_name,
1201  Cursor<Column<T> labels, ColumnList<T> features> data,
1202  int64_t num_trees | require="num_trees > 0" | default=10,
1203  double obs_per_tree_fraction | require="obs_per_tree_fraction > 0.0" | require="obs_per_tree_fraction <= 1.0" | default=1.0,
1204  int64_t max_tree_depth | require="max_tree_depth >= 0" | default=0,
1205  int64_t features_per_node | require="features_per_node >= 0" | default=0,
1206  double impurity_threshold | require="impurity_threshold >= 0.0" | default=0.0,
1207  bool bootstrap | default=true,
1208  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node > 0" | default=5,
1209  int64_t min_obs_per_split_node | require="min_obs_per_leaf_node > 0" | default=2,
1210  double min_weight_fraction_in_leaf_node | require="min_weight_fraction_in_leaf_node >= 0.0" | default=0.0,
1211  double min_impurity_decrease_in_split_node | require="min_impurity_decrease_in_split_node >= 0.0" | default=0.0,
1212  int64_t max_leaf_nodes | require="max_leaf_nodes >=0" | default=0,
1213  bool use_histogram | default=false,
1214  TextEncodingNone var_importance_metric | default="MDI",
1215  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1216  TextEncodingNone model_metadata | default="DEFAULT") ->
1217  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
1218  */
1219 // clang-format on
1220 
1221 template <typename T>
1222 NEVER_INLINE HOST int32_t
1224  const TextEncodingNone& model_name,
1225  const Column<T>& input_labels,
1226  const ColumnList<T>& input_features,
1227  const int64_t num_trees,
1228  const double obs_per_tree_fraction,
1229  const int64_t max_tree_depth,
1230  const int64_t features_per_node,
1231  const double impurity_threshold,
1232  const bool bootstrap,
1233  const int64_t min_obs_per_leaf_node,
1234  const int64_t min_obs_per_split_node,
1235  const double min_weight_fraction_in_leaf_node,
1236  const double min_impurity_decrease_in_split_node,
1237  const int64_t max_leaf_nodes,
1238  const bool use_histogram,
1239  const TextEncodingNone& var_importance_metric_str,
1240  const TextEncodingNone& preferred_ml_framework_str,
1241  const TextEncodingNone& model_metadata,
1242  Column<TextEncodingDict>& output_model_name) {
1243  std::vector<std::vector<std::string>> empty_cat_feature_keys;
1244  return random_forest_reg_fit_impl(mgr,
1245  model_name,
1246  input_labels,
1247  input_features,
1248  empty_cat_feature_keys,
1249  num_trees,
1250  obs_per_tree_fraction,
1251  max_tree_depth,
1252  features_per_node,
1253  impurity_threshold,
1254  bootstrap,
1255  min_obs_per_leaf_node,
1256  min_obs_per_split_node,
1257  min_weight_fraction_in_leaf_node,
1258  min_impurity_decrease_in_split_node,
1259  max_leaf_nodes,
1260  use_histogram,
1261  var_importance_metric_str,
1262  preferred_ml_framework_str,
1263  model_metadata,
1264  output_model_name);
1265 }
1266 
1267 // clang-format off
1268 /*
1269  UDTF: random_forest_reg_fit__cpu_template(TableFunctionManager,
1270  TextEncodingNone model_name,
1271  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features, ColumnList<T> numeric_features> data,
1272  int64_t num_trees | require="num_trees > 0" | default=10,
1273  double obs_per_tree_fraction | require="obs_per_tree_fraction > 0.0" | require="obs_per_tree_fraction <= 1.0" | default=1.0,
1274  int64_t max_tree_depth | require="max_tree_depth >= 0" | default=0,
1275  int64_t features_per_node | require="features_per_node >= 0" | default=0,
1276  double impurity_threshold | require="impurity_threshold >= 0.0" | default=0.0,
1277  bool bootstrap | default=true,
1278  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node > 0" | default=5,
1279  int64_t min_obs_per_split_node | require="min_obs_per_leaf_node > 0" | default=2,
1280  double min_weight_fraction_in_leaf_node | require="min_weight_fraction_in_leaf_node >= 0.0" | default=0.0,
1281  double min_impurity_decrease_in_split_node | require="min_impurity_decrease_in_split_node >= 0.0" | default=0.0,
1282  int64_t max_leaf_nodes | require="max_leaf_nodes >=0" | default=0,
1283  bool use_histogram | default=false,
1284  TextEncodingNone var_importance_metric | default="MDI",
1285  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
1286  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
1287  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1288  TextEncodingNone model_metadata | default="DEFAULT") ->
1289  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
1290  */
1291 // clang-format on
1292 
1293 template <typename T>
1295  TableFunctionManager& mgr,
1296  const TextEncodingNone& model_name,
1297  const Column<T>& input_labels,
1298  const ColumnList<TextEncodingDict>& input_cat_features,
1299  const ColumnList<T>& input_numeric_features,
1300  const int64_t num_trees,
1301  const double obs_per_tree_fraction,
1302  const int64_t max_tree_depth,
1303  const int64_t features_per_node,
1304  const double impurity_threshold,
1305  const bool bootstrap,
1306  const int64_t min_obs_per_leaf_node,
1307  const int64_t min_obs_per_split_node,
1308  const double min_weight_fraction_in_leaf_node,
1309  const double min_impurity_decrease_in_split_node,
1310  const int64_t max_leaf_nodes,
1311  const bool use_histogram,
1312  const TextEncodingNone& var_importance_metric_str,
1313  const int32_t cat_top_k,
1314  const float cat_min_fraction,
1315  const TextEncodingNone& preferred_ml_framework_str,
1316  const TextEncodingNone& model_metadata,
1317  Column<TextEncodingDict>& output_model_name) {
1318  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
1319  input_numeric_features,
1320  cat_top_k,
1321  cat_min_fraction,
1322  false /* cat_include_others */);
1323  return random_forest_reg_fit_impl(mgr,
1324  model_name,
1325  input_labels,
1326  cat_features_builder.getFeatures(),
1327  cat_features_builder.getCatFeatureKeys(),
1328  num_trees,
1329  obs_per_tree_fraction,
1330  max_tree_depth,
1331  features_per_node,
1332  impurity_threshold,
1333  bootstrap,
1334  min_obs_per_leaf_node,
1335  min_obs_per_split_node,
1336  min_weight_fraction_in_leaf_node,
1337  min_impurity_decrease_in_split_node,
1338  max_leaf_nodes,
1339  use_histogram,
1340  var_importance_metric_str,
1341  preferred_ml_framework_str,
1342  model_metadata,
1343  output_model_name);
1344 }
1345 
1346 // clang-format off
1347 /*
1348  UDTF: random_forest_reg_fit__cpu_template(TableFunctionManager,
1349  TextEncodingNone model_name,
1350  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features> data,
1351  int64_t num_trees | require="num_trees > 0" | default=10,
1352  double obs_per_tree_fraction | require="obs_per_tree_fraction > 0.0" | require="obs_per_tree_fraction <= 1.0" | default=1.0,
1353  int64_t max_tree_depth | require="max_tree_depth >= 0" | default=0,
1354  int64_t features_per_node | require="features_per_node >= 0" | default=0,
1355  double impurity_threshold | require="impurity_threshold >= 0.0" | default=0.0,
1356  bool bootstrap | default=true,
1357  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node > 0" | default=5,
1358  int64_t min_obs_per_split_node | require="min_obs_per_leaf_node > 0" | default=2,
1359  double min_weight_fraction_in_leaf_node | require="min_weight_fraction_in_leaf_node >= 0.0" | default=0.0,
1360  double min_impurity_decrease_in_split_node | require="min_impurity_decrease_in_split_node >= 0.0" | default=0.0,
1361  int64_t max_leaf_nodes | require="max_leaf_nodes >=0" | default=0,
1362  bool use_histogram | default=false,
1363  TextEncodingNone var_importance_metric | default="MDI",
1364  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
1365  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
1366  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1367  TextEncodingNone model_metadata | default="DEFAULT") ->
1368  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
1369  */
1370 // clang-format on
1371 
1372 template <typename T>
1374  TableFunctionManager& mgr,
1375  const TextEncodingNone& model_name,
1376  const Column<T>& input_labels,
1377  const ColumnList<TextEncodingDict>& input_cat_features,
1378  const int64_t num_trees,
1379  const double obs_per_tree_fraction,
1380  const int64_t max_tree_depth,
1381  const int64_t features_per_node,
1382  const double impurity_threshold,
1383  const bool bootstrap,
1384  const int64_t min_obs_per_leaf_node,
1385  const int64_t min_obs_per_split_node,
1386  const double min_weight_fraction_in_leaf_node,
1387  const double min_impurity_decrease_in_split_node,
1388  const int64_t max_leaf_nodes,
1389  const bool use_histogram,
1390  const TextEncodingNone& var_importance_metric_str,
1391  const int32_t cat_top_k,
1392  const float cat_min_fraction,
1393  const TextEncodingNone& preferred_ml_framework_str,
1394  const TextEncodingNone& model_metadata,
1395  Column<TextEncodingDict>& output_model_name) {
1396  CategoricalFeaturesBuilder<T> cat_features_builder(
1397  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
1398  return random_forest_reg_fit_impl(mgr,
1399  model_name,
1400  input_labels,
1401  cat_features_builder.getFeatures(),
1402  cat_features_builder.getCatFeatureKeys(),
1403  num_trees,
1404  obs_per_tree_fraction,
1405  max_tree_depth,
1406  features_per_node,
1407  impurity_threshold,
1408  bootstrap,
1409  min_obs_per_leaf_node,
1410  min_obs_per_split_node,
1411  min_weight_fraction_in_leaf_node,
1412  min_impurity_decrease_in_split_node,
1413  max_leaf_nodes,
1414  use_histogram,
1415  var_importance_metric_str,
1416  preferred_ml_framework_str,
1417  model_metadata,
1418  output_model_name);
1419 }
1420 
1421 template <typename T>
1422 NEVER_INLINE HOST int32_t
1424  const TextEncodingNone& model_name,
1425  const ColumnList<T>& input_features,
1426  const std::vector<std::vector<std::string>>& cat_feature_keys,
1427  const TextEncodingNone& preferred_ml_framework_str,
1428  const TextEncodingNone& model_metadata,
1429  Column<TextEncodingDict>& output_model_name) {
1430  if (input_features.size() == 0) {
1431  return mgr.ERROR_MESSAGE(
1432  "No rows exist in training data. Training data must at least contain 1 row.");
1433  }
1434  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
1435  if (preferred_ml_framework == MLFramework::INVALID) {
1436  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
1437  preferred_ml_framework_str.getString());
1438  }
1439  try {
1440  const auto denulled_data = denull_data(input_features);
1441  const int64_t num_rows = denulled_data.masked_num_rows;
1442  if (num_rows == 0) {
1443  return mgr.ERROR_MESSAGE(
1444  "No non-null rows exist in training data. Training data must at least contain "
1445  "1 "
1446  "non-null row.");
1447  }
1448  const auto features_ptrs =
1449  pluck_ptrs(denulled_data.data, 0L, input_features.numCols());
1450  // z_std_normalize_data_with_summary_stats can throw if std dev is 0
1451  const auto z_std_norm_summary_stats =
1452  z_std_normalize_data_with_summary_stats(denulled_data.data, num_rows);
1453  const auto normalized_ptrs =
1454  pluck_ptrs(z_std_norm_summary_stats.normalized_data,
1455  0L,
1456  z_std_norm_summary_stats.normalized_data.size());
1457  bool did_execute = false;
1458 #ifdef HAVE_ONEDAL
1459  if (preferred_ml_framework == MLFramework::ONEDAL ||
1460  preferred_ml_framework == MLFramework::DEFAULT) {
1461  const auto [eigenvectors, eigenvalues] =
1462  onedal_pca_impl(normalized_ptrs, denulled_data.masked_num_rows);
1463  auto model = std::make_shared<PcaModel>(z_std_norm_summary_stats.means,
1464  z_std_norm_summary_stats.std_devs,
1465  eigenvectors,
1466  eigenvalues,
1467  model_metadata,
1468  cat_feature_keys);
1469  g_ml_models.addModel(model_name, model);
1470  did_execute = true;
1471  }
1472 #endif
1473  if (!did_execute) {
1474  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
1475  " ML library to support PCA implementation.");
1476  }
1477  mgr.set_output_row_size(1);
1478  const TextEncodingDict model_name_str_id =
1479  output_model_name.getOrAddTransient(model_name);
1480  output_model_name[0] = model_name_str_id;
1481  return 1;
1482  } catch (std::runtime_error& e) {
1483  return mgr.ERROR_MESSAGE(e.what());
1484  }
1485 }
1486 
1487 // clang-format off
1488 /*
1489  UDTF: pca_fit__cpu_template(TableFunctionManager,
1490  TextEncodingNone model_name,
1491  Cursor<ColumnList<T> features> data,
1492  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1493  TextEncodingNone model_metadata | default="DEFAULT") ->
1494  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
1495  */
1496 // clang-format on
1497 
1498 template <typename T>
1499 NEVER_INLINE HOST int32_t
1501  const TextEncodingNone& model_name,
1502  const ColumnList<T>& input_features,
1503  const TextEncodingNone& preferred_ml_framework_str,
1504  const TextEncodingNone& model_metadata,
1505  Column<TextEncodingDict>& output_model_name) {
1506  std::vector<std::vector<std::string>> empty_cat_feature_keys;
1507  return pca_fit_impl(mgr,
1508  model_name,
1509  input_features,
1510  empty_cat_feature_keys,
1511  preferred_ml_framework_str,
1512  model_metadata,
1513  output_model_name);
1514 }
1515 
1516 // clang-format off
1517 /*
1518  UDTF: pca_fit__cpu_template(TableFunctionManager,
1519  TextEncodingNone model_name,
1520  Cursor<ColumnList<TextEncodingDict> cat_features, ColumnList<T> features> data,
1521  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
1522  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
1523  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1524  TextEncodingNone model_metadata | default="DEFAULT") ->
1525  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
1526  */
1527 // clang-format on
1528 
1529 template <typename T>
1530 NEVER_INLINE HOST int32_t
1532  const TextEncodingNone& model_name,
1533  const ColumnList<TextEncodingDict>& input_cat_features,
1534  const ColumnList<T>& input_numeric_features,
1535  const int32_t cat_top_k,
1536  const float cat_min_fraction,
1537  const TextEncodingNone& preferred_ml_framework_str,
1538  const TextEncodingNone& model_metadata,
1539  Column<TextEncodingDict>& output_model_name) {
1540  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
1541  input_numeric_features,
1542  cat_top_k,
1543  cat_min_fraction,
1544  false /* cat_include_others */);
1545  return pca_fit_impl(mgr,
1546  model_name,
1547  cat_features_builder.getFeatures(),
1548  cat_features_builder.getCatFeatureKeys(),
1549  preferred_ml_framework_str,
1550  model_metadata,
1551  output_model_name);
1552 }
1553 
1554 // clang-format off
1555 /*
1556  UDTF: pca_fit__cpu_1(TableFunctionManager,
1557  TextEncodingNone model_name,
1558  Cursor<ColumnList<TextEncodingDict> cat_features> data,
1559  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
1560  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
1561  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1562  TextEncodingNone model_metadata | default="DEFAULT") ->
1563  Column<TextEncodingDict> model_name | input_id=args<>
1564 */
1565 // clang-format on
1566 
// NOTE(review): the opening of this declaration (return type / qualifiers,
// function name and first parameter) is not present in this listing. The
// annotation directly above names the table function pca_fit__cpu_1 and lists
// TableFunctionManager as its first argument; restore the missing line(s) from
// the original header before building.
//
// Declaration only (it ends in ';'), so the definition presumably lives in a
// separate translation unit; confirm against the build.
1569  const TextEncodingNone& model_name,
1570  const ColumnList<TextEncodingDict>& input_cat_features,
1571  const int32_t cat_top_k,
1572  const float cat_min_fraction,
1573  const TextEncodingNone& preferred_ml_framework_str,
1574  const TextEncodingNone& model_metadata,
1575  Column<TextEncodingDict>& output_model_name);
1576 
1577 template <typename T, typename K>
1578 NEVER_INLINE HOST int32_t
1580  const std::shared_ptr<AbstractMLModel>& model,
1581  const Column<K>& input_ids,
1582  const ColumnList<T>& input_features,
1583  const TextEncodingNone& preferred_ml_framework_str,
1584  Column<K>& output_ids,
1585  Column<T>& output_predictions) {
1586  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
1587  if (preferred_ml_framework == MLFramework::INVALID) {
1588  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
1589  preferred_ml_framework_str.getString());
1590  }
1591  const auto denulled_data = denull_data(input_features);
1592  const int64_t num_rows = denulled_data.masked_num_rows;
1593  const bool data_is_masked =
1594  denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
1595  std::vector<T> denulled_output_allocation(data_is_masked ? num_rows : 0);
1596  mgr.set_output_row_size(input_ids.size());
1597  T* denulled_output =
1598  data_is_masked ? denulled_output_allocation.data() : output_predictions.ptr_;
1599  const auto features_ptrs = pluck_ptrs(denulled_data.data, 0L, input_features.numCols());
1600 
1601  try {
1602  bool did_execute = false;
1603  const auto model_type = model->getModelType();
1604  switch (model_type) {
1605  case MLModelType::LINEAR_REG: {
1606  const auto linear_reg_model =
1607  std::dynamic_pointer_cast<LinearRegressionModel>(model);
1608  CHECK(linear_reg_model);
1609 #ifdef HAVE_ONEDAL
1610  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1611  preferred_ml_framework == MLFramework::DEFAULT)) {
1612  onedal_linear_reg_predict_impl(
1613  linear_reg_model, features_ptrs, denulled_output, num_rows);
1614  did_execute = true;
1615  }
1616 #endif
1617 #ifdef HAVE_MLPACK
1618  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
1619  preferred_ml_framework == MLFramework::DEFAULT)) {
1620  mlpack_linear_reg_predict_impl(
1621  linear_reg_model, features_ptrs, denulled_output, num_rows);
1622  did_execute = true;
1623  }
1624 #endif
1625  break;
1626  }
1628 #ifdef HAVE_ONEDAL
1629  const auto decision_tree_reg_model =
1630  std::dynamic_pointer_cast<DecisionTreeRegressionModel>(model);
1631  CHECK(decision_tree_reg_model);
1632  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1633  preferred_ml_framework == MLFramework::DEFAULT)) {
1634  onedal_decision_tree_reg_predict_impl(
1635  decision_tree_reg_model, features_ptrs, denulled_output, num_rows);
1636  did_execute = true;
1637  }
1638 #endif
1639  break;
1640  }
1641  case MLModelType::GBT_REG: {
1642 #ifdef HAVE_ONEDAL
1643  const auto gbt_reg_model = std::dynamic_pointer_cast<GbtRegressionModel>(model);
1644  CHECK(gbt_reg_model);
1645  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1646  preferred_ml_framework == MLFramework::DEFAULT)) {
1647  onedal_gbt_reg_predict_impl(
1648  gbt_reg_model, features_ptrs, denulled_output, num_rows);
1649  did_execute = true;
1650  }
1651 #endif
1652  break;
1653  }
1655 #ifdef HAVE_ONEDAL
1656  const auto random_forest_reg_model =
1657  std::dynamic_pointer_cast<RandomForestRegressionModel>(model);
1658  CHECK(random_forest_reg_model);
1659  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1660  preferred_ml_framework == MLFramework::DEFAULT)) {
1661  onedal_random_forest_reg_predict_impl(
1662  random_forest_reg_model, features_ptrs, denulled_output, num_rows);
1663  did_execute = true;
1664  }
1665 #endif
1666  break;
1667  }
1668  default: {
1669  throw std::runtime_error("Unsupported model type");
1670  }
1671  }
1672  if (!did_execute) {
1673  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
1674  " ML library to support model implementation.");
1675  }
1676  } catch (std::runtime_error& e) {
1677  const std::string error_str(e.what());
1678  return mgr.ERROR_MESSAGE(error_str);
1679  }
1680  output_ids = input_ids;
1681  if (data_is_masked) {
1682  unmask_data(denulled_output,
1683  denulled_data.reverse_index_map,
1684  output_predictions.ptr_,
1685  denulled_data.unmasked_num_rows,
1686  inline_null_value<T>());
1687  }
1688  return input_ids.size();
1689 }
1690 
1691 // clang-format off
1692 /*
1693  UDTF: ml_reg_predict__cpu_template(TableFunctionManager,
1694  TextEncodingNone model_name,
1695  Cursor<Column<K> id, ColumnList<T> features> data,
1696  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
1697  Column<K> id | input_id=args<0>, Column<T> prediction,
1698  K=[int32_t, int64_t, TextEncodingDict], T=[double]
1699  */
1700 // clang-format on
1701 
1702 template <typename T, typename K>
1703 NEVER_INLINE HOST int32_t
1705  const TextEncodingNone& model_name,
1706  const Column<K>& input_ids,
1707  const ColumnList<T>& input_features,
1708  const TextEncodingNone& preferred_ml_framework_str,
1709  Column<K>& output_ids,
1710  Column<T>& output_predictions) {
1711  try {
1712  const auto model = g_ml_models.getModel(model_name);
1713  check_model_params(model, 0, input_features.numCols());
1714  return ml_reg_predict_impl(mgr,
1715  model,
1716  input_ids,
1717  input_features,
1718  preferred_ml_framework_str,
1719  output_ids,
1720  output_predictions);
1721  } catch (std::runtime_error& e) {
1722  const std::string error_str(e.what());
1723  return mgr.ERROR_MESSAGE(error_str);
1724  }
1725 }
1726 
1727 // clang-format off
1728 /*
1729  UDTF: ml_reg_predict__cpu_template(TableFunctionManager,
1730  TextEncodingNone model_name,
1731  Cursor<Column<K> id, ColumnList<TextEncodingDict> cat_features, ColumnList<T> features> data,
1732  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
1733  Column<K> id | input_id=args<0>, Column<T> prediction,
1734  K=[int32_t, int64_t, TextEncodingDict], T=[double]
1735  */
1736 // clang-format on
1737 
1738 template <typename T, typename K>
1739 NEVER_INLINE HOST int32_t
1741  const TextEncodingNone& model_name,
1742  const Column<K>& input_ids,
1743  const ColumnList<TextEncodingDict>& input_cat_features,
1744  const ColumnList<T>& input_numeric_features,
1745  const TextEncodingNone& preferred_ml_framework_str,
1746  Column<K>& output_ids,
1747  Column<T>& output_predictions) {
1748  try {
1749  const auto model = g_ml_models.getModel(model_name);
1751  model, input_cat_features.numCols(), input_numeric_features.numCols());
1752  CategoricalFeaturesBuilder<T> cat_features_builder(
1753  input_cat_features, input_numeric_features, model->getCatFeatureKeys());
1754  return ml_reg_predict_impl(mgr,
1755  model,
1756  input_ids,
1757  cat_features_builder.getFeatures(),
1758  preferred_ml_framework_str,
1759  output_ids,
1760  output_predictions);
1761  } catch (std::runtime_error& e) {
1762  const std::string error_str(e.what());
1763  return mgr.ERROR_MESSAGE(error_str);
1764  }
1765 }
1766 
1767 // clang-format off
1768 /*
1769  UDTF: ml_reg_predict__cpu_template(TableFunctionManager,
1770  TextEncodingNone model_name,
1771  Cursor<Column<K> id, ColumnList<TextEncodingDict> cat_features> data,
1772  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
1773  Column<K> id | input_id=args<0>, Column<T> prediction,
1774  K=[int32_t, int64_t, TextEncodingDict], T=[double]
1775  */
1776 // clang-format on
1777 
1778 template <typename T, typename K>
1779 NEVER_INLINE HOST int32_t
1781  const TextEncodingNone& model_name,
1782  const Column<K>& input_ids,
1783  const ColumnList<TextEncodingDict>& input_cat_features,
1784  const TextEncodingNone& preferred_ml_framework_str,
1785  Column<K>& output_ids,
1786  Column<T>& output_predictions) {
1787  try {
1788  const auto model = g_ml_models.getModel(model_name);
1789  check_model_params(model, input_cat_features.numCols(), 0);
1790  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
1791  model->getCatFeatureKeys());
1792  return ml_reg_predict_impl(mgr,
1793  model,
1794  input_ids,
1795  cat_features_builder.getFeatures(),
1796  preferred_ml_framework_str,
1797  output_ids,
1798  output_predictions);
1799  } catch (std::runtime_error& e) {
1800  const std::string error_str(e.what());
1801  return mgr.ERROR_MESSAGE(error_str);
1802  }
1803 }
1804 
1805 // clang-format off
1806 /*
1807  UDTF: ml_reg_predict__cpu_template(TableFunctionManager,
1808  Cursor<Column<TextEncodingDict> name> model_name,
1809  Cursor<Column<K> id, ColumnList<T> features> data,
1810  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
1811  Column<K> id | input_id=args<0>, Column<T> prediction,
1812  K=[int64_t, TextEncodingDict], T=[double]
1813  */
1814 // clang-format on
1815 
1816 template <typename T, typename K>
1817 NEVER_INLINE HOST int32_t
1819  const Column<TextEncodingDict>& model_name,
1820  const Column<K>& input_ids,
1821  const ColumnList<T>& input_features,
1822  const TextEncodingNone& preferred_ml_framework_str,
1823  Column<K>& output_ids,
1824  Column<T>& output_predictions) {
1825  if (model_name.size() != 1) {
1826  return mgr.ERROR_MESSAGE("Expected only one row in model CURSOR.");
1827  }
1828  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
1829  return ml_reg_predict__cpu_template(mgr,
1830  model_name_text_enc_none,
1831  input_ids,
1832  input_features,
1833  preferred_ml_framework_str,
1834  output_ids,
1835  output_predictions);
1836 }
1837 
1838 // clang-format off
1839 /*
1840  UDTF: ml_reg_predict__cpu_template(TableFunctionManager,
1841  Cursor<Column<TextEncodingDict> name> model_name,
1842  Cursor<Column<K> id, ColumnList<TextEncodingDict> cat_features, ColumnList<T> features> data,
1843  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
1844  Column<K> id | input_id=args<0>, Column<T> prediction,
1845  K=[int32_t, int64_t, TextEncodingDict], T=[double]
1846  */
1847 // clang-format on
1848 
1849 template <typename T, typename K>
1850 NEVER_INLINE HOST int32_t
1852  const Column<TextEncodingDict>& model_name,
1853  const Column<K>& input_ids,
1854  const ColumnList<TextEncodingDict>& input_cat_features,
1855  const ColumnList<T>& input_numeric_features,
1856  const TextEncodingNone& preferred_ml_framework_str,
1857  Column<K>& output_ids,
1858  Column<T>& output_predictions) {
1859  if (model_name.size() != 1) {
1860  return mgr.ERROR_MESSAGE("Expected only one row in model CURSOR.");
1861  }
1862  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
1863  return ml_reg_predict__cpu_template(mgr,
1864  model_name_text_enc_none,
1865  input_ids,
1866  input_cat_features,
1867  input_numeric_features,
1868  preferred_ml_framework_str,
1869  output_ids,
1870  output_predictions);
1871 }
1872 
1873 // clang-format off
1874 /*
1875  UDTF: ml_reg_predict__cpu_template(TableFunctionManager,
1876  Cursor<Column<TextEncodingDict> name> model_name,
1877  Cursor<Column<K> id, ColumnList<TextEncodingDict> cat_features> data,
1878  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
1879  Column<K> id | input_id=args<0>, Column<T> prediction,
1880  K=[int32_t, int64_t, TextEncodingDict], T=[double]
1881  */
1882 // clang-format on
1883 
1884 template <typename T, typename K>
1885 NEVER_INLINE HOST int32_t
1887  const Column<TextEncodingDict>& model_name,
1888  const Column<K>& input_ids,
1889  const ColumnList<TextEncodingDict>& input_cat_features,
1890  const TextEncodingNone& preferred_ml_framework_str,
1891  Column<K>& output_ids,
1892  Column<T>& output_predictions) {
1893  if (model_name.size() != 1) {
1894  return mgr.ERROR_MESSAGE("Expected only one row in model CURSOR.");
1895  }
1896  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
1897  return ml_reg_predict__cpu_template(mgr,
1898  model_name_text_enc_none,
1899  input_ids,
1900  input_cat_features,
1901  preferred_ml_framework_str,
1902  output_ids,
1903  output_predictions);
1904 }
1905 
1906 template <typename T>
1908  const std::shared_ptr<AbstractMLModel>& model,
1909  const Column<T>& input_labels,
1910  const ColumnList<T>& input_features,
1911  Column<double>& output_r2) {
1912  const int64_t num_rows = input_labels.size();
1913  if (num_rows == 0) {
1914  return mgr.ERROR_MESSAGE(
1915  "No rows exist in evaluation data. Evaluation data must at least contain 1 row.");
1916  }
1917  std::vector<T> output_predictions_vec(num_rows);
1918  Column<T> output_predictions(output_predictions_vec);
1919  std::vector<int64_t> input_ids_vec(num_rows);
1920  std::vector<int64_t> output_ids_vec(num_rows);
1921  Column<int64_t> input_ids(input_ids_vec);
1922  Column<int64_t> output_ids(output_ids_vec);
1924  TextEncodingNone ml_framework_encoding_none("DEFAULT");
1925 
1926  try {
1927  auto ret = ml_reg_predict_impl(mgr,
1928  model,
1929  input_ids,
1930  input_features,
1931  ml_framework_encoding_none,
1932  output_ids,
1933  output_predictions);
1934 
1935  if (ret < 0) {
1936  // A return of less than 0 symbolizes an error
1937  return ret;
1938  }
1939  } catch (std::runtime_error& e) {
1941  return mgr.ERROR_MESSAGE(e.what());
1942  }
1943 
1945  mgr.set_output_row_size(1);
1946 
1947  const auto labels_mean = get_column_mean(input_labels);
1948  const size_t max_thread_count = std::thread::hardware_concurrency();
1949  const size_t max_inputs_per_thread = 20000;
1950  const size_t num_threads = std::min(
1951  max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
1952 
1953  std::vector<double> local_sum_squared_regressions(num_threads, 0.0);
1954  std::vector<double> local_sum_squares(num_threads, 0.0);
1955 
1956  tbb::task_arena limited_arena(num_threads);
1957 
1958  limited_arena.execute([&] {
1960  tbb::blocked_range<int64_t>(0, num_rows),
1961  [&](const tbb::blocked_range<int64_t>& r) {
1962  const int64_t start_idx = r.begin();
1963  const int64_t end_idx = r.end();
1964  double local_sum_squared_regression{0.0};
1965  double local_sum_square{0.0};
1966  for (int64_t row_idx = start_idx; row_idx < end_idx; ++row_idx) {
1967  if (output_predictions[row_idx] != inline_null_value<T>()) {
1968  local_sum_squared_regression +=
1969  (input_labels[row_idx] - output_predictions[row_idx]) *
1970  (input_labels[row_idx] - output_predictions[row_idx]);
1971  local_sum_square += (input_labels[row_idx] - labels_mean) *
1972  (input_labels[row_idx] - labels_mean);
1973  }
1974  }
1975  const size_t thread_idx = tbb::this_task_arena::current_thread_index();
1976  local_sum_squared_regressions[thread_idx] += local_sum_squared_regression;
1977  local_sum_squares[thread_idx] += local_sum_square;
1978  });
1979  });
1980  double sum_squared_regression{0.0};
1981  double sum_squares{0.0};
1982  for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
1983  sum_squared_regression += local_sum_squared_regressions[thread_idx];
1984  sum_squares += local_sum_squares[thread_idx];
1985  }
1986  output_r2[0] = sum_squares == 0.0 ? 1.0 : 1.0 - (sum_squared_regression / sum_squares);
1987  return 1;
1988 }
1989 
1990 // clang-format off
1991 /*
1992  UDTF: r2_score__cpu_template(TableFunctionManager,
1993  TextEncodingNone model_name,
1994  Cursor<Column<T> labels, ColumnList<T> features> data) ->
1995  Column<double> r2, T=[double]
1996  */
1997 // clang-format on
1998 
1999 template <typename T>
2001  const TextEncodingNone& model_name,
2002  const Column<T>& input_labels,
2003  const ColumnList<T>& input_features,
2004  Column<double>& output_r2) {
2005  try {
2006  const auto model = g_ml_models.getModel(model_name);
2007  check_model_params(model, 0, input_features.numCols());
2008  return r2_score_impl(mgr, model, input_labels, input_features, output_r2);
2009  } catch (std::runtime_error& e) {
2010  const std::string error_str(e.what());
2011  return mgr.ERROR_MESSAGE(error_str);
2012  }
2013 }
2014 
2015 // clang-format off
2016 /*
2017  UDTF: r2_score__cpu_template(TableFunctionManager,
2018  Cursor<Column<TextEncodingDict> name> model_name,
2019  Cursor<Column<T> labels, ColumnList<T> features> data) ->
2020  Column<double> r2, T=[double]
2021  */
2022 // clang-format on
2023 
2024 template <typename T>
2025 NEVER_INLINE HOST int32_t
2027  const Column<TextEncodingDict>& model_name,
2028  const Column<T>& input_labels,
2029  const ColumnList<T>& input_features,
2030  Column<double>& output_r2) {
2031  if (model_name.size() != 1) {
2032  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
2033  }
2034  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
2035  return r2_score__cpu_template(
2036  mgr, model_name_text_enc_none, input_labels, input_features, output_r2);
2037 }
2038 
2039 // clang-format off
2040 /*
2041  UDTF: r2_score__cpu_template(TableFunctionManager,
2042  TextEncodingNone model_name,
2043  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features, ColumnList<T> numeric_features> data) -> Column<double> r2, T=[double]
2044  */
2045 // clang-format on
2046 
2047 template <typename T>
2048 NEVER_INLINE HOST int32_t
2050  const TextEncodingNone& model_name,
2051  const Column<T>& input_labels,
2052  const ColumnList<TextEncodingDict>& input_cat_features,
2053  const ColumnList<T>& input_numeric_features,
2054  Column<double>& output_r2) {
2055  try {
2056  const auto model = g_ml_models.getModel(model_name);
2058  model, input_cat_features.numCols(), input_numeric_features.numCols());
2059  CategoricalFeaturesBuilder<T> cat_features_builder(
2060  input_cat_features, input_numeric_features, model->getCatFeatureKeys());
2061  return r2_score_impl(
2062  mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2063  } catch (std::runtime_error& e) {
2064  const std::string error_str(e.what());
2065  return mgr.ERROR_MESSAGE(error_str);
2066  }
2067 }
2068 
2069 // clang-format off
2070 /*
2071  UDTF: r2_score__cpu_template(TableFunctionManager,
2072  TextEncodingNone model_name,
2073  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features> data) -> Column<double> r2, T=[double]
2074  */
2075 // clang-format on
2076 
2077 template <typename T>
2078 NEVER_INLINE HOST int32_t
2080  const TextEncodingNone& model_name,
2081  const Column<T>& input_labels,
2082  const ColumnList<TextEncodingDict>& input_cat_features,
2083  Column<double>& output_r2) {
2084  try {
2085  const auto model = g_ml_models.getModel(model_name);
2086  check_model_params(model, input_cat_features.numCols(), 0);
2087  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
2088  model->getCatFeatureKeys());
2089  return r2_score_impl(
2090  mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2091  } catch (std::runtime_error& e) {
2092  const std::string error_str(e.what());
2093  return mgr.ERROR_MESSAGE(error_str);
2094  }
2095 }
2096 
2097 // clang-format off
2098 /*
2099  UDTF: r2_score__cpu_template(TableFunctionManager,
2100  Cursor<Column<TextEncodingDict> name> model_name,
2101  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features, ColumnList<T> numeric_features> data) -> Column<double> r2, T=[double]
2102  */
2103 // clang-format on
2104 
2105 template <typename T>
2106 NEVER_INLINE HOST int32_t
2108  const Column<TextEncodingDict>& model_name,
2109  const Column<T>& input_labels,
2110  const ColumnList<TextEncodingDict>& input_cat_features,
2111  const ColumnList<T>& input_numeric_features,
2112  Column<double>& output_r2) {
2113  if (model_name.size() != 1) {
2114  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
2115  }
2116  const std::string model_name_str{model_name.getString(0)};
2117  try {
2118  const auto model = g_ml_models.getModel(model_name_str);
2120  model, input_cat_features.numCols(), input_numeric_features.numCols());
2121  CategoricalFeaturesBuilder<T> cat_features_builder(
2122  input_cat_features, input_numeric_features, model->getCatFeatureKeys());
2123  return r2_score_impl(
2124  mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2125  } catch (std::runtime_error& e) {
2126  const std::string error_str(e.what());
2127  return mgr.ERROR_MESSAGE(error_str);
2128  }
2129 }
2130 
2131 // clang-format off
2132 /*
2133  UDTF: random_forest_reg_var_importance__cpu_1(TableFunctionManager,
2134  TextEncodingNone model_name) ->
2135  Column<int64_t> feature_id, Column<TextEncodingDict> feature | input_id=args<>,
2136  Column<int64_t> sub_feature_id, Column<TextEncodingDict> sub_feature | input_id=args<>, Column<double> importance_score
2137  */
2138 // clang-format on
2139 
2142  const TextEncodingNone& model_name,
2143  Column<int64_t>& feature_id,
2144  Column<TextEncodingDict>& feature,
2145  Column<int64_t>& sub_feature_id,
2146  Column<TextEncodingDict>& sub_feature,
2147  Column<double>& importance_score);
2148 
2149 // clang-format off
2150 /*
2151  UDTF: random_forest_reg_var_importance__cpu_2(TableFunctionManager,
2152  Cursor<Column<TextEncodingDict> name> model_name) ->
2153  Column<int64_t> feature_id, Column<TextEncodingDict> feature | input_id=args<>,
2154  Column<int64_t> sub_feature_id, Column<TextEncodingDict> sub_feature | input_id=args<>, Column<double> importance_score
2155  */
2156 // clang-format on
2157 
2160  const Column<TextEncodingDict>& model_name,
2161  Column<int64_t>& feature_id,
2162  Column<TextEncodingDict>& feature,
2163  Column<int64_t>& sub_feature_id,
2164  Column<TextEncodingDict>& sub_feature,
2165  Column<double>& importance_score);
2166 
2167 // clang-format off
2168 /*
2169  UDTF: get_decision_trees__cpu_1(TableFunctionManager,
2170  TextEncodingNone model_name) ->
2171  Column<int64_t> tree_id,
2172  Column<int64_t> entry_id,
2173  Column<bool> is_split_node,
2174  Column<int64_t> feature_id,
2175  Column<int64_t> left_child,
2176  Column<int64_t> right_child,
2177  Column<double> value
2178  */
2179 // clang-format on
2180 
2183  const TextEncodingNone& model_name,
2184  Column<int64_t>& tree_id,
2185  Column<int64_t>& entry_id,
2186  Column<bool>& is_split_node,
2187  Column<int64_t>& feature_id,
2188  Column<int64_t>& left_child,
2189  Column<int64_t>& right_child,
2190  Column<double>& value);
2191 
2192 // clang-format off
2193 /*
2194  UDTF: get_decision_trees__cpu_2(TableFunctionManager,
2195  Cursor<Column<TextEncodingDict> name> model_name) ->
2196  Column<int64_t> tree_id,
2197  Column<int64_t> entry_id,
2198  Column<bool> is_split_node,
2199  Column<int64_t> feature_id,
2200  Column<int64_t> left_child,
2201  Column<int64_t> right_child,
2202  Column<double> value
2203  */
2204 // clang-format on
2205 
2208  const Column<TextEncodingDict>& model_name,
2209  Column<int64_t>& tree_id,
2210  Column<int64_t>& entry_id,
2211  Column<bool>& is_split_node,
2212  Column<int64_t>& feature_id,
2213  Column<int64_t>& left_child,
2214  Column<int64_t>& right_child,
2215  Column<double>& value);
2216 
2217 #endif // #ifndef __CUDACC__
DEVICE const std::string getString(int64_t index) const
NEVER_INLINE HOST int32_t pca_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const ColumnList< T > &numeric_features, const int32_t cat_top_k, const float cat_min_fraction, const bool cat_include_others)
KMeansInitStrategy get_kmeans_init_type(const std::string &init_type_str)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const int32_t cat_top_k, const float cat_min_fraction, const bool cat_include_others)
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
NEVER_INLINE HOST int32_t r2_score_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
std::string getString() const
Definition: heavydbTypes.h:641
NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
DEVICE int64_t size() const
NEVER_INLINE HOST int32_t decision_tree_reg_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
DEVICE int64_t numCols() const
EXTENSION_NOINLINE_HOST int32_t pca_fit__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< TextEncodingDict > &input_cat_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
EXTENSION_NOINLINE_HOST int32_t supported_ml_frameworks__cpu_(TableFunctionManager &mgr, Column< TextEncodingDict > &output_ml_frameworks, Column< bool > &output_availability, Column< bool > &output_default)
#define CHECK_GE(x, y)
Definition: Logger.h:306
MaskedData< T > denull_data(const ColumnList< T > &features)
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t kmeans__cpu_template(TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const int num_clusters, const int num_iterations, const TextEncodingNone &init_type_str, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
NEVER_INLINE HOST int32_t pca_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
#define CHECK_GT(x, y)
Definition: Logger.h:305
NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
MLFramework get_ml_framework(const std::string &ml_framework_str)
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
#define HOST
const size_t max_inputs_per_thread
NEVER_INLINE HOST int32_t random_forest_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const ColumnList< T > &numeric_features, const std::vector< std::vector< std::string >> &cat_feature_keys)
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
void addModel(const std::string &model_name, std::shared_ptr< AbstractMLModel > model)
Definition: MLModel.h:37
VarImportanceMetric get_var_importance_metric(const std::string &var_importance_metric_str)
NEVER_INLINE HOST int32_t gbt_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
#define EXTENSION_NOINLINE_HOST
Definition: heavydbTypes.h:55
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:50
void disable_output_allocations()
Definition: heavydbTypes.h:379
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
DEVICE int64_t numCols() const
NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
int8_t ** ptrs_
MLModelMap g_ml_models
Definition: MLModel.h:124
std::vector< int8_t * > col_ptrs_
#define CHECK_LE(x, y)
Definition: Logger.h:304
std::vector< TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodedCol< T > > one_hot_encoded_cols_
void unmask_data(const T *masked_input, const std::vector< int32_t > &reverse_index_map, T *unmasked_output, const int64_t num_unmasked_rows, const T null_val)
NEVER_INLINE HOST int32_t linear_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t dbscan__cpu_template(TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const double epsilon, const int32_t min_observations, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const std::vector< std::vector< std::string >> &cat_feature_keys)
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
#define NEVER_INLINE
std::vector< std::vector< T > > z_std_normalize_data(const std::vector< T * > &input_data, const int64_t num_rows)
#define CHECK(condition)
Definition: Logger.h:291
NEVER_INLINE HOST int32_t ml_reg_predict_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
std::vector< std::vector< std::string > > cat_feature_keys_
DEVICE int64_t size() const
DEVICE int64_t size() const
NEVER_INLINE HOST int32_t r2_score__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
ZStdNormalizationSummaryStats< T > z_std_normalize_data_with_summary_stats(const std::vector< T * > &input_data, const int64_t num_rows)
void enable_output_allocations()
Definition: heavydbTypes.h:381
Column< T > create_wrapper_col(std::vector< T > &col_vec)
NEVER_INLINE HOST double get_column_mean(const T *data, const int64_t num_rows)
const std::vector< std::vector< std::string > > & getCatFeatureKeys() const