OmniSciDB  f17484ade4
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MLTableFunctions.hpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc., Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #ifndef __CUDACC__
20 
24 
27 
28 #ifdef HAVE_ONEDAL
30 #endif
31 
32 #ifdef HAVE_MLPACK
34 #endif
35 
36 #include <tbb/parallel_for.h>
37 #include <tbb/task_arena.h>
38 
39 using namespace TableFunctions_Namespace;
40 
41 template <typename T>
42 std::vector<const T*> pluck_ptrs(const std::vector<std::vector<T>>& data,
43  const int64_t start_idx,
44  const int64_t end_idx) {
45  std::vector<const T*> raw_ptrs;
46  CHECK_GE(start_idx, 0L);
47  CHECK_GT(end_idx, start_idx);
48  CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
49  for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
50  raw_ptrs.emplace_back(data[col_idx].data());
51  }
52  return raw_ptrs;
53 }
54 
55 template <typename T>
56 std::vector<const T*> pluck_ptrs(const std::vector<T*>& data,
57  const int64_t start_idx,
58  const int64_t end_idx) {
59  std::vector<const T*> raw_ptrs;
60  CHECK_GE(start_idx, 0L);
61  CHECK_GT(end_idx, start_idx);
62  CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
63  for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
64  raw_ptrs.emplace_back(data[col_idx]);
65  }
66  return raw_ptrs;
67 }
68 
69 // clang-format off
70 /*
71  UDTF: supported_ml_frameworks__cpu_(TableFunctionManager) ->
72  Column<TextEncodingDict> ml_framework | input_id=args<>, Column<bool> is_available, Column<bool> is_default
73 */
74 // clang-format on
75 
78  Column<TextEncodingDict>& output_ml_frameworks,
79  Column<bool>& output_availability,
80  Column<bool>& output_default);
82 void check_model_params(const std::shared_ptr<AbstractMLModel>& model,
83  const int64_t num_cat_features,
84  const int64_t num_numeric_features);
85 
86 // clang-format off
87 /*
88  UDTF: kmeans__cpu_template(TableFunctionManager,
89  Cursor<Column<K> input_ids, ColumnList<T> input_features> data,
90  int32_t num_clusters | require="num_clusters > 0" | require="num_clusters <= input_ids.size()",
91  int32_t num_iterations | require="num_iterations > 0" | default=10,
92  TextEncodingNone init_type | default="DEFAULT",
93  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
94  Column<K> id | input_id=args<0>,
95  Column<int32_t> cluster_id,
96  K=[int64_t, TextEncodingDict], T=[double]
97 */
98 // clang-format on
99 
100 template <typename K, typename T>
101 NEVER_INLINE HOST int32_t
103  const Column<K>& input_ids,
104  const ColumnList<T>& input_features,
105  const int num_clusters,
106  const int num_iterations,
107  const TextEncodingNone& init_type_str,
108  const TextEncodingNone& preferred_ml_framework_str,
109  Column<K>& output_ids,
110  Column<int32_t>& output_clusters) {
111  mgr.set_output_row_size(input_ids.size());
112  output_ids = input_ids;
113  const auto kmeans_init_strategy = get_kmeans_init_type(init_type_str);
114  if (kmeans_init_strategy == KMeansInitStrategy::INVALID) {
115  return mgr.ERROR_MESSAGE("Invalid KMeans initializaiton strategy: " +
116  init_type_str.getString());
117  }
118 
119  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
120  if (preferred_ml_framework == MLFramework::INVALID) {
121  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
122  preferred_ml_framework_str.getString());
123  }
124 
125  try {
126  const auto denulled_data = denull_data(input_features);
127  const int64_t num_rows = denulled_data.masked_num_rows;
128  const bool data_is_masked =
129  denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
130  std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
131  int32_t* denulled_output =
132  data_is_masked ? denulled_output_allocation.data() : output_clusters.ptr_;
133 
134  // z_std_normalize_data can throw if std dev is 0
135  const auto normalized_data = z_std_normalize_data(denulled_data.data, num_rows);
136  const auto normalized_ptrs = pluck_ptrs(normalized_data, 0L, normalized_data.size());
137 
138  bool did_execute = false;
139 #ifdef HAVE_ONEDAL
140  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
141  preferred_ml_framework == MLFramework::DEFAULT)) {
142  onedal_kmeans_impl(normalized_ptrs,
143  denulled_output,
144  num_rows,
145  num_clusters,
146  num_iterations,
147  kmeans_init_strategy);
148  did_execute = true;
149  }
150 #endif
151 #ifdef HAVE_MLPACK
152  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
153  preferred_ml_framework == MLFramework::DEFAULT)) {
154  mlpack_kmeans_impl(normalized_ptrs,
155  denulled_output,
156  num_rows,
157  num_clusters,
158  num_iterations,
159  kmeans_init_strategy);
160  did_execute = true;
161  }
162 #endif
163  if (!did_execute) {
164  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
165  " ML library to support kmeans implementation.");
166  }
167 
168  if (data_is_masked) {
169  unmask_data(denulled_output,
170  denulled_data.reverse_index_map,
171  output_clusters.ptr_,
172  denulled_data.unmasked_num_rows,
173  inline_null_value<int32_t>());
174  }
175  } catch (std::runtime_error& e) {
176  return mgr.ERROR_MESSAGE(e.what());
177  }
178  return input_ids.size();
179 }
180 
181 // clang-format off
182 /*
183  UDTF: dbscan__cpu_template(TableFunctionManager,
184  Cursor<Column<K> input_ids, ColumnList<T> input_features> data,
185  double epsilon | require="epsilon > 0.0",
186  int32_t min_observations | require="min_observations > 0",
187  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
188  Column<K> id | input_id=args<0>, Column<int32_t> cluster_id,
189  K=[int64_t, TextEncodingDict], T=[double]
190  */
191 // clang-format on
192 
193 template <typename K, typename T>
194 NEVER_INLINE HOST int32_t
196  const Column<K>& input_ids,
197  const ColumnList<T>& input_features,
198  const double epsilon,
199  const int32_t min_observations,
200  const TextEncodingNone& preferred_ml_framework_str,
201  Column<K>& output_ids,
202  Column<int32_t>& output_clusters) {
203  mgr.set_output_row_size(input_ids.size());
204  output_ids = input_ids;
205 
206  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
207  if (preferred_ml_framework == MLFramework::INVALID) {
208  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
209  preferred_ml_framework_str.getString());
210  }
211 
212  try {
213  const auto denulled_data = denull_data(input_features);
214  const int64_t num_rows = denulled_data.masked_num_rows;
215  const bool data_is_masked =
216  denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
217  std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
218  int32_t* denulled_output =
219  data_is_masked ? denulled_output_allocation.data() : output_clusters.ptr_;
220 
221  // z_std_normalize_data can throw if std dev is 0
222  const auto normalized_data = z_std_normalize_data(denulled_data.data, num_rows);
223  const auto normalized_ptrs = pluck_ptrs(normalized_data, 0L, normalized_data.size());
224 
225  bool did_execute = false;
226 #ifdef HAVE_ONEDAL
227  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
228  preferred_ml_framework == MLFramework::DEFAULT)) {
229  onedal_dbscan_impl(
230  normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
231  did_execute = true;
232  }
233 #endif
234 #ifdef HAVE_MLPACK
235  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
236  preferred_ml_framework == MLFramework::DEFAULT)) {
237  mlpack_dbscan_impl(
238  normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
239  did_execute = true;
240  }
241 #endif
242  if (!did_execute) {
243  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
244  " ML library to support dbscan implementation.");
245  }
246 
247  if (data_is_masked) {
248  unmask_data(denulled_output,
249  denulled_data.reverse_index_map,
250  output_clusters.ptr_,
251  denulled_data.unmasked_num_rows,
252  inline_null_value<int32_t>());
253  }
254  } catch (std::runtime_error& e) {
255  return mgr.ERROR_MESSAGE(e.what());
256  }
257  return input_ids.size();
258 }
259 
260 template <typename T>
261 NEVER_INLINE HOST int32_t
263  const TextEncodingNone& model_name,
264  const Column<T>& input_labels,
265  const ColumnList<T>& input_features,
266  const std::vector<std::vector<std::string>>& cat_feature_keys,
267  const TextEncodingNone& preferred_ml_framework_str,
268  const TextEncodingNone& model_metadata,
269  Column<TextEncodingDict>& output_model_name) {
270  if (input_labels.size() == 0) {
271  return mgr.ERROR_MESSAGE(
272  "No rows exist in training data. Training data must at least contain 1 row.");
273  }
274  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
275  if (preferred_ml_framework == MLFramework::INVALID) {
276  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
277  preferred_ml_framework_str.getString());
278  }
279  const auto denulled_data = denull_data(input_labels, input_features);
280  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
281  const auto features_ptrs =
282  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
283  const int64_t num_coefs = input_features.numCols() + 1;
284  mgr.set_output_row_size(num_coefs);
285  std::vector<int64_t> coef_idxs(num_coefs);
286  std::vector<double> coefs(num_coefs);
287  try {
288  bool did_execute = false;
289 #ifdef HAVE_ONEDAL
290  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
291  preferred_ml_framework == MLFramework::DEFAULT)) {
292  onedal_linear_reg_fit_impl(labels_ptrs[0],
293  features_ptrs,
294  coef_idxs.data(),
295  coefs.data(),
296  denulled_data.masked_num_rows);
297  did_execute = true;
298  }
299 #endif
300 #ifdef HAVE_MLPACK
301  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
302  preferred_ml_framework == MLFramework::DEFAULT)) {
303  mlpack_linear_reg_fit_impl(labels_ptrs[0],
304  features_ptrs,
305  coef_idxs.data(),
306  coefs.data(),
307  denulled_data.masked_num_rows);
308  did_execute = true;
309  }
310 #endif
311  if (!did_execute) {
312  return mgr.ERROR_MESSAGE(
313  "Cannot find " + preferred_ml_framework_str.getString() +
314  " ML library to support linear regression implementation.");
315  }
316  } catch (std::runtime_error& e) {
317  return mgr.ERROR_MESSAGE(e.what());
318  }
319  auto model =
320  std::make_shared<LinearRegressionModel>(coefs, model_metadata, cat_feature_keys);
321  g_ml_models.addModel(model_name, model);
322  const std::string model_name_str = model_name.getString();
323  const TextEncodingDict model_name_str_id =
324  output_model_name.getOrAddTransient(model_name);
325  output_model_name[0] = model_name_str_id;
326  return 1;
327 }
328 
329 // clang-format off
330 /*
331  UDTF: linear_reg_fit__cpu_template(TableFunctionManager,
332  TextEncodingNone model_name,
333  Cursor<Column<T> labels, ColumnList<T> features> data,
334  TextEncodingNone preferred_ml_framework | default="DEFAULT",
335  TextEncodingNone model_metadata | default="e30=") ->
336  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
337  */
338 // clang-format on
339 
340 // default value for model_metadata of "e30=" is base64 encoded "{}"
341 
342 template <typename T>
343 NEVER_INLINE HOST int32_t
345  const TextEncodingNone& model_name,
346  const Column<T>& input_labels,
347  const ColumnList<T>& input_features,
348  const TextEncodingNone& preferred_ml_framework_str,
349  const TextEncodingNone& model_metadata,
350  Column<TextEncodingDict>& output_model_name) {
351  std::vector<std::vector<std::string>> empty_cat_feature_keys;
352  return linear_reg_fit_impl(mgr,
353  model_name,
354  input_labels,
355  input_features,
356  empty_cat_feature_keys,
357  preferred_ml_framework_str,
358  model_metadata,
359  output_model_name);
360 }
361 
362 template <typename T>
364  public:
366  const ColumnList<T>& numeric_features,
367  const int32_t cat_top_k,
368  const float cat_min_fraction,
369  const bool cat_include_others)
370  : num_rows_(numeric_features.size()) {
372  one_hot_encoding_info(cat_top_k, cat_min_fraction, cat_include_others);
373  const size_t num_cat_features = static_cast<size_t>(cat_features.numCols());
374  std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
375  one_hot_encoding_infos;
376  for (size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
377  one_hot_encoding_infos.emplace_back(one_hot_encoding_info);
378  }
379  one_hot_encoded_cols_ =
380  TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
381  cat_features, one_hot_encoding_infos);
382  for (auto& one_hot_encoded_col : one_hot_encoded_cols_) {
383  cat_feature_keys_.emplace_back(one_hot_encoded_col.cat_features);
384  for (auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
385  col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
386  }
387  }
388  const int64_t num_numeric_features = numeric_features.numCols();
389  for (int64_t numeric_feature_idx = 0; numeric_feature_idx < num_numeric_features;
390  ++numeric_feature_idx) {
391  col_ptrs_.emplace_back(numeric_features.ptrs_[numeric_feature_idx]);
392  }
393  }
394 
396  const int32_t cat_top_k,
397  const float cat_min_fraction,
398  const bool cat_include_others)
399  : num_rows_(cat_features.size()) {
401  one_hot_encoding_info(cat_top_k, cat_min_fraction, cat_include_others);
402  const size_t num_cat_features = static_cast<size_t>(cat_features.numCols());
403  std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
404  one_hot_encoding_infos;
405  for (size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
406  one_hot_encoding_infos.emplace_back(one_hot_encoding_info);
407  }
408  one_hot_encoded_cols_ =
409  TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
410  cat_features, one_hot_encoding_infos);
411  for (auto& one_hot_encoded_col : one_hot_encoded_cols_) {
412  cat_feature_keys_.emplace_back(one_hot_encoded_col.cat_features);
413  for (auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
414  col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
415  }
416  }
417  }
418 
420  const ColumnList<TextEncodingDict>& cat_features,
421  const ColumnList<T>& numeric_features,
422  const std::vector<std::vector<std::string>>& cat_feature_keys)
423  : num_rows_(numeric_features.size()), cat_feature_keys_(cat_feature_keys) {
424  const size_t num_cat_features = static_cast<size_t>(cat_features.numCols());
425  if (num_cat_features != cat_feature_keys_.size()) {
426  throw std::runtime_error(
427  "Number of provided categorical features does not match number of categorical "
428  "features in the model.");
429  }
430  std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
431  one_hot_encoding_infos;
432  for (size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
433  one_hot_encoding_infos.emplace_back(cat_feature_keys_[cat_idx]);
434  }
435  one_hot_encoded_cols_ =
436  TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
437  cat_features, one_hot_encoding_infos);
438  for (auto& one_hot_encoded_col : one_hot_encoded_cols_) {
439  for (auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
440  col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
441  }
442  }
443  const int64_t num_numeric_features = numeric_features.numCols();
444  for (int64_t numeric_feature_idx = 0; numeric_feature_idx < num_numeric_features;
445  ++numeric_feature_idx) {
446  col_ptrs_.emplace_back(numeric_features.ptrs_[numeric_feature_idx]);
447  }
448  }
449 
451  const ColumnList<TextEncodingDict>& cat_features,
452  const std::vector<std::vector<std::string>>& cat_feature_keys)
453  : num_rows_(cat_features.size()), cat_feature_keys_(cat_feature_keys) {
454  const size_t num_cat_features = static_cast<size_t>(cat_features.numCols());
455  if (num_cat_features != cat_feature_keys_.size()) {
456  throw std::runtime_error(
457  "Number of provided categorical features does not match number of categorical "
458  "features in the model.");
459  }
460  std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodingInfo>
461  one_hot_encoding_infos;
462  for (size_t cat_idx = 0; cat_idx < num_cat_features; ++cat_idx) {
463  one_hot_encoding_infos.emplace_back(cat_feature_keys_[cat_idx]);
464  }
465  one_hot_encoded_cols_ =
466  TableFunctions_Namespace::OneHotEncoder_Namespace::one_hot_encode<T>(
467  cat_features, one_hot_encoding_infos);
468  for (auto& one_hot_encoded_col : one_hot_encoded_cols_) {
469  for (auto& one_hot_encoded_vec : one_hot_encoded_col.encoded_buffers) {
470  col_ptrs_.emplace_back(reinterpret_cast<int8_t*>(one_hot_encoded_vec.data()));
471  }
472  }
473  }
474 
476  return ColumnList<T>(
477  col_ptrs_.data(), static_cast<int64_t>(col_ptrs_.size()), num_rows_);
478  }
479 
480  const std::vector<std::vector<std::string>>& getCatFeatureKeys() const {
481  return cat_feature_keys_;
482  }
483 
484  private:
485  int64_t num_rows_;
486  std::vector<TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodedCol<T>>
488  std::vector<std::vector<std::string>> cat_feature_keys_;
489  std::vector<int8_t*> col_ptrs_;
490 };
491 
492 // clang-format off
493 /*
494  UDTF: linear_reg_fit__cpu_template(TableFunctionManager,
495  TextEncodingNone model_name,
496  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features,
497  ColumnList<T> numeric_features> data,
498  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
499  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
500  TextEncodingNone preferred_ml_framework | default="DEFAULT",
501  TextEncodingNone model_metadata | default="e30=") ->
502  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
503  */
504 // clang-format on
505 
506 // default value for model_metadata of "e30=" is base64 encoded "{}"
507 
508 template <typename T>
509 NEVER_INLINE HOST int32_t
511  const TextEncodingNone& model_name,
512  const Column<T>& input_labels,
513  const ColumnList<TextEncodingDict>& input_cat_features,
514  const ColumnList<T>& input_numeric_features,
515  const int32_t cat_top_k,
516  const float cat_min_fraction,
517  const TextEncodingNone& preferred_ml_framework_str,
518  const TextEncodingNone& model_metadata,
519  Column<TextEncodingDict>& output_model_name) {
520  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
521  input_numeric_features,
522  cat_top_k,
523  cat_min_fraction,
524  false /* cat_include_others */);
525 
526  return linear_reg_fit_impl(mgr,
527  model_name,
528  input_labels,
529  cat_features_builder.getFeatures(),
530  cat_features_builder.getCatFeatureKeys(),
531  preferred_ml_framework_str,
532  model_metadata,
533  output_model_name);
534 }
535 
536 // clang-format off
537 /*
538  UDTF: linear_reg_fit__cpu_template(TableFunctionManager,
539  TextEncodingNone model_name,
540  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features> data,
541  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
542  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
543  TextEncodingNone preferred_ml_framework | default="DEFAULT",
544  TextEncodingNone model_metadata | default="e30=") ->
545  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
546  */
547 // clang-format on
548 
549 // default value for model_metadata of "e30=" is base64 encoded "{}"
550 
551 template <typename T>
552 NEVER_INLINE HOST int32_t
554  const TextEncodingNone& model_name,
555  const Column<T>& input_labels,
556  const ColumnList<TextEncodingDict>& input_cat_features,
557  const int32_t cat_top_k,
558  const float cat_min_fraction,
559  const TextEncodingNone& preferred_ml_framework_str,
560  const TextEncodingNone& model_metadata,
561  Column<TextEncodingDict>& output_model_name) {
562  CategoricalFeaturesBuilder<T> cat_features_builder(
563  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
564 
565  return linear_reg_fit_impl(mgr,
566  model_name,
567  input_labels,
568  cat_features_builder.getFeatures(),
569  cat_features_builder.getCatFeatureKeys(),
570  preferred_ml_framework_str,
571  model_metadata,
572  output_model_name);
573 }
574 
575 template <typename T>
576 Column<T> create_wrapper_col(std::vector<T>& col_vec) {
577  Column<T> wrapper_col(col_vec.data(), static_cast<int64_t>(col_vec.size()));
578  return wrapper_col;
579 }
580 
581 // clang-format off
582 /*
583  UDTF: linear_reg_coefs__cpu_1(TableFunctionManager,
584  TextEncodingNone model_name) ->
585  Column<int64_t> coef_idx, Column<TextEncodingDict> feature | input_id=args<>,
586  Column<int64_t> sub_coef_idx, Column<TextEncodingDict> sub_feature | input_id=args<>,
587  Column<double> coef
588  */
589 // clang-format on
590 
593  const TextEncodingNone& model_name,
594  Column<int64_t>& output_coef_idx,
595  Column<TextEncodingDict>& output_feature,
596  Column<int64_t>& output_sub_coef_idx,
597  Column<TextEncodingDict>& output_sub_feature,
598  Column<double>& output_coef);
599 
600 // clang-format off
601 /*
602  UDTF: linear_reg_coefs__cpu_2(TableFunctionManager,
603  Cursor<Column<TextEncodingDict> name> model_name) ->
604  Column<int64_t> coef_idx, Column<TextEncodingDict> feature | input_id=args<>,
605  Column<int64_t> sub_coef_idx, Column<TextEncodingDict> sub_feature | input_id=args<>,
606  Column<double> coef
607  */
608 // clang-format on
609 
612  const Column<TextEncodingDict>& model_name,
613  Column<int64_t>& output_coef_idx,
614  Column<TextEncodingDict>& output_feature,
615  Column<int64_t>& output_sub_coef_idx,
616  Column<TextEncodingDict>& output_sub_feature,
617  Column<double>& output_coef);
618 
619 template <typename T>
620 NEVER_INLINE HOST int32_t
622  const TextEncodingNone& model_name,
623  const Column<T>& input_labels,
624  const ColumnList<T>& input_features,
625  const std::vector<std::vector<std::string>>& cat_feature_keys,
626  const int64_t max_tree_depth,
627  const int64_t min_observations_per_leaf_node,
628  const TextEncodingNone& preferred_ml_framework_str,
629  const TextEncodingNone& model_metadata,
630  Column<TextEncodingDict>& output_model_name) {
631  if (input_labels.size() == 0) {
632  return mgr.ERROR_MESSAGE(
633  "No rows exist in training data. Training data must at least contain 1 row.");
634  }
635  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
636  if (preferred_ml_framework == MLFramework::INVALID) {
637  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
638  preferred_ml_framework_str.getString());
639  }
640  if (preferred_ml_framework == MLFramework::MLPACK) {
641  return mgr.ERROR_MESSAGE(
642  "Only OneDAL framework supported for decision tree regression.");
643  }
644 #ifndef HAVE_ONEDAL
645  return mgr.ERROR_MESSAGE(
646  "Only OneDAL framework supported for decision tree regression.");
647 #endif
648 
649  const auto denulled_data = denull_data(input_labels, input_features);
650  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
651  const auto features_ptrs =
652  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
653  mgr.set_output_row_size(1);
654  try {
655  bool did_execute = false;
656 #ifdef HAVE_ONEDAL
657  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
658  preferred_ml_framework == MLFramework::DEFAULT)) {
659  onedal_decision_tree_reg_fit_impl<T>(model_name,
660  labels_ptrs[0],
661  features_ptrs,
662  model_metadata,
663  cat_feature_keys,
664  denulled_data.masked_num_rows,
665  max_tree_depth,
666  min_observations_per_leaf_node);
667  const TextEncodingDict model_name_str_id =
668  output_model_name.getOrAddTransient(model_name);
669  output_model_name[0] = model_name_str_id;
670  did_execute = true;
671  }
672 #endif
673  if (!did_execute) {
674  return mgr.ERROR_MESSAGE(
675  "Cannot find " + preferred_ml_framework_str.getString() +
676  " ML library to support decision tree regression implementation.");
677  }
678  } catch (std::runtime_error& e) {
679  return mgr.ERROR_MESSAGE(e.what());
680  }
681  return 1;
682 }
683 
684 // clang-format off
685 /*
686  UDTF: decision_tree_reg_fit__cpu_template(TableFunctionManager,
687  TextEncodingNone model_name,
688  Cursor<Column<T> labels, ColumnList<T> features> data,
689  int64_t max_tree_depth | require="max_tree_depth >= 0" | default=0,
690  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node >= 0" | default=5,
691  TextEncodingNone preferred_ml_framework | default="DEFAULT",
692  TextEncodingNone model_metadata | default="e30=") ->
693  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
694  */
695 // clang-format on
696 
697 // default value for model_metadata of "e30=" is base64 encoded "{}"
698 
699 template <typename T>
700 NEVER_INLINE HOST int32_t
702  const TextEncodingNone& model_name,
703  const Column<T>& input_labels,
704  const ColumnList<T>& input_features,
705  const int64_t max_tree_depth,
706  const int64_t min_observations_per_leaf_node,
707  const TextEncodingNone& preferred_ml_framework_str,
708  const TextEncodingNone& model_metadata,
709  Column<TextEncodingDict>& output_model_name) {
710  std::vector<std::vector<std::string>> empty_cat_feature_keys;
711  return decision_tree_reg_impl(mgr,
712  model_name,
713  input_labels,
714  input_features,
715  empty_cat_feature_keys,
716  max_tree_depth,
717  min_observations_per_leaf_node,
718  preferred_ml_framework_str,
719  model_metadata,
720  output_model_name);
721 }
722 
723 // clang-format off
724 /*
725  UDTF: decision_tree_reg_fit__cpu_template(TableFunctionManager,
726  TextEncodingNone model_name,
727  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features, ColumnList<T> numeric_features> data,
728  int64_t max_tree_depth | require="max_tree_depth >= 0" | default=0,
729  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node >= 0" | default=5,
730  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
731  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
732  TextEncodingNone preferred_ml_framework | default="DEFAULT",
733  TextEncodingNone model_metadata | default="e30=") ->
734  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
735  */
736 // clang-format on
737 
738 // default value for model_metadata of "e30=" is base64 encoded "{}"
739 
740 template <typename T>
743  const TextEncodingNone& model_name,
744  const Column<T>& input_labels,
745  const ColumnList<TextEncodingDict>& input_cat_features,
746  const ColumnList<T>& input_numeric_features,
747  const int64_t max_tree_depth,
748  const int64_t min_observations_per_leaf_node,
749  const int32_t cat_top_k,
750  const float cat_min_fraction,
751  const TextEncodingNone& preferred_ml_framework_str,
752  const TextEncodingNone& model_metadata,
753  Column<TextEncodingDict>& output_model_name) {
754  std::vector<std::vector<std::string>> empty_cat_feature_keys;
755  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
756  input_numeric_features,
757  cat_top_k,
758  cat_min_fraction,
759  false /* cat_include_others */);
760  return decision_tree_reg_impl(mgr,
761  model_name,
762  input_labels,
763  cat_features_builder.getFeatures(),
764  cat_features_builder.getCatFeatureKeys(),
765  max_tree_depth,
766  min_observations_per_leaf_node,
767  preferred_ml_framework_str,
768  model_metadata,
769  output_model_name);
770 }
771 
772 // clang-format off
773 /*
774  UDTF: decision_tree_reg_fit__cpu_template(TableFunctionManager,
775  TextEncodingNone model_name,
776  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features> data,
777  int64_t max_tree_depth | require="max_tree_depth >= 0" | default=0,
778  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node >= 0" | default=5,
779  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
780  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
781  TextEncodingNone preferred_ml_framework | default="DEFAULT",
782  TextEncodingNone model_metadata | default="e30=") ->
783  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
784  */
785 // clang-format on
786 
787 // default value for model_metadata of "e30=" is base64 encoded "{}"
788 
789 template <typename T>
792  const TextEncodingNone& model_name,
793  const Column<T>& input_labels,
794  const ColumnList<TextEncodingDict>& input_cat_features,
795  const int64_t max_tree_depth,
796  const int64_t min_observations_per_leaf_node,
797  const int32_t cat_top_k,
798  const float cat_min_fraction,
799  const TextEncodingNone& preferred_ml_framework_str,
800  const TextEncodingNone& model_metadata,
801  Column<TextEncodingDict>& output_model_name) {
802  std::vector<std::vector<std::string>> empty_cat_feature_keys;
803  CategoricalFeaturesBuilder<T> cat_features_builder(
804  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
805  return decision_tree_reg_impl(mgr,
806  model_name,
807  input_labels,
808  cat_features_builder.getFeatures(),
809  cat_features_builder.getCatFeatureKeys(),
810  max_tree_depth,
811  min_observations_per_leaf_node,
812  preferred_ml_framework_str,
813  model_metadata,
814  output_model_name);
815 }
816 
817 template <typename T>
818 NEVER_INLINE HOST int32_t
820  const TextEncodingNone& model_name,
821  const Column<T>& input_labels,
822  const ColumnList<T>& input_features,
823  const std::vector<std::vector<std::string>>& cat_feature_keys,
824  const int64_t max_iterations,
825  const int64_t max_tree_depth,
826  const double shrinkage,
827  const double min_split_loss,
828  const double lambda,
829  const double obs_per_tree_fraction,
830  const int64_t features_per_node,
831  const int64_t min_observations_per_leaf_node,
832  const int64_t max_bins,
833  const int64_t min_bin_size,
834  const TextEncodingNone& preferred_ml_framework_str,
835  const TextEncodingNone& model_metadata,
836  Column<TextEncodingDict>& output_model_name) {
837  if (input_labels.size() == 0) {
838  return mgr.ERROR_MESSAGE(
839  "No rows exist in training data. Training data must at least contain 1 row.");
840  }
841  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
842  if (preferred_ml_framework == MLFramework::INVALID) {
843  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
844  preferred_ml_framework_str.getString());
845  }
846  if (preferred_ml_framework == MLFramework::MLPACK) {
847  return mgr.ERROR_MESSAGE("Only OneDAL framework supported for GBT regression.");
848  }
849 #ifndef HAVE_ONEDAL
850  return mgr.ERROR_MESSAGE("Only OneDAL framework supported for GBT regression.");
851 #endif
852 
853  const auto denulled_data = denull_data(input_labels, input_features);
854  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
855  const auto features_ptrs =
856  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
857  mgr.set_output_row_size(1);
858  try {
859  bool did_execute = false;
860 #ifdef HAVE_ONEDAL
861  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
862  preferred_ml_framework == MLFramework::DEFAULT)) {
863  onedal_gbt_reg_fit_impl<T>(model_name,
864  labels_ptrs[0],
865  features_ptrs,
866  model_metadata,
867  cat_feature_keys,
868  denulled_data.masked_num_rows,
869  max_iterations,
870  max_tree_depth,
871  shrinkage,
872  min_split_loss,
873  lambda,
874  obs_per_tree_fraction,
875  features_per_node,
876  min_observations_per_leaf_node,
877  max_bins,
878  min_bin_size);
879  const TextEncodingDict model_name_str_id =
880  output_model_name.getOrAddTransient(model_name);
881  output_model_name[0] = model_name_str_id;
882  did_execute = true;
883  }
884 #endif
885  if (!did_execute) {
886  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
887  " ML library to support GBT regression implementation.");
888  }
889  } catch (std::runtime_error& e) {
890  return mgr.ERROR_MESSAGE(e.what());
891  }
892  return 1;
893 }
894 
895 // clang-format off
896 /*
897  UDTF: gbt_reg_fit__cpu_template(TableFunctionManager,
898  TextEncodingNone model_name,
899  Cursor<Column<T> labels, ColumnList<T> features> data,
900  int64_t max_iterations | require="max_iterations > 0" | default=50,
901  int64_t max_tree_depth | require="max_tree_depth > 0" | default=6,
902  double shrinkage | require="shrinkage > 0.0" | require="shrinkage <= 1.0" | default=0.3,
903  double min_split_loss | require="min_split_loss >= 0.0" | default=0.0,
904  double lambda | require="lambda >= 0.0" | default=1.0,
905  double obs_per_tree_fraction | require="obs_per_tree_fraction > 0.0" | require="obs_per_tree_fraction <= 1.0" | default=1.0,
906  int64_t features_per_node | require="features_per_node >= 0" | default=0,
907  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node > 0" | default=5,
908  int64_t max_bins | require="max_bins > 0" | default=256,
909  int64_t min_bin_size | require="min_bin_size >= 0" | default=5,
910  TextEncodingNone preferred_ml_framework | default="DEFAULT",
911  TextEncodingNone model_metadata | default="e30=") ->
912  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
913  */
914 // clang-format on
915 
916 template <typename T>
917 NEVER_INLINE HOST int32_t
919  const TextEncodingNone& model_name,
920  const Column<T>& input_labels,
921  const ColumnList<T>& input_features,
922  const int64_t max_iterations,
923  const int64_t max_tree_depth,
924  const double shrinkage,
925  const double min_split_loss,
926  const double lambda,
927  const double obs_per_tree_fraction,
928  const int64_t features_per_node,
929  const int64_t min_observations_per_leaf_node,
930  const int64_t max_bins,
931  const int64_t min_bin_size,
932  const TextEncodingNone& preferred_ml_framework_str,
933  const TextEncodingNone& model_metadata,
934  Column<TextEncodingDict>& output_model_name) {
935  std::vector<std::vector<std::string>> empty_cat_feature_keys;
936  return gbt_reg_fit_impl(mgr,
937  model_name,
938  input_labels,
939  input_features,
940  empty_cat_feature_keys,
941  max_iterations,
942  max_tree_depth,
943  shrinkage,
944  min_split_loss,
945  lambda,
946  obs_per_tree_fraction,
947  features_per_node,
948  min_observations_per_leaf_node,
949  max_bins,
950  min_bin_size,
951  preferred_ml_framework_str,
952  model_metadata,
953  output_model_name);
954 }
955 
956 // clang-format off
957 /*
958  UDTF: gbt_reg_fit__cpu_template(TableFunctionManager,
959  TextEncodingNone model_name,
960  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features, ColumnList<T> numeric_features> data,
961  int64_t max_iterations | require="max_iterations > 0" | default=50,
962  int64_t max_tree_depth | require="max_tree_depth > 0" | default=6,
963  double shrinkage | require="shrinkage > 0.0" | require="shrinkage <= 1.0" | default=0.3,
964  double min_split_loss | require="min_split_loss >= 0.0" | default=0.0,
965  double lambda | require="lambda >= 0.0" | default=1.0,
966  double obs_per_tree_fraction | require="obs_per_tree_fraction > 0.0" | require="obs_per_tree_fraction <= 1.0" | default=1.0,
967  int64_t features_per_node | require="features_per_node >= 0" | default=0,
968  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node > 0" | default=5,
969  int64_t max_bins | require="max_bins > 0" | default=256,
970  int64_t min_bin_size | require="min_bin_size >= 0" | default=5,
971  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
972  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
973  TextEncodingNone preferred_ml_framework | default="DEFAULT",
974  TextEncodingNone model_metadata | default="e30=") ->
975  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
976  */
977 // clang-format on
978 
979 // default value for model_metadata of "e30=" is base64 encoded "{}"
980 
981 template <typename T>
982 NEVER_INLINE HOST int32_t
984  const TextEncodingNone& model_name,
985  const Column<T>& input_labels,
986  const ColumnList<TextEncodingDict>& input_cat_features,
987  const ColumnList<T>& input_numeric_features,
988  const int64_t max_iterations,
989  const int64_t max_tree_depth,
990  const double shrinkage,
991  const double min_split_loss,
992  const double lambda,
993  const double obs_per_tree_fraction,
994  const int64_t features_per_node,
995  const int64_t min_observations_per_leaf_node,
996  const int64_t max_bins,
997  const int64_t min_bin_size,
998  const int32_t cat_top_k,
999  const float cat_min_fraction,
1000  const TextEncodingNone& preferred_ml_framework_str,
1001  const TextEncodingNone& model_metadata,
1002  Column<TextEncodingDict>& output_model_name) {
1003  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
1004  input_numeric_features,
1005  cat_top_k,
1006  cat_min_fraction,
1007  false /* cat_include_others */);
1008  return gbt_reg_fit_impl(mgr,
1009  model_name,
1010  input_labels,
1011  cat_features_builder.getFeatures(),
1012  cat_features_builder.getCatFeatureKeys(),
1013  max_iterations,
1014  max_tree_depth,
1015  shrinkage,
1016  min_split_loss,
1017  lambda,
1018  obs_per_tree_fraction,
1019  features_per_node,
1020  min_observations_per_leaf_node,
1021  max_bins,
1022  min_bin_size,
1023  preferred_ml_framework_str,
1024  model_metadata,
1025  output_model_name);
1026 }
1027 
1028 // clang-format off
1029 /*
1030  UDTF: gbt_reg_fit__cpu_template(TableFunctionManager,
1031  TextEncodingNone model_name,
1032  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features> data,
1033  int64_t max_iterations | require="max_iterations > 0" | default=50,
1034  int64_t max_tree_depth | require="max_tree_depth > 0" | default=6,
1035  double shrinkage | require="shrinkage > 0.0" | require="shrinkage <= 1.0" | default=0.3,
1036  double min_split_loss | require="min_split_loss >= 0.0" | default=0.0,
1037  double lambda | require="lambda >= 0.0" | default=1.0,
1038  double obs_per_tree_fraction | require="obs_per_tree_fraction > 0.0" | require="obs_per_tree_fraction <= 1.0" | default=1.0,
1039  int64_t features_per_node | require="features_per_node >= 0" | default=0,
1040  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node > 0" | default=5,
1041  int64_t max_bins | require="max_bins > 0" | default=256,
1042  int64_t min_bin_size | require="min_bin_size >= 0" | default=5,
1043  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
1044  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
1045  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1046  TextEncodingNone model_metadata | default="e30=") ->
1047  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
1048  */
1049 // clang-format on
1050 
1051 // default value for model_metadata of "e30=" is base64 encoded "{}"
1052 
1053 template <typename T>
1054 NEVER_INLINE HOST int32_t
1056  const TextEncodingNone& model_name,
1057  const Column<T>& input_labels,
1058  const ColumnList<TextEncodingDict>& input_cat_features,
1059  const int64_t max_iterations,
1060  const int64_t max_tree_depth,
1061  const double shrinkage,
1062  const double min_split_loss,
1063  const double lambda,
1064  const double obs_per_tree_fraction,
1065  const int64_t features_per_node,
1066  const int64_t min_observations_per_leaf_node,
1067  const int64_t max_bins,
1068  const int64_t min_bin_size,
1069  const int32_t cat_top_k,
1070  const float cat_min_fraction,
1071  const TextEncodingNone& preferred_ml_framework_str,
1072  const TextEncodingNone& model_metadata,
1073  Column<TextEncodingDict>& output_model_name) {
1074  CategoricalFeaturesBuilder<T> cat_features_builder(
1075  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
1076  return gbt_reg_fit_impl(mgr,
1077  model_name,
1078  input_labels,
1079  cat_features_builder.getFeatures(),
1080  cat_features_builder.getCatFeatureKeys(),
1081  max_iterations,
1082  max_tree_depth,
1083  shrinkage,
1084  min_split_loss,
1085  lambda,
1086  obs_per_tree_fraction,
1087  features_per_node,
1088  min_observations_per_leaf_node,
1089  max_bins,
1090  min_bin_size,
1091  preferred_ml_framework_str,
1092  model_metadata,
1093  output_model_name);
1094 }
1095 
1096 template <typename T>
1097 NEVER_INLINE HOST int32_t
1099  const TextEncodingNone& model_name,
1100  const Column<T>& input_labels,
1101  const ColumnList<T>& input_features,
1102  const std::vector<std::vector<std::string>>& cat_feature_keys,
1103  const int64_t num_trees,
1104  const double obs_per_tree_fraction,
1105  const int64_t max_tree_depth,
1106  const int64_t features_per_node,
1107  const double impurity_threshold,
1108  const bool bootstrap,
1109  const int64_t min_obs_per_leaf_node,
1110  const int64_t min_obs_per_split_node,
1111  const double min_weight_fraction_in_leaf_node,
1112  const double min_impurity_decrease_in_split_node,
1113  const int64_t max_leaf_nodes,
1114  const bool use_histogram,
1115  const TextEncodingNone& var_importance_metric_str,
1116  const TextEncodingNone& preferred_ml_framework_str,
1117  const TextEncodingNone& model_metadata,
1118  Column<TextEncodingDict>& output_model_name) {
1119  if (input_labels.size() == 0) {
1120  return mgr.ERROR_MESSAGE(
1121  "No rows exist in training data. Training data must at least contain 1 row.");
1122  }
1123  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
1124  if (preferred_ml_framework == MLFramework::INVALID) {
1125  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
1126  preferred_ml_framework_str.getString());
1127  }
1128  if (preferred_ml_framework == MLFramework::MLPACK) {
1129  return mgr.ERROR_MESSAGE(
1130  "Only OneDAL framework supported for random forest regression.");
1131  }
1132 #ifndef HAVE_ONEDAL
1133  return mgr.ERROR_MESSAGE(
1134  "Only OneDAL framework supported for random forest regression.");
1135 #endif
1136 
1137  const auto denulled_data = denull_data(input_labels, input_features);
1138  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
1139  const auto features_ptrs =
1140  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
1141  mgr.set_output_row_size(1);
1142  try {
1143  bool did_execute = false;
1144  const auto var_importance_metric =
1145  get_var_importance_metric(var_importance_metric_str);
1146  if (var_importance_metric == VarImportanceMetric::INVALID) {
1147  return mgr.ERROR_MESSAGE("Invalid variable importance metric: " +
1148  var_importance_metric_str.getString());
1149  }
1150 #ifdef HAVE_ONEDAL
1151  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1152  preferred_ml_framework == MLFramework::DEFAULT)) {
1153  if (use_histogram) {
1154  onedal_random_forest_reg_fit_impl<T, decision_forest::regression::training::hist>(
1155  model_name,
1156  labels_ptrs[0],
1157  features_ptrs,
1158  model_metadata,
1159  cat_feature_keys,
1160  denulled_data.masked_num_rows,
1161  num_trees,
1162  obs_per_tree_fraction,
1163  max_tree_depth,
1164  features_per_node,
1165  impurity_threshold,
1166  bootstrap,
1167  min_obs_per_leaf_node,
1168  min_obs_per_split_node,
1169  min_weight_fraction_in_leaf_node,
1170  min_impurity_decrease_in_split_node,
1171  max_leaf_nodes,
1172  var_importance_metric);
1173  } else {
1174  onedal_random_forest_reg_fit_impl<
1175  T,
1176  decision_forest::regression::training::defaultDense>(
1177  model_name,
1178  labels_ptrs[0],
1179  features_ptrs,
1180  model_metadata,
1181  cat_feature_keys,
1182  denulled_data.masked_num_rows,
1183  num_trees,
1184  obs_per_tree_fraction,
1185  max_tree_depth,
1186  features_per_node,
1187  impurity_threshold,
1188  bootstrap,
1189  min_obs_per_leaf_node,
1190  min_obs_per_split_node,
1191  min_weight_fraction_in_leaf_node,
1192  min_impurity_decrease_in_split_node,
1193  max_leaf_nodes,
1194  var_importance_metric);
1195  }
1196  const TextEncodingDict model_name_str_id =
1197  output_model_name.getOrAddTransient(model_name);
1198  output_model_name[0] = model_name_str_id;
1199  did_execute = true;
1200  }
1201 #endif
1202  if (!did_execute) {
1203  return mgr.ERROR_MESSAGE(
1204  "Cannot find " + preferred_ml_framework_str.getString() +
1205  " ML library to support random forest regression implementation.");
1206  }
1207  } catch (std::runtime_error& e) {
1208  return mgr.ERROR_MESSAGE(e.what());
1209  }
1210  return 1;
1211 }
1212 
1213 // clang-format off
1214 /*
1215  UDTF: random_forest_reg_fit__cpu_template(TableFunctionManager,
1216  TextEncodingNone model_name,
1217  Cursor<Column<T> labels, ColumnList<T> features> data,
1218  int64_t num_trees | require="num_trees > 0" | default=10,
1219  double obs_per_tree_fraction | require="obs_per_tree_fraction > 0.0" | require="obs_per_tree_fraction <= 1.0" | default=1.0,
1220  int64_t max_tree_depth | require="max_tree_depth >= 0" | default=0,
1221  int64_t features_per_node | require="features_per_node >= 0" | default=0,
1222  double impurity_threshold | require="impurity_threshold >= 0.0" | default=0.0,
1223  bool bootstrap | default=true,
1224  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node > 0" | default=5,
1225  int64_t min_obs_per_split_node | require="min_obs_per_split_node > 0" | default=2,
1226  double min_weight_fraction_in_leaf_node | require="min_weight_fraction_in_leaf_node >= 0.0" | default=0.0,
1227  double min_impurity_decrease_in_split_node | require="min_impurity_decrease_in_split_node >= 0.0" | default=0.0,
1228  int64_t max_leaf_nodes | require="max_leaf_nodes >=0" | default=0,
1229  bool use_histogram | default=false,
1230  TextEncodingNone var_importance_metric | default="MDI",
1231  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1232  TextEncodingNone model_metadata | default="e30=") ->
1233  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
1234  */
1235 // clang-format on
1236 
1237 // default value for model_metadata of "e30=" is base64 encoded "{}"
1238 
1239 template <typename T>
1240 NEVER_INLINE HOST int32_t
1242  const TextEncodingNone& model_name,
1243  const Column<T>& input_labels,
1244  const ColumnList<T>& input_features,
1245  const int64_t num_trees,
1246  const double obs_per_tree_fraction,
1247  const int64_t max_tree_depth,
1248  const int64_t features_per_node,
1249  const double impurity_threshold,
1250  const bool bootstrap,
1251  const int64_t min_obs_per_leaf_node,
1252  const int64_t min_obs_per_split_node,
1253  const double min_weight_fraction_in_leaf_node,
1254  const double min_impurity_decrease_in_split_node,
1255  const int64_t max_leaf_nodes,
1256  const bool use_histogram,
1257  const TextEncodingNone& var_importance_metric_str,
1258  const TextEncodingNone& preferred_ml_framework_str,
1259  const TextEncodingNone& model_metadata,
1260  Column<TextEncodingDict>& output_model_name) {
1261  std::vector<std::vector<std::string>> empty_cat_feature_keys;
1262  return random_forest_reg_fit_impl(mgr,
1263  model_name,
1264  input_labels,
1265  input_features,
1266  empty_cat_feature_keys,
1267  num_trees,
1268  obs_per_tree_fraction,
1269  max_tree_depth,
1270  features_per_node,
1271  impurity_threshold,
1272  bootstrap,
1273  min_obs_per_leaf_node,
1274  min_obs_per_split_node,
1275  min_weight_fraction_in_leaf_node,
1276  min_impurity_decrease_in_split_node,
1277  max_leaf_nodes,
1278  use_histogram,
1279  var_importance_metric_str,
1280  preferred_ml_framework_str,
1281  model_metadata,
1282  output_model_name);
1283 }
1284 
1285 // clang-format off
1286 /*
1287  UDTF: random_forest_reg_fit__cpu_template(TableFunctionManager,
1288  TextEncodingNone model_name,
1289  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features, ColumnList<T> numeric_features> data,
1290  int64_t num_trees | require="num_trees > 0" | default=10,
1291  double obs_per_tree_fraction | require="obs_per_tree_fraction > 0.0" | require="obs_per_tree_fraction <= 1.0" | default=1.0,
1292  int64_t max_tree_depth | require="max_tree_depth >= 0" | default=0,
1293  int64_t features_per_node | require="features_per_node >= 0" | default=0,
1294  double impurity_threshold | require="impurity_threshold >= 0.0" | default=0.0,
1295  bool bootstrap | default=true,
1296  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node > 0" | default=5,
1297  int64_t min_obs_per_split_node | require="min_obs_per_split_node > 0" | default=2,
1298  double min_weight_fraction_in_leaf_node | require="min_weight_fraction_in_leaf_node >= 0.0" | default=0.0,
1299  double min_impurity_decrease_in_split_node | require="min_impurity_decrease_in_split_node >= 0.0" | default=0.0,
1300  int64_t max_leaf_nodes | require="max_leaf_nodes >=0" | default=0,
1301  bool use_histogram | default=false,
1302  TextEncodingNone var_importance_metric | default="MDI",
1303  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
1304  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
1305  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1306  TextEncodingNone model_metadata | default="e30=") ->
1307  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
1308  */
1309 // clang-format on
1310 
1311 // default value for model_metadata of "e30=" is base64 encoded "{}"
1312 
1313 template <typename T>
1315  TableFunctionManager& mgr,
1316  const TextEncodingNone& model_name,
1317  const Column<T>& input_labels,
1318  const ColumnList<TextEncodingDict>& input_cat_features,
1319  const ColumnList<T>& input_numeric_features,
1320  const int64_t num_trees,
1321  const double obs_per_tree_fraction,
1322  const int64_t max_tree_depth,
1323  const int64_t features_per_node,
1324  const double impurity_threshold,
1325  const bool bootstrap,
1326  const int64_t min_obs_per_leaf_node,
1327  const int64_t min_obs_per_split_node,
1328  const double min_weight_fraction_in_leaf_node,
1329  const double min_impurity_decrease_in_split_node,
1330  const int64_t max_leaf_nodes,
1331  const bool use_histogram,
1332  const TextEncodingNone& var_importance_metric_str,
1333  const int32_t cat_top_k,
1334  const float cat_min_fraction,
1335  const TextEncodingNone& preferred_ml_framework_str,
1336  const TextEncodingNone& model_metadata,
1337  Column<TextEncodingDict>& output_model_name) {
1338  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
1339  input_numeric_features,
1340  cat_top_k,
1341  cat_min_fraction,
1342  false /* cat_include_others */);
1343  return random_forest_reg_fit_impl(mgr,
1344  model_name,
1345  input_labels,
1346  cat_features_builder.getFeatures(),
1347  cat_features_builder.getCatFeatureKeys(),
1348  num_trees,
1349  obs_per_tree_fraction,
1350  max_tree_depth,
1351  features_per_node,
1352  impurity_threshold,
1353  bootstrap,
1354  min_obs_per_leaf_node,
1355  min_obs_per_split_node,
1356  min_weight_fraction_in_leaf_node,
1357  min_impurity_decrease_in_split_node,
1358  max_leaf_nodes,
1359  use_histogram,
1360  var_importance_metric_str,
1361  preferred_ml_framework_str,
1362  model_metadata,
1363  output_model_name);
1364 }
1365 
1366 // clang-format off
1367 /*
1368  UDTF: random_forest_reg_fit__cpu_template(TableFunctionManager,
1369  TextEncodingNone model_name,
1370  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features> data,
1371  int64_t num_trees | require="num_trees > 0" | default=10,
1372  double obs_per_tree_fraction | require="obs_per_tree_fraction > 0.0" | require="obs_per_tree_fraction <= 1.0" | default=1.0,
1373  int64_t max_tree_depth | require="max_tree_depth >= 0" | default=0,
1374  int64_t features_per_node | require="features_per_node >= 0" | default=0,
1375  double impurity_threshold | require="impurity_threshold >= 0.0" | default=0.0,
1376  bool bootstrap | default=true,
1377  int64_t min_obs_per_leaf_node | require="min_obs_per_leaf_node > 0" | default=5,
1378  int64_t min_obs_per_split_node | require="min_obs_per_split_node > 0" | default=2,
1379  double min_weight_fraction_in_leaf_node | require="min_weight_fraction_in_leaf_node >= 0.0" | default=0.0,
1380  double min_impurity_decrease_in_split_node | require="min_impurity_decrease_in_split_node >= 0.0" | default=0.0,
1381  int64_t max_leaf_nodes | require="max_leaf_nodes >=0" | default=0,
1382  bool use_histogram | default=false,
1383  TextEncodingNone var_importance_metric | default="MDI",
1384  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
1385  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
1386  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1387  TextEncodingNone model_metadata | default="e30=") ->
1388  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
1389  */
1390 // clang-format on
1391 
1392 // default value for model_metadata of "e30=" is base64 encoded "{}"
1393 
1394 template <typename T>
1396  TableFunctionManager& mgr,
1397  const TextEncodingNone& model_name,
1398  const Column<T>& input_labels,
1399  const ColumnList<TextEncodingDict>& input_cat_features,
1400  const int64_t num_trees,
1401  const double obs_per_tree_fraction,
1402  const int64_t max_tree_depth,
1403  const int64_t features_per_node,
1404  const double impurity_threshold,
1405  const bool bootstrap,
1406  const int64_t min_obs_per_leaf_node,
1407  const int64_t min_obs_per_split_node,
1408  const double min_weight_fraction_in_leaf_node,
1409  const double min_impurity_decrease_in_split_node,
1410  const int64_t max_leaf_nodes,
1411  const bool use_histogram,
1412  const TextEncodingNone& var_importance_metric_str,
1413  const int32_t cat_top_k,
1414  const float cat_min_fraction,
1415  const TextEncodingNone& preferred_ml_framework_str,
1416  const TextEncodingNone& model_metadata,
1417  Column<TextEncodingDict>& output_model_name) {
1418  CategoricalFeaturesBuilder<T> cat_features_builder(
1419  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
1420  return random_forest_reg_fit_impl(mgr,
1421  model_name,
1422  input_labels,
1423  cat_features_builder.getFeatures(),
1424  cat_features_builder.getCatFeatureKeys(),
1425  num_trees,
1426  obs_per_tree_fraction,
1427  max_tree_depth,
1428  features_per_node,
1429  impurity_threshold,
1430  bootstrap,
1431  min_obs_per_leaf_node,
1432  min_obs_per_split_node,
1433  min_weight_fraction_in_leaf_node,
1434  min_impurity_decrease_in_split_node,
1435  max_leaf_nodes,
1436  use_histogram,
1437  var_importance_metric_str,
1438  preferred_ml_framework_str,
1439  model_metadata,
1440  output_model_name);
1441 }
1442 
1443 template <typename T>
1444 NEVER_INLINE HOST int32_t
1446  const TextEncodingNone& model_name,
1447  const ColumnList<T>& input_features,
1448  const std::vector<std::vector<std::string>>& cat_feature_keys,
1449  const TextEncodingNone& preferred_ml_framework_str,
1450  const TextEncodingNone& model_metadata,
1451  Column<TextEncodingDict>& output_model_name) {
1452  if (input_features.size() == 0) {
1453  return mgr.ERROR_MESSAGE(
1454  "No rows exist in training data. Training data must at least contain 1 row.");
1455  }
1456  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
1457  if (preferred_ml_framework == MLFramework::INVALID) {
1458  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
1459  preferred_ml_framework_str.getString());
1460  }
1461  try {
1462  const auto denulled_data = denull_data(input_features);
1463  const int64_t num_rows = denulled_data.masked_num_rows;
1464  if (num_rows == 0) {
1465  return mgr.ERROR_MESSAGE(
1466  "No non-null rows exist in training data. Training data must at least contain "
1467  "1 "
1468  "non-null row.");
1469  }
1470  const auto features_ptrs =
1471  pluck_ptrs(denulled_data.data, 0L, input_features.numCols());
1472  // z_std_normalize_data_with_summary_stats can throw if std dev is 0
1473  const auto z_std_norm_summary_stats =
1474  z_std_normalize_data_with_summary_stats(denulled_data.data, num_rows);
1475  const auto normalized_ptrs =
1476  pluck_ptrs(z_std_norm_summary_stats.normalized_data,
1477  0L,
1478  z_std_norm_summary_stats.normalized_data.size());
1479  bool did_execute = false;
1480 #ifdef HAVE_ONEDAL
1481  if (preferred_ml_framework == MLFramework::ONEDAL ||
1482  preferred_ml_framework == MLFramework::DEFAULT) {
1483  const auto [eigenvectors, eigenvalues] =
1484  onedal_pca_impl(normalized_ptrs, denulled_data.masked_num_rows);
1485  auto model = std::make_shared<PcaModel>(z_std_norm_summary_stats.means,
1486  z_std_norm_summary_stats.std_devs,
1487  eigenvectors,
1488  eigenvalues,
1489  model_metadata,
1490  cat_feature_keys);
1491  g_ml_models.addModel(model_name, model);
1492  did_execute = true;
1493  }
1494 #endif
1495  if (!did_execute) {
1496  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
1497  " ML library to support PCA implementation.");
1498  }
1499  mgr.set_output_row_size(1);
1500  const TextEncodingDict model_name_str_id =
1501  output_model_name.getOrAddTransient(model_name);
1502  output_model_name[0] = model_name_str_id;
1503  return 1;
1504  } catch (std::runtime_error& e) {
1505  return mgr.ERROR_MESSAGE(e.what());
1506  }
1507 }
1508 
1509 // clang-format off
1510 /*
1511  UDTF: pca_fit__cpu_template(TableFunctionManager,
1512  TextEncodingNone model_name,
1513  Cursor<ColumnList<T> features> data,
1514  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1515  TextEncodingNone model_metadata | default="e30=") ->
1516  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
1517  */
1518 // clang-format on
1519 
1520 // default value for model_metadata of "e30=" is base64 encoded "{}"
1521 
1522 template <typename T>
1523 NEVER_INLINE HOST int32_t
1525  const TextEncodingNone& model_name,
1526  const ColumnList<T>& input_features,
1527  const TextEncodingNone& preferred_ml_framework_str,
1528  const TextEncodingNone& model_metadata,
1529  Column<TextEncodingDict>& output_model_name) {
1530  std::vector<std::vector<std::string>> empty_cat_feature_keys;
1531  return pca_fit_impl(mgr,
1532  model_name,
1533  input_features,
1534  empty_cat_feature_keys,
1535  preferred_ml_framework_str,
1536  model_metadata,
1537  output_model_name);
1538 }
1539 
1540 // clang-format off
1541 /*
1542  UDTF: pca_fit__cpu_template(TableFunctionManager,
1543  TextEncodingNone model_name,
1544  Cursor<ColumnList<TextEncodingDict> cat_features, ColumnList<T> features> data,
1545  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
1546  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
1547  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1548  TextEncodingNone model_metadata | default="e30=") ->
1549  Column<TextEncodingDict> model_name | input_id=args<>, T=[double]
1550  */
1551 // clang-format on
1552 
1553 // default value for model_metadata of "e30=" is base64 encoded "{}"
1554 
1555 template <typename T>
1556 NEVER_INLINE HOST int32_t
1558  const TextEncodingNone& model_name,
1559  const ColumnList<TextEncodingDict>& input_cat_features,
1560  const ColumnList<T>& input_numeric_features,
1561  const int32_t cat_top_k,
1562  const float cat_min_fraction,
1563  const TextEncodingNone& preferred_ml_framework_str,
1564  const TextEncodingNone& model_metadata,
1565  Column<TextEncodingDict>& output_model_name) {
1566  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
1567  input_numeric_features,
1568  cat_top_k,
1569  cat_min_fraction,
1570  false /* cat_include_others */);
1571  return pca_fit_impl(mgr,
1572  model_name,
1573  cat_features_builder.getFeatures(),
1574  cat_features_builder.getCatFeatureKeys(),
1575  preferred_ml_framework_str,
1576  model_metadata,
1577  output_model_name);
1578 }
1579 
1580 // clang-format off
1581 /*
1582  UDTF: pca_fit__cpu_1(TableFunctionManager,
1583  TextEncodingNone model_name,
1584  Cursor<ColumnList<TextEncodingDict> cat_features> data,
1585  int32_t cat_top_k | require="cat_top_k >= 1" | default=10,
1586  float cat_min_fraction | require="cat_min_fraction > 0.0" | require="cat_min_fraction <= 1.0" | default=0.01,
1587  TextEncodingNone preferred_ml_framework | default="DEFAULT",
1588  TextEncodingNone model_metadata | default="e30=") ->
1589  Column<TextEncodingDict> model_name | input_id=args<>
1590 */
1591 // clang-format on
1592 
1593 // default value for model_metadata of "e30=" is base64 encoded "{}"
1594 
1597  const TextEncodingNone& model_name,
1598  const ColumnList<TextEncodingDict>& input_cat_features,
1599  const int32_t cat_top_k,
1600  const float cat_min_fraction,
1601  const TextEncodingNone& preferred_ml_framework_str,
1602  const TextEncodingNone& model_metadata,
1603  Column<TextEncodingDict>& output_model_name);
1604 
// Runs a fitted regression model over `input_features` and writes one
// prediction per input row into `output_predictions`.
//
// Template parameters:
//   T - element type of the feature columns and of the prediction column.
//   K - element type of the id columns.
//
// Steps:
//   1. Resolve `preferred_ml_framework_str` with get_ml_framework(); an
//      INVALID result is reported through mgr.ERROR_MESSAGE().
//   2. Call denull_data() on the features. When it reports fewer masked rows
//      than unmasked rows, the backend writes into a temporary buffer that
//      unmask_data() expands into `output_predictions` at the end; otherwise
//      the backend writes straight into `output_predictions.ptr_`.
//   3. Dispatch on model->getModelType() and call the first backend that is
//      both compiled in (HAVE_ONEDAL / HAVE_MLPACK) and allowed by the
//      preferred framework (an explicit match or DEFAULT).
//   4. Assign `input_ids` to `output_ids`.
//
// Returns input_ids.size() on success, otherwise whatever
// mgr.ERROR_MESSAGE() returns (invalid framework, no usable backend, or a
// std::runtime_error raised while predicting).
1605 template <typename T, typename K>
1606 NEVER_INLINE HOST int32_t
1608  const std::shared_ptr<AbstractMLModel>& model,
1609  const Column<K>& input_ids,
1610  const ColumnList<T>& input_features,
1611  const TextEncodingNone& preferred_ml_framework_str,
1612  Column<K>& output_ids,
1613  Column<T>& output_predictions) {
1614  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
1615  if (preferred_ml_framework == MLFramework::INVALID) {
1616  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
1617  preferred_ml_framework_str.getString());
1618  }
1619  const auto denulled_data = denull_data(input_features);
1620  const int64_t num_rows = denulled_data.masked_num_rows;
1621  const bool data_is_masked =
1622  denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
      // The scratch buffer is only sized when rows were masked out; otherwise it
      // stays empty and the backend targets the output column directly.
1623  std::vector<T> denulled_output_allocation(data_is_masked ? num_rows : 0);
      // The output row count follows the id column, not the denulled row count.
1624  mgr.set_output_row_size(input_ids.size());
1625  T* denulled_output =
1626  data_is_masked ? denulled_output_allocation.data() : output_predictions.ptr_;
      // pluck_ptrs() CHECKs end_idx > start_idx, so a feature list with zero
      // columns trips that check here.
1627  const auto features_ptrs = pluck_ptrs(denulled_data.data, 0L, input_features.numCols());
1628 
1629  try {
      // Set by whichever backend actually produced predictions.
1630  bool did_execute = false;
1631  const auto model_type = model->getModelType();
1632  switch (model_type) {
      // Linear regression: oneDAL is tried first, then mlpack, depending on
      // which libraries were compiled in and on the preferred framework.
1633  case MLModelType::LINEAR_REG: {
1634  const auto linear_reg_model =
1635  std::dynamic_pointer_cast<LinearRegressionModel>(model);
1636  CHECK(linear_reg_model);
1637 #ifdef HAVE_ONEDAL
1638  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1639  preferred_ml_framework == MLFramework::DEFAULT)) {
1640  onedal_linear_reg_predict_impl(
1641  linear_reg_model, features_ptrs, denulled_output, num_rows);
1642  did_execute = true;
1643  }
1644 #endif
1645 #ifdef HAVE_MLPACK
1646  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
1647  preferred_ml_framework == MLFramework::DEFAULT)) {
1648  mlpack_linear_reg_predict_impl(
1649  linear_reg_model, features_ptrs, denulled_output, num_rows);
1650  did_execute = true;
1651  }
1652 #endif
1653  break;
1654  }
1656 #ifdef HAVE_ONEDAL
      // Decision tree regression: only a oneDAL backend is wired in here.
1657  const auto decision_tree_reg_model =
1658  std::dynamic_pointer_cast<DecisionTreeRegressionModel>(model);
1659  CHECK(decision_tree_reg_model);
1660  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1661  preferred_ml_framework == MLFramework::DEFAULT)) {
1662  onedal_decision_tree_reg_predict_impl(
1663  decision_tree_reg_model, features_ptrs, denulled_output, num_rows);
1664  did_execute = true;
1665  }
1666 #endif
1667  break;
1668  }
      // Gradient boosted trees regression: only a oneDAL backend is wired in here.
1669  case MLModelType::GBT_REG: {
1670 #ifdef HAVE_ONEDAL
1671  const auto gbt_reg_model = std::dynamic_pointer_cast<GbtRegressionModel>(model);
1672  CHECK(gbt_reg_model);
1673  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1674  preferred_ml_framework == MLFramework::DEFAULT)) {
1675  onedal_gbt_reg_predict_impl(
1676  gbt_reg_model, features_ptrs, denulled_output, num_rows);
1677  did_execute = true;
1678  }
1679 #endif
1680  break;
1681  }
1683 #ifdef HAVE_ONEDAL
      // Random forest regression: only a oneDAL backend is wired in here.
1684  const auto random_forest_reg_model =
1685  std::dynamic_pointer_cast<RandomForestRegressionModel>(model);
1686  CHECK(random_forest_reg_model);
1687  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1688  preferred_ml_framework == MLFramework::DEFAULT)) {
1689  onedal_random_forest_reg_predict_impl(
1690  random_forest_reg_model, features_ptrs, denulled_output, num_rows);
1691  did_execute = true;
1692  }
1693 #endif
1694  break;
1695  }
1696  default: {
1697  throw std::runtime_error("Unsupported model type");
1698  }
1699  }
      // No backend ran: the requested framework is either not compiled in or has
      // no branch for this model type.
1700  if (!did_execute) {
1701  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
1702  " ML library to support model implementation.");
1703  }
      // Also receives the exception thrown by the default case above.
1704  } catch (std::runtime_error& e) {
1705  const std::string error_str(e.what());
1706  return mgr.ERROR_MESSAGE(error_str);
1707  }
1708  output_ids = input_ids;
      // Expand the compacted predictions to unmasked_num_rows entries using the
      // reverse index map, with inline_null_value<T>() passed as the null value.
      // Presumably rows dropped by denull_data() receive that null value --
      // confirm against unmask_data().
1709  if (data_is_masked) {
1710  unmask_data(denulled_output,
1711  denulled_data.reverse_index_map,
1712  output_predictions.ptr_,
1713  denulled_data.unmasked_num_rows,
1714  inline_null_value<T>());
1715  }
1716  return input_ids.size();
1717 }
1718 
1719 // clang-format off
1720 /*
1721  UDTF: ml_reg_predict__cpu_template(TableFunctionManager,
1722  TextEncodingNone model_name,
1723  Cursor<Column<K> id, ColumnList<T> features> data,
1724  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
1725  Column<K> id | input_id=args<0>, Column<T> prediction,
1726  K=[int32_t, int64_t, TextEncodingDict], T=[double]
1727  */
1728 // clang-format on
1729 
// ml_reg_predict entry point for numeric-only features, with the model named
// by a TextEncodingNone argument.
//
// Fetches the model from g_ml_models, runs check_model_params() with zero
// categorical features and input_features.numCols() numeric features, then
// forwards all arguments to ml_reg_predict_impl().
//
// Returns the result of ml_reg_predict_impl(). A std::runtime_error raised
// inside the try block is converted into mgr.ERROR_MESSAGE(e.what()).
1730 template <typename T, typename K>
1731 NEVER_INLINE HOST int32_t
1733  const TextEncodingNone& model_name,
1734  const Column<K>& input_ids,
1735  const ColumnList<T>& input_features,
1736  const TextEncodingNone& preferred_ml_framework_str,
1737  Column<K>& output_ids,
1738  Column<T>& output_predictions) {
1739  try {
1740  const auto model = g_ml_models.getModel(model_name);
1741  check_model_params(model, 0, input_features.numCols());
1742  return ml_reg_predict_impl(mgr,
1743  model,
1744  input_ids,
1745  input_features,
1746  preferred_ml_framework_str,
1747  output_ids,
1748  output_predictions);
1749  } catch (std::runtime_error& e) {
1750  const std::string error_str(e.what());
1751  return mgr.ERROR_MESSAGE(error_str);
1752  }
1753 }
1754 
1755 // clang-format off
1756 /*
1757  UDTF: ml_reg_predict__cpu_template(TableFunctionManager,
1758  TextEncodingNone model_name,
1759  Cursor<Column<K> id, ColumnList<TextEncodingDict> cat_features, ColumnList<T> features> data,
1760  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
1761  Column<K> id | input_id=args<0>, Column<T> prediction,
1762  K=[int32_t, int64_t, TextEncodingDict], T=[double]
1763  */
1764 // clang-format on
1765 
// ml_reg_predict entry point for categorical plus numeric features, with the
// model named by a TextEncodingNone argument.
//
// Fetches the model from g_ml_models and builds a
// CategoricalFeaturesBuilder<T> from the categorical columns, the numeric
// columns and the category keys stored with the model
// (model->getCatFeatureKeys()). The builder's getFeatures() result is what
// gets passed to ml_reg_predict_impl() as the feature list.
//
// Returns the result of ml_reg_predict_impl(). A std::runtime_error raised
// inside the try block is converted into mgr.ERROR_MESSAGE(e.what()).
1766 template <typename T, typename K>
1767 NEVER_INLINE HOST int32_t
1769  const TextEncodingNone& model_name,
1770  const Column<K>& input_ids,
1771  const ColumnList<TextEncodingDict>& input_cat_features,
1772  const ColumnList<T>& input_numeric_features,
1773  const TextEncodingNone& preferred_ml_framework_str,
1774  Column<K>& output_ids,
1775  Column<T>& output_predictions) {
1776  try {
1777  const auto model = g_ml_models.getModel(model_name);
1779  model, input_cat_features.numCols(), input_numeric_features.numCols());
1780  CategoricalFeaturesBuilder<T> cat_features_builder(
1781  input_cat_features, input_numeric_features, model->getCatFeatureKeys());
1782  return ml_reg_predict_impl(mgr,
1783  model,
1784  input_ids,
1785  cat_features_builder.getFeatures(),
1786  preferred_ml_framework_str,
1787  output_ids,
1788  output_predictions);
1789  } catch (std::runtime_error& e) {
1790  const std::string error_str(e.what());
1791  return mgr.ERROR_MESSAGE(error_str);
1792  }
1793 }
1794 
1795 // clang-format off
1796 /*
1797  UDTF: ml_reg_predict__cpu_template(TableFunctionManager,
1798  TextEncodingNone model_name,
1799  Cursor<Column<K> id, ColumnList<TextEncodingDict> cat_features> data,
1800  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
1801  Column<K> id | input_id=args<0>, Column<T> prediction,
1802  K=[int32_t, int64_t, TextEncodingDict], T=[double]
1803  */
1804 // clang-format on
1805 
// ml_reg_predict entry point for categorical-only features, with the model
// named by a TextEncodingNone argument.
//
// Fetches the model from g_ml_models, runs check_model_params() with
// input_cat_features.numCols() categorical features and zero numeric
// features, and builds a CategoricalFeaturesBuilder<T> from the categorical
// columns and the category keys stored with the model. The builder's
// getFeatures() result is passed to ml_reg_predict_impl() as the feature list.
//
// Returns the result of ml_reg_predict_impl(). A std::runtime_error raised
// inside the try block is converted into mgr.ERROR_MESSAGE(e.what()).
1806 template <typename T, typename K>
1807 NEVER_INLINE HOST int32_t
1809  const TextEncodingNone& model_name,
1810  const Column<K>& input_ids,
1811  const ColumnList<TextEncodingDict>& input_cat_features,
1812  const TextEncodingNone& preferred_ml_framework_str,
1813  Column<K>& output_ids,
1814  Column<T>& output_predictions) {
1815  try {
1816  const auto model = g_ml_models.getModel(model_name);
1817  check_model_params(model, input_cat_features.numCols(), 0);
1818  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
1819  model->getCatFeatureKeys());
1820  return ml_reg_predict_impl(mgr,
1821  model,
1822  input_ids,
1823  cat_features_builder.getFeatures(),
1824  preferred_ml_framework_str,
1825  output_ids,
1826  output_predictions);
1827  } catch (std::runtime_error& e) {
1828  const std::string error_str(e.what());
1829  return mgr.ERROR_MESSAGE(error_str);
1830  }
1831 }
1832 
1833 // clang-format off
1834 /*
1835  UDTF: ml_reg_predict__cpu_template(TableFunctionManager,
1836  Cursor<Column<TextEncodingDict> name> model_name,
1837  Cursor<Column<K> id, ColumnList<T> features> data,
1838  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
1839  Column<K> id | input_id=args<0>, Column<T> prediction,
1840  K=[int64_t, TextEncodingDict], T=[double]
1841  */
1842 // clang-format on
1843 
// ml_reg_predict entry point for numeric-only features, with the model name
// supplied as a one-row dictionary-encoded column (a CURSOR argument).
//
// Rejects any model_name column whose size is not exactly 1, converts row 0
// to a TextEncodingNone, and delegates to the ml_reg_predict__cpu_template
// overload that takes the model name as TextEncodingNone.
//
// Returns the delegate's result, or the result of mgr.ERROR_MESSAGE() when
// the row-count check fails.
1844 template <typename T, typename K>
1845 NEVER_INLINE HOST int32_t
1847  const Column<TextEncodingDict>& model_name,
1848  const Column<K>& input_ids,
1849  const ColumnList<T>& input_features,
1850  const TextEncodingNone& preferred_ml_framework_str,
1851  Column<K>& output_ids,
1852  Column<T>& output_predictions) {
1853  if (model_name.size() != 1) {
1854  return mgr.ERROR_MESSAGE("Expected only one row in model CURSOR.");
1855  }
1856  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
1857  return ml_reg_predict__cpu_template(mgr,
1858  model_name_text_enc_none,
1859  input_ids,
1860  input_features,
1861  preferred_ml_framework_str,
1862  output_ids,
1863  output_predictions);
1864 }
1865 
1866 // clang-format off
1867 /*
1868  UDTF: ml_reg_predict__cpu_template(TableFunctionManager,
1869  Cursor<Column<TextEncodingDict> name> model_name,
1870  Cursor<Column<K> id, ColumnList<TextEncodingDict> cat_features, ColumnList<T> features> data,
1871  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
1872  Column<K> id | input_id=args<0>, Column<T> prediction,
1873  K=[int32_t, int64_t, TextEncodingDict], T=[double]
1874  */
1875 // clang-format on
1876 
// ml_reg_predict entry point for categorical plus numeric features, with the
// model name supplied as a one-row dictionary-encoded column (a CURSOR
// argument).
//
// Rejects any model_name column whose size is not exactly 1, converts row 0
// to a TextEncodingNone, and delegates to the ml_reg_predict__cpu_template
// overload that takes the model name as TextEncodingNone together with
// categorical and numeric feature lists.
//
// Returns the delegate's result, or the result of mgr.ERROR_MESSAGE() when
// the row-count check fails.
1877 template <typename T, typename K>
1878 NEVER_INLINE HOST int32_t
1880  const Column<TextEncodingDict>& model_name,
1881  const Column<K>& input_ids,
1882  const ColumnList<TextEncodingDict>& input_cat_features,
1883  const ColumnList<T>& input_numeric_features,
1884  const TextEncodingNone& preferred_ml_framework_str,
1885  Column<K>& output_ids,
1886  Column<T>& output_predictions) {
1887  if (model_name.size() != 1) {
1888  return mgr.ERROR_MESSAGE("Expected only one row in model CURSOR.");
1889  }
1890  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
1891  return ml_reg_predict__cpu_template(mgr,
1892  model_name_text_enc_none,
1893  input_ids,
1894  input_cat_features,
1895  input_numeric_features,
1896  preferred_ml_framework_str,
1897  output_ids,
1898  output_predictions);
1899 }
1900 
1901 // clang-format off
1902 /*
1903  UDTF: ml_reg_predict__cpu_template(TableFunctionManager,
1904  Cursor<Column<TextEncodingDict> name> model_name,
1905  Cursor<Column<K> id, ColumnList<TextEncodingDict> cat_features> data,
1906  TextEncodingNone preferred_ml_framework | default="DEFAULT") ->
1907  Column<K> id | input_id=args<0>, Column<T> prediction,
1908  K=[int32_t, int64_t, TextEncodingDict], T=[double]
1909  */
1910 // clang-format on
1911 
// ml_reg_predict entry point for categorical-only features, with the model
// name supplied as a one-row dictionary-encoded column (a CURSOR argument).
//
// Rejects any model_name column whose size is not exactly 1, converts row 0
// to a TextEncodingNone, and delegates to the ml_reg_predict__cpu_template
// overload that takes the model name as TextEncodingNone together with a
// categorical feature list.
//
// Returns the delegate's result, or the result of mgr.ERROR_MESSAGE() when
// the row-count check fails.
1912 template <typename T, typename K>
1913 NEVER_INLINE HOST int32_t
1915  const Column<TextEncodingDict>& model_name,
1916  const Column<K>& input_ids,
1917  const ColumnList<TextEncodingDict>& input_cat_features,
1918  const TextEncodingNone& preferred_ml_framework_str,
1919  Column<K>& output_ids,
1920  Column<T>& output_predictions) {
1921  if (model_name.size() != 1) {
1922  return mgr.ERROR_MESSAGE("Expected only one row in model CURSOR.");
1923  }
1924  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
1925  return ml_reg_predict__cpu_template(mgr,
1926  model_name_text_enc_none,
1927  input_ids,
1928  input_cat_features,
1929  preferred_ml_framework_str,
1930  output_ids,
1931  output_predictions);
1932 }
1933 
// Computes the coefficient of determination (R^2) of `model` on an
// evaluation set and writes it to output_r2[0].
//
// Predictions for `input_features` are produced by ml_reg_predict_impl() with
// the "DEFAULT" framework into function-local buffers and then compared
// against `input_labels`:
//   r2 = 1 - sum((label - prediction)^2) / sum((label - mean(label))^2)
// Rows whose prediction equals inline_null_value<T>() are left out of both
// sums. When the denominator is exactly 0 the result is 1.0.
//
// Returns 1 on success, the negative value propagated from
// ml_reg_predict_impl() when prediction fails, or the result of
// mgr.ERROR_MESSAGE() for empty input or a caught std::runtime_error.
1934 template <typename T>
1936  const std::shared_ptr<AbstractMLModel>& model,
1937  const Column<T>& input_labels,
1938  const ColumnList<T>& input_features,
1939  Column<double>& output_r2) {
1940  const int64_t num_rows = input_labels.size();
1941  if (num_rows == 0) {
1942  return mgr.ERROR_MESSAGE(
1943  "No rows exist in evaluation data. Evaluation data must at least contain 1 row.");
1944  }
      // Scratch columns backed by local vectors: ml_reg_predict_impl() requires
      // id and prediction columns, but only the predictions are read afterwards.
      // The id vectors are left value-initialized.
1945  std::vector<T> output_predictions_vec(num_rows);
1946  Column<T> output_predictions(output_predictions_vec);
1947  std::vector<int64_t> input_ids_vec(num_rows);
1948  std::vector<int64_t> output_ids_vec(num_rows);
1949  Column<int64_t> input_ids(input_ids_vec);
1950  Column<int64_t> output_ids(output_ids_vec);
1952  TextEncodingNone ml_framework_encoding_none("DEFAULT");
1953 
1954  try {
1955  auto ret = ml_reg_predict_impl(mgr,
1956  model,
1957  input_ids,
1958  input_features,
1959  ml_framework_encoding_none,
1960  output_ids,
1961  output_predictions);
1962 
1963  if (ret < 0) {
1964  // A return of less than 0 symbolizes an error
1965  return ret;
1966  }
1967  } catch (std::runtime_error& e) {
1969  return mgr.ERROR_MESSAGE(e.what());
1970  }
1971 
      // The table function itself emits exactly one row (the score).
1973  mgr.set_output_row_size(1);
1974 
      // NOTE(review): the mean is taken over the whole label column, while rows
      // with a null prediction are excluded from the sums below -- confirm this
      // is the intended definition.
1975  const auto labels_mean = get_column_mean(input_labels);
      // NOTE(review): std::thread::hardware_concurrency() is allowed to return
      // 0; num_threads would then be 0 and the accumulator vectors below would
      // be empty -- confirm this cannot happen on supported platforms.
1976  const size_t max_thread_count = std::thread::hardware_concurrency();
1977  const size_t max_inputs_per_thread = 20000;
      // ceil(num_rows / max_inputs_per_thread), capped at the hardware thread count.
1978  const size_t num_threads = std::min(
1979  max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
1980 
      // One partial-sum slot per thread index of the arena created below.
1981  std::vector<double> local_sum_squared_regressions(num_threads, 0.0);
1982  std::vector<double> local_sum_squares(num_threads, 0.0);
1983 
1984  tbb::task_arena limited_arena(num_threads);
1985 
      // Each invocation of the range body sums its sub-range locally and then
      // adds the totals to the slot of the thread that ran it.
1986  limited_arena.execute([&] {
1988  tbb::blocked_range<int64_t>(0, num_rows),
1989  [&](const tbb::blocked_range<int64_t>& r) {
1990  const int64_t start_idx = r.begin();
1991  const int64_t end_idx = r.end();
      // local_sum_squared_regression: squared residuals (label - prediction).
      // local_sum_square: squared deviations of the label from the label mean.
1992  double local_sum_squared_regression{0.0};
1993  double local_sum_square{0.0};
1994  for (int64_t row_idx = start_idx; row_idx < end_idx; ++row_idx) {
1995  if (output_predictions[row_idx] != inline_null_value<T>()) {
1996  local_sum_squared_regression +=
1997  (input_labels[row_idx] - output_predictions[row_idx]) *
1998  (input_labels[row_idx] - output_predictions[row_idx]);
1999  local_sum_square += (input_labels[row_idx] - labels_mean) *
2000  (input_labels[row_idx] - labels_mean);
2001  }
2002  }
      // NOTE(review): indexing assumes current_thread_index() is always below
      // num_threads inside limited_arena -- confirm against the TBB docs.
2003  const size_t thread_idx = tbb::this_task_arena::current_thread_index();
2004  local_sum_squared_regressions[thread_idx] += local_sum_squared_regression;
2005  local_sum_squares[thread_idx] += local_sum_square;
2006  });
2007  });
      // Combine the per-thread partial sums.
2008  double sum_squared_regression{0.0};
2009  double sum_squares{0.0};
2010  for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
2011  sum_squared_regression += local_sum_squared_regressions[thread_idx];
2012  sum_squares += local_sum_squares[thread_idx];
2013  }
      // A zero total sum of squares would divide by zero; report 1.0 instead.
2014  output_r2[0] = sum_squares == 0.0 ? 1.0 : 1.0 - (sum_squared_regression / sum_squares);
2015  return 1;
2016 }
2017 
2018 // clang-format off
2019 /*
2020  UDTF: r2_score__cpu_template(TableFunctionManager,
2021  TextEncodingNone model_name,
2022  Cursor<Column<T> labels, ColumnList<T> features> data) ->
2023  Column<double> r2, T=[double]
2024  */
2025 // clang-format on
2026 
// r2_score entry point for numeric-only features, with the model named by a
// TextEncodingNone argument.
//
// Fetches the model from g_ml_models, runs check_model_params() with zero
// categorical features and input_features.numCols() numeric features, then
// forwards to r2_score_impl().
//
// Returns the result of r2_score_impl(). A std::runtime_error raised inside
// the try block is converted into mgr.ERROR_MESSAGE(e.what()).
2027 template <typename T>
2029  const TextEncodingNone& model_name,
2030  const Column<T>& input_labels,
2031  const ColumnList<T>& input_features,
2032  Column<double>& output_r2) {
2033  try {
2034  const auto model = g_ml_models.getModel(model_name);
2035  check_model_params(model, 0, input_features.numCols());
2036  return r2_score_impl(mgr, model, input_labels, input_features, output_r2);
2037  } catch (std::runtime_error& e) {
2038  const std::string error_str(e.what());
2039  return mgr.ERROR_MESSAGE(error_str);
2040  }
2041 }
2042 
2043 // clang-format off
2044 /*
2045  UDTF: r2_score__cpu_template(TableFunctionManager,
2046  Cursor<Column<TextEncodingDict> name> model_name,
2047  Cursor<Column<T> labels, ColumnList<T> features> data) ->
2048  Column<double> r2, T=[double]
2049  */
2050 // clang-format on
2051 
// r2_score entry point for numeric-only features, with the model name
// supplied as a one-row dictionary-encoded column (a CURSOR argument).
//
// Rejects any model_name column whose size is not exactly 1, converts row 0
// to a TextEncodingNone, and delegates to the r2_score__cpu_template overload
// that takes the model name as TextEncodingNone.
//
// Returns the delegate's result, or the result of mgr.ERROR_MESSAGE() when
// the row-count check fails.
2052 template <typename T>
2053 NEVER_INLINE HOST int32_t
2055  const Column<TextEncodingDict>& model_name,
2056  const Column<T>& input_labels,
2057  const ColumnList<T>& input_features,
2058  Column<double>& output_r2) {
2059  if (model_name.size() != 1) {
2060  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
2061  }
2062  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
2063  return r2_score__cpu_template(
2064  mgr, model_name_text_enc_none, input_labels, input_features, output_r2);
2065 }
2066 
2067 // clang-format off
2068 /*
2069  UDTF: r2_score__cpu_template(TableFunctionManager,
2070  TextEncodingNone model_name,
2071  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features, ColumnList<T> numeric_features> data) -> Column<double> r2, T=[double]
2072  */
2073 // clang-format on
2074 
// r2_score entry point for categorical plus numeric features, with the model
// named by a TextEncodingNone argument.
//
// Fetches the model from g_ml_models and builds a
// CategoricalFeaturesBuilder<T> from the categorical columns, the numeric
// columns and the category keys stored with the model
// (model->getCatFeatureKeys()). The builder's getFeatures() result is what
// gets passed to r2_score_impl() as the feature list.
//
// Returns the result of r2_score_impl(). A std::runtime_error raised inside
// the try block is converted into mgr.ERROR_MESSAGE(e.what()).
2075 template <typename T>
2076 NEVER_INLINE HOST int32_t
2078  const TextEncodingNone& model_name,
2079  const Column<T>& input_labels,
2080  const ColumnList<TextEncodingDict>& input_cat_features,
2081  const ColumnList<T>& input_numeric_features,
2082  Column<double>& output_r2) {
2083  try {
2084  const auto model = g_ml_models.getModel(model_name);
2086  model, input_cat_features.numCols(), input_numeric_features.numCols());
2087  CategoricalFeaturesBuilder<T> cat_features_builder(
2088  input_cat_features, input_numeric_features, model->getCatFeatureKeys());
2089  return r2_score_impl(
2090  mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2091  } catch (std::runtime_error& e) {
2092  const std::string error_str(e.what());
2093  return mgr.ERROR_MESSAGE(error_str);
2094  }
2095 }
2096 
2097 // clang-format off
2098 /*
2099  UDTF: r2_score__cpu_template(TableFunctionManager,
2100  TextEncodingNone model_name,
2101  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features> data) -> Column<double> r2, T=[double]
2102  */
2103 // clang-format on
2104 
// r2_score entry point for categorical-only features, with the model named
// by a TextEncodingNone argument.
//
// Fetches the model from g_ml_models, runs check_model_params() with
// input_cat_features.numCols() categorical features and zero numeric
// features, and builds a CategoricalFeaturesBuilder<T> from the categorical
// columns and the category keys stored with the model. The builder's
// getFeatures() result is passed to r2_score_impl() as the feature list.
//
// Returns the result of r2_score_impl(). A std::runtime_error raised inside
// the try block is converted into mgr.ERROR_MESSAGE(e.what()).
2105 template <typename T>
2106 NEVER_INLINE HOST int32_t
2108  const TextEncodingNone& model_name,
2109  const Column<T>& input_labels,
2110  const ColumnList<TextEncodingDict>& input_cat_features,
2111  Column<double>& output_r2) {
2112  try {
2113  const auto model = g_ml_models.getModel(model_name);
2114  check_model_params(model, input_cat_features.numCols(), 0);
2115  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
2116  model->getCatFeatureKeys());
2117  return r2_score_impl(
2118  mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2119  } catch (std::runtime_error& e) {
2120  const std::string error_str(e.what());
2121  return mgr.ERROR_MESSAGE(error_str);
2122  }
2123 }
2124 
2125 // clang-format off
2126 /*
2127  UDTF: r2_score__cpu_template(TableFunctionManager,
2128  Cursor<Column<TextEncodingDict> name> model_name,
2129  Cursor<Column<T> labels, ColumnList<TextEncodingDict> cat_features, ColumnList<T> numeric_features> data) -> Column<double> r2, T=[double]
2130  */
2131 // clang-format on
2132 
// r2_score entry point for categorical plus numeric features, with the model
// name supplied as a one-row dictionary-encoded column (a CURSOR argument).
//
// Unlike the numeric-only CURSOR overload, this one does not delegate to the
// TextEncodingNone overload: after rejecting any model_name column whose size
// is not exactly 1, it reads row 0 into a std::string, fetches the model from
// g_ml_models itself, builds a CategoricalFeaturesBuilder<T> from the
// categorical columns, the numeric columns and model->getCatFeatureKeys(),
// and calls r2_score_impl() with the builder's getFeatures() result.
//
// Returns the result of r2_score_impl(), or the result of
// mgr.ERROR_MESSAGE() when the row-count check fails or a std::runtime_error
// is raised inside the try block.
2133 template <typename T>
2134 NEVER_INLINE HOST int32_t
2136  const Column<TextEncodingDict>& model_name,
2137  const Column<T>& input_labels,
2138  const ColumnList<TextEncodingDict>& input_cat_features,
2139  const ColumnList<T>& input_numeric_features,
2140  Column<double>& output_r2) {
2141  if (model_name.size() != 1) {
2142  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
2143  }
2144  const std::string model_name_str{model_name.getString(0)};
2145  try {
2146  const auto model = g_ml_models.getModel(model_name_str);
2148  model, input_cat_features.numCols(), input_numeric_features.numCols());
2149  CategoricalFeaturesBuilder<T> cat_features_builder(
2150  input_cat_features, input_numeric_features, model->getCatFeatureKeys());
2151  return r2_score_impl(
2152  mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2153  } catch (std::runtime_error& e) {
2154  const std::string error_str(e.what());
2155  return mgr.ERROR_MESSAGE(error_str);
2156  }
2157 }
2158 
2159 // clang-format off
2160 /*
2161  UDTF: random_forest_reg_var_importance__cpu_1(TableFunctionManager,
2162  TextEncodingNone model_name) ->
2163  Column<int64_t> feature_id, Column<TextEncodingDict> feature | input_id=args<>,
2164  Column<int64_t> sub_feature_id, Column<TextEncodingDict> sub_feature | input_id=args<>, Column<double> importance_score
2165  */
2166 // clang-format on
2167 
2170  const TextEncodingNone& model_name,
2171  Column<int64_t>& feature_id,
2172  Column<TextEncodingDict>& feature,
2173  Column<int64_t>& sub_feature_id,
2174  Column<TextEncodingDict>& sub_feature,
2175  Column<double>& importance_score);
2176 
2177 // clang-format off
2178 /*
2179  UDTF: random_forest_reg_var_importance__cpu_2(TableFunctionManager,
2180  Cursor<Column<TextEncodingDict> name> model_name) ->
2181  Column<int64_t> feature_id, Column<TextEncodingDict> feature | input_id=args<>,
2182  Column<int64_t> sub_feature_id, Column<TextEncodingDict> sub_feature | input_id=args<>, Column<double> importance_score
2183  */
2184 // clang-format on
2185 
2188  const Column<TextEncodingDict>& model_name,
2189  Column<int64_t>& feature_id,
2190  Column<TextEncodingDict>& feature,
2191  Column<int64_t>& sub_feature_id,
2192  Column<TextEncodingDict>& sub_feature,
2193  Column<double>& importance_score);
2194 
2195 // clang-format off
2196 /*
2197  UDTF: get_decision_trees__cpu_1(TableFunctionManager,
2198  TextEncodingNone model_name) ->
2199  Column<int64_t> tree_id,
2200  Column<int64_t> entry_id,
2201  Column<bool> is_split_node,
2202  Column<int64_t> feature_id,
2203  Column<int64_t> left_child,
2204  Column<int64_t> right_child,
2205  Column<double> value
2206  */
2207 // clang-format on
2208 
2211  const TextEncodingNone& model_name,
2212  Column<int64_t>& tree_id,
2213  Column<int64_t>& entry_id,
2214  Column<bool>& is_split_node,
2215  Column<int64_t>& feature_id,
2216  Column<int64_t>& left_child,
2217  Column<int64_t>& right_child,
2218  Column<double>& value);
2219 
2220 // clang-format off
2221 /*
2222  UDTF: get_decision_trees__cpu_2(TableFunctionManager,
2223  Cursor<Column<TextEncodingDict> name> model_name) ->
2224  Column<int64_t> tree_id,
2225  Column<int64_t> entry_id,
2226  Column<bool> is_split_node,
2227  Column<int64_t> feature_id,
2228  Column<int64_t> left_child,
2229  Column<int64_t> right_child,
2230  Column<double> value
2231  */
2232 // clang-format on
2233 
2236  const Column<TextEncodingDict>& model_name,
2237  Column<int64_t>& tree_id,
2238  Column<int64_t>& entry_id,
2239  Column<bool>& is_split_node,
2240  Column<int64_t>& feature_id,
2241  Column<int64_t>& left_child,
2242  Column<int64_t>& right_child,
2243  Column<double>& value);
2244 
2245 #endif // #ifndef __CUDACC__
DEVICE const std::string getString(int64_t index) const
NEVER_INLINE HOST int32_t pca_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:371
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const ColumnList< T > &numeric_features, const int32_t cat_top_k, const float cat_min_fraction, const bool cat_include_others)
KMeansInitStrategy get_kmeans_init_type(const std::string &init_type_str)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const int32_t cat_top_k, const float cat_min_fraction, const bool cat_include_others)
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
NEVER_INLINE HOST int32_t r2_score_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
std::string getString() const
Definition: heavydbTypes.h:639
NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
DEVICE int64_t size() const
NEVER_INLINE HOST int32_t decision_tree_reg_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
DEVICE int64_t numCols() const
EXTENSION_NOINLINE_HOST int32_t pca_fit__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< TextEncodingDict > &input_cat_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
EXTENSION_NOINLINE_HOST int32_t supported_ml_frameworks__cpu_(TableFunctionManager &mgr, Column< TextEncodingDict > &output_ml_frameworks, Column< bool > &output_availability, Column< bool > &output_default)
#define CHECK_GE(x, y)
Definition: Logger.h:306
MaskedData< T > denull_data(const ColumnList< T > &features)
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t kmeans__cpu_template(TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const int num_clusters, const int num_iterations, const TextEncodingNone &init_type_str, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
NEVER_INLINE HOST int32_t pca_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
#define CHECK_GT(x, y)
Definition: Logger.h:305
NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
MLFramework get_ml_framework(const std::string &ml_framework_str)
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
#define HOST
const size_t max_inputs_per_thread
NEVER_INLINE HOST int32_t random_forest_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const ColumnList< T > &numeric_features, const std::vector< std::vector< std::string >> &cat_feature_keys)
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_2(TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
void addModel(const std::string &model_name, std::shared_ptr< AbstractMLModel > model)
Definition: MLModel.h:37
VarImportanceMetric get_var_importance_metric(const std::string &var_importance_metric_str)
NEVER_INLINE HOST int32_t gbt_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
#define EXTENSION_NOINLINE_HOST
Definition: heavydbTypes.h:55
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:50
void disable_output_allocations()
Definition: heavydbTypes.h:377
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
DEVICE int64_t numCols() const
NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
int8_t ** ptrs_
MLModelMap g_ml_models
Definition: MLModel.h:124
std::vector< int8_t * > col_ptrs_
#define CHECK_LE(x, y)
Definition: Logger.h:304
std::vector< TableFunctions_Namespace::OneHotEncoder_Namespace::OneHotEncodedCol< T > > one_hot_encoded_cols_
void unmask_data(const T *masked_input, const std::vector< int32_t > &reverse_index_map, T *unmasked_output, const int64_t num_unmasked_rows, const T null_val)
NEVER_INLINE HOST int32_t linear_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
NEVER_INLINE HOST int32_t dbscan__cpu_template(TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const double epsilon, const int32_t min_observations, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)
CategoricalFeaturesBuilder(const ColumnList< TextEncodingDict > &cat_features, const std::vector< std::vector< std::string >> &cat_feature_keys)
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
#define NEVER_INLINE
std::vector< std::vector< T > > z_std_normalize_data(const std::vector< T * > &input_data, const int64_t num_rows)
#define CHECK(condition)
Definition: Logger.h:291
NEVER_INLINE HOST int32_t ml_reg_predict_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
std::vector< std::vector< std::string > > cat_feature_keys_
DEVICE int64_t size() const
DEVICE int64_t size() const
NEVER_INLINE HOST int32_t r2_score__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
ZStdNormalizationSummaryStats< T > z_std_normalize_data_with_summary_stats(const std::vector< T * > &input_data, const int64_t num_rows)
void enable_output_allocations()
Definition: heavydbTypes.h:379
Column< T > create_wrapper_col(std::vector< T > &col_vec)
NEVER_INLINE HOST double get_column_mean(const T *data, const int64_t num_rows)
const std::vector< std::vector< std::string > > & getCatFeatureKeys() const