OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MLTableFunctions.hpp File Reference
+ Include dependency graph for MLTableFunctions.hpp:
+ This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

struct  CategoricalFeaturesBuilder< T >
 

Functions

template<typename T >
std::vector< const T * > pluck_ptrs (const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
 
template<typename T >
std::vector< const T * > pluck_ptrs (const std::vector< T * > &data, const int64_t start_idx, const int64_t end_idx)
 
EXTENSION_NOINLINE_HOST int32_t supported_ml_frameworks__cpu_ (TableFunctionManager &mgr, Column< TextEncodingDict > &output_ml_frameworks, Column< bool > &output_availability, Column< bool > &output_default)
 
EXTENSION_NOINLINE_HOST void check_model_params (const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
 
template<typename K , typename T >
NEVER_INLINE HOST int32_t kmeans__cpu_template (TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const int num_clusters, const int num_iterations, const TextEncodingNone &init_type_str, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)
 
template<typename K , typename T >
NEVER_INLINE HOST int32_t dbscan__cpu_template (TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const double epsilon, const int32_t min_observations, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)
 
template<typename T >
NEVER_INLINE HOST int32_t linear_reg_fit_impl (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
Column< T > create_wrapper_col (std::vector< T > &col_vec)
 
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_1 (TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
 
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_2 (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
 
template<typename T >
NEVER_INLINE HOST int32_t decision_tree_reg_impl (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t gbt_reg_fit_impl (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t random_forest_reg_fit_impl (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t pca_fit_impl (TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t pca_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T >
NEVER_INLINE HOST int32_t pca_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
EXTENSION_NOINLINE_HOST int32_t pca_fit__cpu_1 (TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< TextEncodingDict > &input_cat_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)
 
template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict_impl (TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
 
template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
 
template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
 
template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< TextEncodingDict > &input_cat_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
 
template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
 
template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, const Column< K > &input_ids, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
 
template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, const Column< K > &input_ids, const ColumnList< TextEncodingDict > &input_cat_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
 
template<typename T >
NEVER_INLINE HOST int32_t r2_score_impl (TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
 
template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
 
template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
 
template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, Column< double > &output_r2)
 
template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, Column< double > &output_r2)
 
template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, Column< double > &output_r2)
 
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_1 (TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
 
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_2 (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
 
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_1 (TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
 
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_2 (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)
 

Function Documentation

EXTENSION_NOINLINE_HOST void check_model_params ( const std::shared_ptr< AbstractMLModel > &  model,
const int64_t  num_cat_features,
const int64_t  num_numeric_features 
)

Definition at line 362 of file MLTableFunctions.cpp.

Referenced by ml_reg_predict__cpu_template() and r2_score__cpu_template().

364  {
365  if (model->getNumLogicalFeatures() != num_cat_features + num_numeric_features) {
366  std::ostringstream error_oss;
367  error_oss << "Model expects " << model->getNumLogicalFeatures() << " features but "
368  << num_cat_features + num_numeric_features << " were provided.";
369  throw std::runtime_error(error_oss.str());
370  }
371  if (model->getNumCatFeatures() != num_cat_features) {
372  std::ostringstream error_oss;
373  error_oss << "Model expects " << model->getNumCatFeatures()
374  << " categorical features but " << num_cat_features << " were provided.";
375  throw std::runtime_error(error_oss.str());
376  }
377 }

+ Here is the caller graph for this function:

template<typename T >
Column<T> create_wrapper_col ( std::vector< T > &  col_vec)

Definition at line 570 of file MLTableFunctions.hpp.

570  {
571  Column<T> wrapper_col(col_vec.data(), static_cast<int64_t>(col_vec.size()));
572  return wrapper_col;
573 }
template<typename K , typename T >
NEVER_INLINE HOST int32_t dbscan__cpu_template ( TableFunctionManager &  mgr,
const Column< K > &  input_ids,
const ColumnList< T > &  input_features,
const double  epsilon,
const int32_t  min_observations,
const TextEncodingNone &  preferred_ml_framework_str,
Column< K > &  output_ids,
Column< int32_t > &  output_clusters 
)

Definition at line 195 of file MLTableFunctions.hpp.

References DEFAULT, TableFunctions_Namespace::denull_data(), get_ml_framework(), TextEncodingNone::getString(), INVALID, MLPACK, ONEDAL, pluck_ptrs(), Column< T >::ptr_, TableFunctionManager::set_output_row_size(), Column< T >::size(), TableFunctions_Namespace::unmask_data(), and z_std_normalize_data().

202  {
203  mgr.set_output_row_size(input_ids.size());
204  output_ids = input_ids;
205 
206  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
207  if (preferred_ml_framework == MLFramework::INVALID) {
208  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
209  preferred_ml_framework_str.getString());
210  }
211 
212  try {
213  const auto denulled_data = denull_data(input_features);
214  const int64_t num_rows = denulled_data.masked_num_rows;
215  const bool data_is_masked =
216  denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
217  std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
218  int32_t* denulled_output =
219  data_is_masked ? denulled_output_allocation.data() : output_clusters.ptr_;
220 
221  // z_std_normalize_data can throw if std dev is 0
222  const auto normalized_data = z_std_normalize_data(denulled_data.data, num_rows);
223  const auto normalized_ptrs = pluck_ptrs(normalized_data, 0L, normalized_data.size());
224 
225  bool did_execute = false;
226 #ifdef HAVE_ONEDAL
227  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
228  preferred_ml_framework == MLFramework::DEFAULT)) {
229  onedal_dbscan_impl(
230  normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
231  did_execute = true;
232  }
233 #endif
234 #ifdef HAVE_MLPACK
235  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
236  preferred_ml_framework == MLFramework::DEFAULT)) {
237  mlpack_dbscan_impl(
238  normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
239  did_execute = true;
240  }
241 #endif
242  if (!did_execute) {
243  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
244  " ML library to support dbscan implementation.");
245  }
246 
247  if (data_is_masked) {
248  unmask_data(denulled_output,
249  denulled_data.reverse_index_map,
250  output_clusters.ptr_,
251  denulled_data.unmasked_num_rows,
252  inline_null_value<int32_t>());
253  }
254  } catch (std::runtime_error& e) {
255  return mgr.ERROR_MESSAGE(e.what());
256  }
257  return input_ids.size();
258 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::string getString() const
Definition: heavydbTypes.h:641
DEVICE int64_t size() const
MaskedData< T > denull_data(const ColumnList< T > &features)
MLFramework get_ml_framework(const std::string &ml_framework_str)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
void unmask_data(const T *masked_input, const std::vector< int32_t > &reverse_index_map, T *unmasked_output, const int64_t num_unmasked_rows, const T null_val)
std::vector< std::vector< T > > z_std_normalize_data(const std::vector< T * > &input_data, const int64_t num_rows)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
const int64_t  max_tree_depth,
const int64_t  min_observations_per_leaf_node,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 693 of file MLTableFunctions.hpp.

References decision_tree_reg_impl().

701  {
702  std::vector<std::vector<std::string>> empty_cat_feature_keys;
703  return decision_tree_reg_impl(mgr,
704  model_name,
705  input_labels,
706  input_features,
707  empty_cat_feature_keys,
708  max_tree_depth,
709  min_observations_per_leaf_node,
710  preferred_ml_framework_str,
711  model_metadata,
712  output_model_name);
713 }
NEVER_INLINE HOST int32_t decision_tree_reg_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const ColumnList< T > &  input_numeric_features,
const int64_t  max_tree_depth,
const int64_t  min_observations_per_leaf_node,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 731 of file MLTableFunctions.hpp.

References decision_tree_reg_impl(), CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), and CategoricalFeaturesBuilder< T >::getFeatures().

743  {
744  std::vector<std::vector<std::string>> empty_cat_feature_keys;
745  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
746  input_numeric_features,
747  cat_top_k,
748  cat_min_fraction,
749  false /* cat_include_others */);
750  return decision_tree_reg_impl(mgr,
751  model_name,
752  input_labels,
753  cat_features_builder.getFeatures(),
754  cat_features_builder.getCatFeatureKeys(),
755  max_tree_depth,
756  min_observations_per_leaf_node,
757  preferred_ml_framework_str,
758  model_metadata,
759  output_model_name);
760 }
NEVER_INLINE HOST int32_t decision_tree_reg_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const int64_t  max_tree_depth,
const int64_t  min_observations_per_leaf_node,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 778 of file MLTableFunctions.hpp.

References decision_tree_reg_impl(), CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), and CategoricalFeaturesBuilder< T >::getFeatures().

789  {
790  std::vector<std::vector<std::string>> empty_cat_feature_keys;
791  CategoricalFeaturesBuilder<T> cat_features_builder(
792  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
793  return decision_tree_reg_impl(mgr,
794  model_name,
795  input_labels,
796  cat_features_builder.getFeatures(),
797  cat_features_builder.getCatFeatureKeys(),
798  max_tree_depth,
799  min_observations_per_leaf_node,
800  preferred_ml_framework_str,
801  model_metadata,
802  output_model_name);
803 }
NEVER_INLINE HOST int32_t decision_tree_reg_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t decision_tree_reg_impl ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
const std::vector< std::vector< std::string >> &  cat_feature_keys,
const int64_t  max_tree_depth,
const int64_t  min_observations_per_leaf_node,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 615 of file MLTableFunctions.hpp.

References DEFAULT, TableFunctions_Namespace::denull_data(), get_ml_framework(), Column< TextEncodingDict >::getOrAddTransient(), TextEncodingNone::getString(), INVALID, MLPACK, ColumnList< T >::numCols(), ONEDAL, pluck_ptrs(), TableFunctionManager::set_output_row_size(), and Column< T >::size().

Referenced by decision_tree_reg_fit__cpu_template().

624  {
625  if (input_labels.size() == 0) {
626  return mgr.ERROR_MESSAGE(
627  "No rows exist in training data. Training data must at least contain 1 row.");
628  }
629  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
630  if (preferred_ml_framework == MLFramework::INVALID) {
631  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
632  preferred_ml_framework_str.getString());
633  }
634  if (preferred_ml_framework == MLFramework::MLPACK) {
635  return mgr.ERROR_MESSAGE(
636  "Only OneDAL framework supported for decision tree regression.");
637  }
638 #ifndef HAVE_ONEDAL
639  return mgr.ERROR_MESSAGE(
640  "Only OneDAL framework supported for decision tree regression.");
641 #endif
642 
643  const auto denulled_data = denull_data(input_labels, input_features);
644  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
645  const auto features_ptrs =
646  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
647  mgr.set_output_row_size(1);
648  try {
649  bool did_execute = false;
650 #ifdef HAVE_ONEDAL
651  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
652  preferred_ml_framework == MLFramework::DEFAULT)) {
653  onedal_decision_tree_reg_fit_impl<T>(model_name,
654  labels_ptrs[0],
655  features_ptrs,
656  model_metadata,
657  cat_feature_keys,
658  denulled_data.masked_num_rows,
659  max_tree_depth,
660  min_observations_per_leaf_node);
661  const TextEncodingDict model_name_str_id =
662  output_model_name.getOrAddTransient(model_name);
663  output_model_name[0] = model_name_str_id;
664  did_execute = true;
665  }
666 #endif
667  if (!did_execute) {
668  return mgr.ERROR_MESSAGE(
669  "Cannot find " + preferred_ml_framework_str.getString() +
670  " ML library to support decision tree regression implementation.");
671  }
672  } catch (std::runtime_error& e) {
673  return mgr.ERROR_MESSAGE(e.what());
674  }
675  return 1;
676 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::string getString() const
Definition: heavydbTypes.h:641
DEVICE int64_t size() const
DEVICE int64_t numCols() const
MaskedData< T > denull_data(const ColumnList< T > &features)
MLFramework get_ml_framework(const std::string &ml_framework_str)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
const int64_t  max_iterations,
const int64_t  max_tree_depth,
const double  shrinkage,
const double  min_split_loss,
const double  lambda,
const double  obs_per_tree_fraction,
const int64_t  features_per_node,
const int64_t  min_observations_per_leaf_node,
const int64_t  max_bins,
const int64_t  min_bin_size,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 906 of file MLTableFunctions.hpp.

References gbt_reg_fit_impl().

922  {
923  std::vector<std::vector<std::string>> empty_cat_feature_keys;
924  return gbt_reg_fit_impl(mgr,
925  model_name,
926  input_labels,
927  input_features,
928  empty_cat_feature_keys,
929  max_iterations,
930  max_tree_depth,
931  shrinkage,
932  min_split_loss,
933  lambda,
934  obs_per_tree_fraction,
935  features_per_node,
936  min_observations_per_leaf_node,
937  max_bins,
938  min_bin_size,
939  preferred_ml_framework_str,
940  model_metadata,
941  output_model_name);
942 }
NEVER_INLINE HOST int32_t gbt_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const ColumnList< T > &  input_numeric_features,
const int64_t  max_iterations,
const int64_t  max_tree_depth,
const double  shrinkage,
const double  min_split_loss,
const double  lambda,
const double  obs_per_tree_fraction,
const int64_t  features_per_node,
const int64_t  min_observations_per_leaf_node,
const int64_t  max_bins,
const int64_t  min_bin_size,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 969 of file MLTableFunctions.hpp.

References gbt_reg_fit_impl(), CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), and CategoricalFeaturesBuilder< T >::getFeatures().

988  {
989  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
990  input_numeric_features,
991  cat_top_k,
992  cat_min_fraction,
993  false /* cat_include_others */);
994  return gbt_reg_fit_impl(mgr,
995  model_name,
996  input_labels,
997  cat_features_builder.getFeatures(),
998  cat_features_builder.getCatFeatureKeys(),
999  max_iterations,
1000  max_tree_depth,
1001  shrinkage,
1002  min_split_loss,
1003  lambda,
1004  obs_per_tree_fraction,
1005  features_per_node,
1006  min_observations_per_leaf_node,
1007  max_bins,
1008  min_bin_size,
1009  preferred_ml_framework_str,
1010  model_metadata,
1011  output_model_name);
1012 }
NEVER_INLINE HOST int32_t gbt_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const int64_t  max_iterations,
const int64_t  max_tree_depth,
const double  shrinkage,
const double  min_split_loss,
const double  lambda,
const double  obs_per_tree_fraction,
const int64_t  features_per_node,
const int64_t  min_observations_per_leaf_node,
const int64_t  max_bins,
const int64_t  min_bin_size,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 1039 of file MLTableFunctions.hpp.

References gbt_reg_fit_impl(), CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), and CategoricalFeaturesBuilder< T >::getFeatures().

1057  {
1058  CategoricalFeaturesBuilder<T> cat_features_builder(
1059  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
1060  return gbt_reg_fit_impl(mgr,
1061  model_name,
1062  input_labels,
1063  cat_features_builder.getFeatures(),
1064  cat_features_builder.getCatFeatureKeys(),
1065  max_iterations,
1066  max_tree_depth,
1067  shrinkage,
1068  min_split_loss,
1069  lambda,
1070  obs_per_tree_fraction,
1071  features_per_node,
1072  min_observations_per_leaf_node,
1073  max_bins,
1074  min_bin_size,
1075  preferred_ml_framework_str,
1076  model_metadata,
1077  output_model_name);
1078 }
NEVER_INLINE HOST int32_t gbt_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t gbt_reg_fit_impl ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
const std::vector< std::vector< std::string >> &  cat_feature_keys,
const int64_t  max_iterations,
const int64_t  max_tree_depth,
const double  shrinkage,
const double  min_split_loss,
const double  lambda,
const double  obs_per_tree_fraction,
const int64_t  features_per_node,
const int64_t  min_observations_per_leaf_node,
const int64_t  max_bins,
const int64_t  min_bin_size,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 807 of file MLTableFunctions.hpp.

References DEFAULT, TableFunctions_Namespace::denull_data(), get_ml_framework(), Column< TextEncodingDict >::getOrAddTransient(), TextEncodingNone::getString(), INVALID, MLPACK, ColumnList< T >::numCols(), ONEDAL, pluck_ptrs(), TableFunctionManager::set_output_row_size(), and Column< T >::size().

Referenced by gbt_reg_fit__cpu_template().

824  {
825  if (input_labels.size() == 0) {
826  return mgr.ERROR_MESSAGE(
827  "No rows exist in training data. Training data must at least contain 1 row.");
828  }
829  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
830  if (preferred_ml_framework == MLFramework::INVALID) {
831  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
832  preferred_ml_framework_str.getString());
833  }
834  if (preferred_ml_framework == MLFramework::MLPACK) {
835  return mgr.ERROR_MESSAGE("Only OneDAL framework supported for GBT regression.");
836  }
837 #ifndef HAVE_ONEDAL
838  return mgr.ERROR_MESSAGE("Only OneDAL framework supported for GBT regression.");
839 #endif
840 
841  const auto denulled_data = denull_data(input_labels, input_features);
842  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
843  const auto features_ptrs =
844  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
845  mgr.set_output_row_size(1);
846  try {
847  bool did_execute = false;
848 #ifdef HAVE_ONEDAL
849  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
850  preferred_ml_framework == MLFramework::DEFAULT)) {
851  onedal_gbt_reg_fit_impl<T>(model_name,
852  labels_ptrs[0],
853  features_ptrs,
854  model_metadata,
855  cat_feature_keys,
856  denulled_data.masked_num_rows,
857  max_iterations,
858  max_tree_depth,
859  shrinkage,
860  min_split_loss,
861  lambda,
862  obs_per_tree_fraction,
863  features_per_node,
864  min_observations_per_leaf_node,
865  max_bins,
866  min_bin_size);
867  const TextEncodingDict model_name_str_id =
868  output_model_name.getOrAddTransient(model_name);
869  output_model_name[0] = model_name_str_id;
870  did_execute = true;
871  }
872 #endif
873  if (!did_execute) {
874  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
875  " ML library to support GBT regression implementation.");
876  }
877  } catch (std::runtime_error& e) {
878  return mgr.ERROR_MESSAGE(e.what());
879  }
880  return 1;
881 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::string getString() const
Definition: heavydbTypes.h:641
DEVICE int64_t size() const
DEVICE int64_t numCols() const
MaskedData< T > denull_data(const ColumnList< T > &features)
MLFramework get_ml_framework(const std::string &ml_framework_str)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_1 ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
Column< int64_t > &  tree_id,
Column< int64_t > &  entry_id,
Column< bool > &  is_split_node,
Column< int64_t > &  feature_id,
Column< int64_t > &  left_child,
Column< int64_t > &  right_child,
Column< double > &  value 
)

Definition at line 275 of file MLTableFunctions.cpp.

References g_ml_models, MLModelMap::getModel(), and TableFunctionManager::set_output_row_size().

Referenced by get_decision_trees__cpu_2().

283  {
284 #ifdef HAVE_ONEDAL
285  try {
286  const auto model = g_ml_models.getModel(model_name);
287  const auto tree_model = std::dynamic_pointer_cast<AbstractTreeModel>(model);
288  if (!tree_model) {
289  throw std::runtime_error("Model not a tree-type model.");
290  }
291  const auto num_trees = tree_model->getNumTrees();
292  std::vector<std::vector<DecisionTreeEntry>> decision_trees(num_trees);
293  for (int64_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
294  TreeModelVisitor tree_visitor(decision_trees[tree_idx]);
295  tree_model->traverseDF(tree_idx, tree_visitor);
296  }
297  std::vector<int64_t> decision_tree_offsets(num_trees + 1);
298  decision_tree_offsets[0] = 0;
299  for (int64_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
300  decision_tree_offsets[tree_idx + 1] =
301  decision_tree_offsets[tree_idx] +
302  static_cast<int64_t>(decision_trees[tree_idx].size());
303  }
304  const auto num_entries = decision_tree_offsets[num_trees];
305  mgr.set_output_row_size(num_entries);
306  for (int64_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
307  const auto& decision_tree = decision_trees[tree_idx];
308  const auto output_offset = decision_tree_offsets[tree_idx];
309  const int64_t num_tree_entries = decision_tree.size();
310  for (int64_t entry_idx = 0; entry_idx < num_tree_entries; ++entry_idx) {
311  const int64_t output_idx = output_offset + entry_idx;
312  const auto& tree_entry = decision_tree[entry_idx];
313  const bool entry_is_split_node = tree_entry.isSplitNode();
314  tree_id[output_idx] = tree_idx;
315  entry_id[output_idx] = entry_idx;
316  is_split_node[output_idx] = entry_is_split_node;
317  feature_id[output_idx] = !entry_is_split_node ? inline_null_value<int64_t>()
318  : tree_entry.feature_index;
319  left_child[output_idx] = !entry_is_split_node ? inline_null_value<int64_t>()
320  : tree_entry.left_child_row_idx;
321  right_child[output_idx] = !entry_is_split_node ? inline_null_value<int64_t>()
322  : tree_entry.right_child_row_idx;
323  value[output_idx] = tree_entry.value;
324  }
325  }
326  return num_entries;
327  } catch (std::runtime_error& e) {
328  const std::string error_str(e.what());
329  return mgr.ERROR_MESSAGE(error_str);
330  }
331 #else // Not HAVE_ONEDAL
332  return mgr.ERROR_MESSAGE("OneDAL library must be available for get_decision_trees.");
333 #endif
334 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:50
MLModelMap g_ml_models
Definition: MLModel.h:124

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_2 ( TableFunctionManager &  mgr,
const Column< TextEncodingDict > &  model_name,
Column< int64_t > &  tree_id,
Column< int64_t > &  entry_id,
Column< bool > &  is_split_node,
Column< int64_t > &  feature_id,
Column< int64_t > &  left_child,
Column< int64_t > &  right_child,
Column< double > &  value 
)

Definition at line 337 of file MLTableFunctions.cpp.

References get_decision_trees__cpu_1(), Column< TextEncodingDict >::getString(), and Column< TextEncodingDict >::size().

345  {
346  if (model_name.size() != 1) {
347  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
348  }
349  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
350  return get_decision_trees__cpu_1(mgr,
351  model_name_text_enc_none,
352  tree_id,
353  entry_id,
354  is_split_node,
355  feature_id,
356  left_child,
357  right_child,
358  value);
359 }
DEVICE const std::string getString(int64_t index) const
DEVICE int64_t size() const
EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)

+ Here is the call graph for this function:

template<typename K , typename T >
NEVER_INLINE HOST int32_t kmeans__cpu_template ( TableFunctionManager &  mgr,
const Column< K > &  input_ids,
const ColumnList< T > &  input_features,
const int  num_clusters,
const int  num_iterations,
const TextEncodingNone &  init_type_str,
const TextEncodingNone &  preferred_ml_framework_str,
Column< K > &  output_ids,
Column< int32_t > &  output_clusters 
)

Definition at line 102 of file MLTableFunctions.hpp.

References DEFAULT, TableFunctions_Namespace::denull_data(), get_kmeans_init_type(), get_ml_framework(), TextEncodingNone::getString(), INVALID, MLPACK, ONEDAL, pluck_ptrs(), Column< T >::ptr_, TableFunctionManager::set_output_row_size(), Column< T >::size(), TableFunctions_Namespace::unmask_data(), and z_std_normalize_data().

110  {
111  mgr.set_output_row_size(input_ids.size());
112  output_ids = input_ids;
113  const auto kmeans_init_strategy = get_kmeans_init_type(init_type_str);
114  if (kmeans_init_strategy == KMeansInitStrategy::INVALID) {
115  return mgr.ERROR_MESSAGE("Invalid KMeans initializaiton strategy: " +
116  init_type_str.getString());
117  }
118 
119  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
120  if (preferred_ml_framework == MLFramework::INVALID) {
121  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
122  preferred_ml_framework_str.getString());
123  }
124 
125  try {
126  const auto denulled_data = denull_data(input_features);
127  const int64_t num_rows = denulled_data.masked_num_rows;
128  const bool data_is_masked =
129  denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
130  std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
131  int32_t* denulled_output =
132  data_is_masked ? denulled_output_allocation.data() : output_clusters.ptr_;
133 
134  // z_std_normalize_data can throw if std dev is 0
135  const auto normalized_data = z_std_normalize_data(denulled_data.data, num_rows);
136  const auto normalized_ptrs = pluck_ptrs(normalized_data, 0L, normalized_data.size());
137 
138  bool did_execute = false;
139 #ifdef HAVE_ONEDAL
140  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
141  preferred_ml_framework == MLFramework::DEFAULT)) {
142  onedal_kmeans_impl(normalized_ptrs,
143  denulled_output,
144  num_rows,
145  num_clusters,
146  num_iterations,
147  kmeans_init_strategy);
148  did_execute = true;
149  }
150 #endif
151 #ifdef HAVE_MLPACK
152  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
153  preferred_ml_framework == MLFramework::DEFAULT)) {
154  mlpack_kmeans_impl(normalized_ptrs,
155  denulled_output,
156  num_rows,
157  num_clusters,
158  num_iterations,
159  kmeans_init_strategy);
160  did_execute = true;
161  }
162 #endif
163  if (!did_execute) {
164  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
165  " ML library to support kmeans implementation.");
166  }
167 
168  if (data_is_masked) {
169  unmask_data(denulled_output,
170  denulled_data.reverse_index_map,
171  output_clusters.ptr_,
172  denulled_data.unmasked_num_rows,
173  inline_null_value<int32_t>());
174  }
175  } catch (std::runtime_error& e) {
176  return mgr.ERROR_MESSAGE(e.what());
177  }
178  return input_ids.size();
179 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
KMeansInitStrategy get_kmeans_init_type(const std::string &init_type_str)
std::string getString() const
Definition: heavydbTypes.h:641
DEVICE int64_t size() const
MaskedData< T > denull_data(const ColumnList< T > &features)
MLFramework get_ml_framework(const std::string &ml_framework_str)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
void unmask_data(const T *masked_input, const std::vector< int32_t > &reverse_index_map, T *unmasked_output, const int64_t num_unmasked_rows, const T null_val)
std::vector< std::vector< T > > z_std_normalize_data(const std::vector< T * > &input_data, const int64_t num_rows)

+ Here is the call graph for this function:

EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_1 ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
Column< int64_t > &  output_coef_idx,
Column< TextEncodingDict > &  output_feature,
Column< int64_t > &  output_sub_coef_idx,
Column< TextEncodingDict > &  output_sub_feature,
Column< double > &  output_coef 
)

Definition at line 87 of file MLTableFunctions.cpp.

References g_ml_models, get_model_features(), MLModelMap::getModel(), Column< TextEncodingDict >::getOrAddTransient(), and TableFunctionManager::set_output_row_size().

Referenced by linear_reg_coefs__cpu_2().

93  {
94  try {
95  const auto linear_reg_model = std::dynamic_pointer_cast<LinearRegressionModel>(
96  g_ml_models.getModel(model_name));
97  if (!linear_reg_model) {
98  throw std::runtime_error("Model is not of type linear regression.");
99  }
100 
101  const auto& coefs = linear_reg_model->getCoefs();
102  const auto& cat_feature_keys = linear_reg_model->getCatFeatureKeys();
103  const int64_t num_sub_coefs = static_cast<int64_t>(coefs.size());
104  const int64_t num_cat_features = static_cast<int64_t>(cat_feature_keys.size());
105  mgr.set_output_row_size(num_sub_coefs);
106 
107  std::vector<std::string> feature_names =
108  get_model_features(model_name, linear_reg_model);
109  feature_names.insert(feature_names.begin(), "intercept");
110 
111  for (int64_t sub_coef_idx = 0, coef_idx = 0; sub_coef_idx < num_sub_coefs;
112  ++coef_idx) {
113  if (num_cat_features >= coef_idx && coef_idx > 0) {
114  const auto& col_cat_feature_keys = cat_feature_keys[coef_idx - 1];
115  int64_t col_cat_feature_idx = 1;
116  for (const auto& col_cat_feature_key : col_cat_feature_keys) {
117  output_coef_idx[sub_coef_idx] = coef_idx;
118  if (feature_names[coef_idx].empty()) {
119  output_feature[sub_coef_idx] = inline_null_value<TextEncodingDict>();
120  } else {
121  output_feature[sub_coef_idx] =
122  output_feature.getOrAddTransient(feature_names[coef_idx]);
123  }
124  output_sub_coef_idx[sub_coef_idx] = col_cat_feature_idx++;
125  output_sub_feature[sub_coef_idx] =
126  output_sub_feature.getOrAddTransient(col_cat_feature_key);
127  output_coef[sub_coef_idx] = coefs[sub_coef_idx];
128  ++sub_coef_idx;
129  }
130  } else {
131  output_coef_idx[sub_coef_idx] = coef_idx;
132  if (feature_names[coef_idx].empty()) {
133  output_feature[sub_coef_idx] = inline_null_value<TextEncodingDict>();
134  } else {
135  output_feature[sub_coef_idx] =
136  output_feature.getOrAddTransient(feature_names[coef_idx]);
137  }
138  output_sub_coef_idx[sub_coef_idx] = 1;
139  output_sub_feature[sub_coef_idx] = inline_null_value<TextEncodingDict>();
140  output_coef[sub_coef_idx] = coefs[sub_coef_idx];
141  ++sub_coef_idx;
142  }
143  }
144 
145  return num_sub_coefs;
146  } catch (std::runtime_error& e) {
147  return mgr.ERROR_MESSAGE(e.what());
148  }
149 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:50
MLModelMap g_ml_models
Definition: MLModel.h:124
std::vector< std::string > get_model_features(const std::string &model_name, const std::shared_ptr< AbstractMLModel > &model)
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_2 ( TableFunctionManager &  mgr,
const Column< TextEncodingDict > &  model_name,
Column< int64_t > &  output_coef_idx,
Column< TextEncodingDict > &  output_feature,
Column< int64_t > &  output_sub_coef_idx,
Column< TextEncodingDict > &  output_sub_feature,
Column< double > &  output_coef 
)

Definition at line 152 of file MLTableFunctions.cpp.

References Column< TextEncodingDict >::getString(), linear_reg_coefs__cpu_1(), and Column< TextEncodingDict >::size().

158  {
159  if (model_name.size() != 1) {
160  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
161  }
162  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
163  return linear_reg_coefs__cpu_1(mgr,
164  model_name_text_enc_none,
165  output_coef_idx,
166  output_feature,
167  output_sub_coef_idx,
168  output_sub_feature,
169  output_coef);
170 }
DEVICE const std::string getString(int64_t index) const
EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)
DEVICE int64_t size() const

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 342 of file MLTableFunctions.hpp.

References linear_reg_fit_impl().

348  {
349  std::vector<std::vector<std::string>> empty_cat_feature_keys;
350  return linear_reg_fit_impl(mgr,
351  model_name,
352  input_labels,
353  input_features,
354  empty_cat_feature_keys,
355  preferred_ml_framework_str,
356  model_metadata,
357  output_model_name);
358 }
NEVER_INLINE HOST int32_t linear_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const ColumnList< T > &  input_numeric_features,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 506 of file MLTableFunctions.hpp.

References CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), CategoricalFeaturesBuilder< T >::getFeatures(), and linear_reg_fit_impl().

515  {
516  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
517  input_numeric_features,
518  cat_top_k,
519  cat_min_fraction,
520  false /* cat_include_others */);
521 
522  return linear_reg_fit_impl(mgr,
523  model_name,
524  input_labels,
525  cat_features_builder.getFeatures(),
526  cat_features_builder.getCatFeatureKeys(),
527  preferred_ml_framework_str,
528  model_metadata,
529  output_model_name);
530 }
NEVER_INLINE HOST int32_t linear_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 547 of file MLTableFunctions.hpp.

References CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), CategoricalFeaturesBuilder< T >::getFeatures(), and linear_reg_fit_impl().

555  {
556  CategoricalFeaturesBuilder<T> cat_features_builder(
557  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
558 
559  return linear_reg_fit_impl(mgr,
560  model_name,
561  input_labels,
562  cat_features_builder.getFeatures(),
563  cat_features_builder.getCatFeatureKeys(),
564  preferred_ml_framework_str,
565  model_metadata,
566  output_model_name);
567 }
NEVER_INLINE HOST int32_t linear_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t linear_reg_fit_impl ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
const std::vector< std::vector< std::string >> &  cat_feature_keys,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 262 of file MLTableFunctions.hpp.

References MLModelMap::addModel(), DEFAULT, TableFunctions_Namespace::denull_data(), g_ml_models, get_ml_framework(), Column< TextEncodingDict >::getOrAddTransient(), TextEncodingNone::getString(), INVALID, MLPACK, ColumnList< T >::numCols(), ONEDAL, pluck_ptrs(), TableFunctionManager::set_output_row_size(), and Column< T >::size().

Referenced by linear_reg_fit__cpu_template().

269  {
270  if (input_labels.size() == 0) {
271  return mgr.ERROR_MESSAGE(
272  "No rows exist in training data. Training data must at least contain 1 row.");
273  }
274  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
275  if (preferred_ml_framework == MLFramework::INVALID) {
276  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
277  preferred_ml_framework_str.getString());
278  }
279  const auto denulled_data = denull_data(input_labels, input_features);
280  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
281  const auto features_ptrs =
282  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
283  const int64_t num_coefs = input_features.numCols() + 1;
284  mgr.set_output_row_size(num_coefs);
285  std::vector<int64_t> coef_idxs(num_coefs);
286  std::vector<double> coefs(num_coefs);
287  try {
288  bool did_execute = false;
289 #ifdef HAVE_ONEDAL
290  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
291  preferred_ml_framework == MLFramework::DEFAULT)) {
292  onedal_linear_reg_fit_impl(labels_ptrs[0],
293  features_ptrs,
294  coef_idxs.data(),
295  coefs.data(),
296  denulled_data.masked_num_rows);
297  did_execute = true;
298  }
299 #endif
300 #ifdef HAVE_MLPACK
301  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
302  preferred_ml_framework == MLFramework::DEFAULT)) {
303  mlpack_linear_reg_fit_impl(labels_ptrs[0],
304  features_ptrs,
305  coef_idxs.data(),
306  coefs.data(),
307  denulled_data.masked_num_rows);
308  did_execute = true;
309  }
310 #endif
311  if (!did_execute) {
312  return mgr.ERROR_MESSAGE(
313  "Cannot find " + preferred_ml_framework_str.getString() +
314  " ML library to support linear regression implementation.");
315  }
316  } catch (std::runtime_error& e) {
317  return mgr.ERROR_MESSAGE(e.what());
318  }
319  auto model =
320  std::make_shared<LinearRegressionModel>(coefs, model_metadata, cat_feature_keys);
321  g_ml_models.addModel(model_name, model);
322  const std::string model_name_str = model_name.getString();
323  const TextEncodingDict model_name_str_id =
324  output_model_name.getOrAddTransient(model_name);
325  output_model_name[0] = model_name_str_id;
326  return 1;
327 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::string getString() const
Definition: heavydbTypes.h:641
DEVICE int64_t size() const
DEVICE int64_t numCols() const
MaskedData< T > denull_data(const ColumnList< T > &features)
MLFramework get_ml_framework(const std::string &ml_framework_str)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
void addModel(const std::string &model_name, std::shared_ptr< AbstractMLModel > model)
Definition: MLModel.h:37
MLModelMap g_ml_models
Definition: MLModel.h:124
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< K > &  input_ids,
const ColumnList< T > &  input_features,
const TextEncodingNone &  preferred_ml_framework_str,
Column< K > &  output_ids,
Column< T > &  output_predictions 
)

Definition at line 1704 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), ml_reg_predict_impl(), and ColumnList< T >::numCols().

Referenced by ml_reg_predict__cpu_template().

1710  {
1711  try {
1712  const auto model = g_ml_models.getModel(model_name);
1713  check_model_params(model, 0, input_features.numCols());
1714  return ml_reg_predict_impl(mgr,
1715  model,
1716  input_ids,
1717  input_features,
1718  preferred_ml_framework_str,
1719  output_ids,
1720  output_predictions);
1721  } catch (std::runtime_error& e) {
1722  const std::string error_str(e.what());
1723  return mgr.ERROR_MESSAGE(error_str);
1724  }
1725 }
DEVICE int64_t numCols() const
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:50
MLModelMap g_ml_models
Definition: MLModel.h:124
NEVER_INLINE HOST int32_t ml_reg_predict_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< K > &  input_ids,
const ColumnList< TextEncodingDict > &  input_cat_features,
const ColumnList< T > &  input_numeric_features,
const TextEncodingNone &  preferred_ml_framework_str,
Column< K > &  output_ids,
Column< T > &  output_predictions 
)

Definition at line 1740 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), ml_reg_predict_impl(), ColumnList< T >::numCols(), and ColumnList< TextEncodingDict >::numCols().

1747  {
1748  try {
1749  const auto model = g_ml_models.getModel(model_name);
1750  check_model_params(
1751  model, input_cat_features.numCols(), input_numeric_features.numCols());
1752  CategoricalFeaturesBuilder<T> cat_features_builder(
1753  input_cat_features, input_numeric_features, model->getCatFeatureKeys());
1754  return ml_reg_predict_impl(mgr,
1755  model,
1756  input_ids,
1757  cat_features_builder.getFeatures(),
1758  preferred_ml_framework_str,
1759  output_ids,
1760  output_predictions);
1761  } catch (std::runtime_error& e) {
1762  const std::string error_str(e.what());
1763  return mgr.ERROR_MESSAGE(error_str);
1764  }
1765 }
DEVICE int64_t numCols() const
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:50
DEVICE int64_t numCols() const
MLModelMap g_ml_models
Definition: MLModel.h:124
NEVER_INLINE HOST int32_t ml_reg_predict_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)

+ Here is the call graph for this function:

template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< K > &  input_ids,
const ColumnList< TextEncodingDict > &  input_cat_features,
const TextEncodingNone &  preferred_ml_framework_str,
Column< K > &  output_ids,
Column< T > &  output_predictions 
)

Definition at line 1780 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), ml_reg_predict_impl(), and ColumnList< TextEncodingDict >::numCols().

1786  {
1787  try {
1788  const auto model = g_ml_models.getModel(model_name);
1789  check_model_params(model, input_cat_features.numCols(), 0);
1790  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
1791  model->getCatFeatureKeys());
1792  return ml_reg_predict_impl(mgr,
1793  model,
1794  input_ids,
1795  cat_features_builder.getFeatures(),
1796  preferred_ml_framework_str,
1797  output_ids,
1798  output_predictions);
1799  } catch (std::runtime_error& e) {
1800  const std::string error_str(e.what());
1801  return mgr.ERROR_MESSAGE(error_str);
1802  }
1803 }
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:50
DEVICE int64_t numCols() const
MLModelMap g_ml_models
Definition: MLModel.h:124
NEVER_INLINE HOST int32_t ml_reg_predict_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)

+ Here is the call graph for this function:

template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template ( TableFunctionManager &  mgr,
const Column< TextEncodingDict > &  model_name,
const Column< K > &  input_ids,
const ColumnList< T > &  input_features,
const TextEncodingNone &  preferred_ml_framework_str,
Column< K > &  output_ids,
Column< T > &  output_predictions 
)

Definition at line 1818 of file MLTableFunctions.hpp.

References Column< TextEncodingDict >::getString(), ml_reg_predict__cpu_template(), and Column< TextEncodingDict >::size().

1824  {
1825  if (model_name.size() != 1) {
1826  return mgr.ERROR_MESSAGE("Expected only one row in model CURSOR.");
1827  }
1828  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
1829  return ml_reg_predict__cpu_template(mgr,
1830  model_name_text_enc_none,
1831  input_ids,
1832  input_features,
1833  preferred_ml_framework_str,
1834  output_ids,
1835  output_predictions);
1836 }
DEVICE const std::string getString(int64_t index) const
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
DEVICE int64_t size() const

+ Here is the call graph for this function:

template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template ( TableFunctionManager &  mgr,
const Column< TextEncodingDict > &  model_name,
const Column< K > &  input_ids,
const ColumnList< TextEncodingDict > &  input_cat_features,
const ColumnList< T > &  input_numeric_features,
const TextEncodingNone &  preferred_ml_framework_str,
Column< K > &  output_ids,
Column< T > &  output_predictions 
)

Definition at line 1851 of file MLTableFunctions.hpp.

References Column< TextEncodingDict >::getString(), ml_reg_predict__cpu_template(), and Column< TextEncodingDict >::size().

1858  {
1859  if (model_name.size() != 1) {
1860  return mgr.ERROR_MESSAGE("Expected only one row in model CURSOR.");
1861  }
1862  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
1863  return ml_reg_predict__cpu_template(mgr,
1864  model_name_text_enc_none,
1865  input_ids,
1866  input_cat_features,
1867  input_numeric_features,
1868  preferred_ml_framework_str,
1869  output_ids,
1870  output_predictions);
1871 }
DEVICE const std::string getString(int64_t index) const
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
DEVICE int64_t size() const

+ Here is the call graph for this function:

template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template ( TableFunctionManager &  mgr,
const Column< TextEncodingDict > &  model_name,
const Column< K > &  input_ids,
const ColumnList< TextEncodingDict > &  input_cat_features,
const TextEncodingNone &  preferred_ml_framework_str,
Column< K > &  output_ids,
Column< T > &  output_predictions 
)

Definition at line 1886 of file MLTableFunctions.hpp.

References Column< TextEncodingDict >::getString(), ml_reg_predict__cpu_template(), and Column< TextEncodingDict >::size().

1892  {
1893  if (model_name.size() != 1) {
1894  return mgr.ERROR_MESSAGE("Expected only one row in model CURSOR.");
1895  }
1896  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
1897  return ml_reg_predict__cpu_template(mgr,
1898  model_name_text_enc_none,
1899  input_ids,
1900  input_cat_features,
1901  preferred_ml_framework_str,
1902  output_ids,
1903  output_predictions);
1904 }
DEVICE const std::string getString(int64_t index) const
NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
DEVICE int64_t size() const

+ Here is the call graph for this function:

template<typename T , typename K >
NEVER_INLINE HOST int32_t ml_reg_predict_impl ( TableFunctionManager &  mgr,
const std::shared_ptr< AbstractMLModel > &  model,
const Column< K > &  input_ids,
const ColumnList< T > &  input_features,
const TextEncodingNone &  preferred_ml_framework_str,
Column< K > &  output_ids,
Column< T > &  output_predictions 
)

Definition at line 1579 of file MLTableFunctions.hpp.

References CHECK, DECISION_TREE_REG, DEFAULT, TableFunctions_Namespace::denull_data(), GBT_REG, get_ml_framework(), TextEncodingNone::getString(), INVALID, LINEAR_REG, MLPACK, ColumnList< T >::numCols(), ONEDAL, pluck_ptrs(), Column< T >::ptr_, RANDOM_FOREST_REG, TableFunctionManager::set_output_row_size(), Column< T >::size(), heavydb.dtypes::T, and TableFunctions_Namespace::unmask_data().

Referenced by ml_reg_predict__cpu_template(), and r2_score_impl().

1585  {
1586  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
1587  if (preferred_ml_framework == MLFramework::INVALID) {
1588  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
1589  preferred_ml_framework_str.getString());
1590  }
1591  const auto denulled_data = denull_data(input_features);
1592  const int64_t num_rows = denulled_data.masked_num_rows;
1593  const bool data_is_masked =
1594  denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
1595  std::vector<T> denulled_output_allocation(data_is_masked ? num_rows : 0);
1596  mgr.set_output_row_size(input_ids.size());
1597  T* denulled_output =
1598  data_is_masked ? denulled_output_allocation.data() : output_predictions.ptr_;
1599  const auto features_ptrs = pluck_ptrs(denulled_data.data, 0L, input_features.numCols());
1600 
1601  try {
1602  bool did_execute = false;
1603  const auto model_type = model->getModelType();
1604  switch (model_type) {
1605  case MLModelType::LINEAR_REG: {
1606  const auto linear_reg_model =
1607  std::dynamic_pointer_cast<LinearRegressionModel>(model);
1608  CHECK(linear_reg_model);
1609 #ifdef HAVE_ONEDAL
1610  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1611  preferred_ml_framework == MLFramework::DEFAULT)) {
1612  onedal_linear_reg_predict_impl(
1613  linear_reg_model, features_ptrs, denulled_output, num_rows);
1614  did_execute = true;
1615  }
1616 #endif
1617 #ifdef HAVE_MLPACK
1618  if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
1619  preferred_ml_framework == MLFramework::DEFAULT)) {
1620  mlpack_linear_reg_predict_impl(
1621  linear_reg_model, features_ptrs, denulled_output, num_rows);
1622  did_execute = true;
1623  }
1624 #endif
1625  break;
1626  }
1627  case MLModelType::DECISION_TREE_REG: {
1628 #ifdef HAVE_ONEDAL
1629  const auto decision_tree_reg_model =
1630  std::dynamic_pointer_cast<DecisionTreeRegressionModel>(model);
1631  CHECK(decision_tree_reg_model);
1632  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1633  preferred_ml_framework == MLFramework::DEFAULT)) {
1634  onedal_decision_tree_reg_predict_impl(
1635  decision_tree_reg_model, features_ptrs, denulled_output, num_rows);
1636  did_execute = true;
1637  }
1638 #endif
1639  break;
1640  }
1641  case MLModelType::GBT_REG: {
1642 #ifdef HAVE_ONEDAL
1643  const auto gbt_reg_model = std::dynamic_pointer_cast<GbtRegressionModel>(model);
1644  CHECK(gbt_reg_model);
1645  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1646  preferred_ml_framework == MLFramework::DEFAULT)) {
1647  onedal_gbt_reg_predict_impl(
1648  gbt_reg_model, features_ptrs, denulled_output, num_rows);
1649  did_execute = true;
1650  }
1651 #endif
1652  break;
1653  }
1654  case MLModelType::RANDOM_FOREST_REG: {
1655 #ifdef HAVE_ONEDAL
1656  const auto random_forest_reg_model =
1657  std::dynamic_pointer_cast<RandomForestRegressionModel>(model);
1658  CHECK(random_forest_reg_model);
1659  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1660  preferred_ml_framework == MLFramework::DEFAULT)) {
1661  onedal_random_forest_reg_predict_impl(
1662  random_forest_reg_model, features_ptrs, denulled_output, num_rows);
1663  did_execute = true;
1664  }
1665 #endif
1666  break;
1667  }
1668  default: {
1669  throw std::runtime_error("Unsupported model type");
1670  }
1671  }
1672  if (!did_execute) {
1673  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
1674  " ML library to support model implementation.");
1675  }
1676  } catch (std::runtime_error& e) {
1677  const std::string error_str(e.what());
1678  return mgr.ERROR_MESSAGE(error_str);
1679  }
1680  output_ids = input_ids;
1681  if (data_is_masked) {
1682  unmask_data(denulled_output,
1683  denulled_data.reverse_index_map,
1684  output_predictions.ptr_,
1685  denulled_data.unmasked_num_rows,
1686  inline_null_value<T>());
1687  }
1688  return input_ids.size();
1689 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::string getString() const
Definition: heavydbTypes.h:641
DEVICE int64_t size() const
DEVICE int64_t numCols() const
MaskedData< T > denull_data(const ColumnList< T > &features)
MLFramework get_ml_framework(const std::string &ml_framework_str)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
void unmask_data(const T *masked_input, const std::vector< int32_t > &reverse_index_map, T *unmasked_output, const int64_t num_unmasked_rows, const T null_val)
#define CHECK(condition)
Definition: Logger.h:291

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

EXTENSION_NOINLINE_HOST int32_t pca_fit__cpu_1 ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const ColumnList< TextEncodingDict > &  input_cat_features,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 67 of file MLTableFunctions.cpp.

References CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), CategoricalFeaturesBuilder< T >::getFeatures(), and pca_fit_impl().

74  {
75  CategoricalFeaturesBuilder<double> cat_features_builder(
76  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
77  return pca_fit_impl(mgr,
78  model_name,
79  cat_features_builder.getFeatures(),
80  cat_features_builder.getCatFeatureKeys(),
81  preferred_ml_framework_str,
82  model_metadata,
83  output_model_name);
84 }
NEVER_INLINE HOST int32_t pca_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t pca_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const ColumnList< T > &  input_features,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 1500 of file MLTableFunctions.hpp.

References pca_fit_impl().

1505  {
1506  std::vector<std::vector<std::string>> empty_cat_feature_keys;
1507  return pca_fit_impl(mgr,
1508  model_name,
1509  input_features,
1510  empty_cat_feature_keys,
1511  preferred_ml_framework_str,
1512  model_metadata,
1513  output_model_name);
1514 }
NEVER_INLINE HOST int32_t pca_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t pca_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const ColumnList< TextEncodingDict > &  input_cat_features,
const ColumnList< T > &  input_numeric_features,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 1531 of file MLTableFunctions.hpp.

References CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), CategoricalFeaturesBuilder< T >::getFeatures(), and pca_fit_impl().

1539  {
1540  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
1541  input_numeric_features,
1542  cat_top_k,
1543  cat_min_fraction,
1544  false /* cat_include_others */);
1545  return pca_fit_impl(mgr,
1546  model_name,
1547  cat_features_builder.getFeatures(),
1548  cat_features_builder.getCatFeatureKeys(),
1549  preferred_ml_framework_str,
1550  model_metadata,
1551  output_model_name);
1552 }
NEVER_INLINE HOST int32_t pca_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t pca_fit_impl ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const ColumnList< T > &  input_features,
const std::vector< std::vector< std::string >> &  cat_feature_keys,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 1423 of file MLTableFunctions.hpp.

References MLModelMap::addModel(), DEFAULT, TableFunctions_Namespace::denull_data(), g_ml_models, get_ml_framework(), Column< TextEncodingDict >::getOrAddTransient(), TextEncodingNone::getString(), INVALID, ColumnList< T >::numCols(), ONEDAL, pluck_ptrs(), TableFunctionManager::set_output_row_size(), ColumnList< T >::size(), and z_std_normalize_data_with_summary_stats().

Referenced by pca_fit__cpu_1(), and pca_fit__cpu_template().

1429  {
1430  if (input_features.size() == 0) {
1431  return mgr.ERROR_MESSAGE(
1432  "No rows exist in training data. Training data must at least contain 1 row.");
1433  }
1434  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
1435  if (preferred_ml_framework == MLFramework::INVALID) {
1436  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
1437  preferred_ml_framework_str.getString());
1438  }
1439  try {
1440  const auto denulled_data = denull_data(input_features);
1441  const int64_t num_rows = denulled_data.masked_num_rows;
1442  if (num_rows == 0) {
1443  return mgr.ERROR_MESSAGE(
1444  "No non-null rows exist in training data. Training data must at least contain "
1445  "1 "
1446  "non-null row.");
1447  }
1448  const auto features_ptrs =
1449  pluck_ptrs(denulled_data.data, 0L, input_features.numCols());
1450  // z_std_normalize_data_with_summary_stats can throw if std dev is 0
1451  const auto z_std_norm_summary_stats =
1452  z_std_normalize_data_with_summary_stats(denulled_data.data, num_rows);
1453  const auto normalized_ptrs =
1454  pluck_ptrs(z_std_norm_summary_stats.normalized_data,
1455  0L,
1456  z_std_norm_summary_stats.normalized_data.size());
1457  bool did_execute = false;
1458 #ifdef HAVE_ONEDAL
1459  if (preferred_ml_framework == MLFramework::ONEDAL ||
1460  preferred_ml_framework == MLFramework::DEFAULT) {
1461  const auto [eigenvectors, eigenvalues] =
1462  onedal_pca_impl(normalized_ptrs, denulled_data.masked_num_rows);
1463  auto model = std::make_shared<PcaModel>(z_std_norm_summary_stats.means,
1464  z_std_norm_summary_stats.std_devs,
1465  eigenvectors,
1466  eigenvalues,
1467  model_metadata,
1468  cat_feature_keys);
1469  g_ml_models.addModel(model_name, model);
1470  did_execute = true;
1471  }
1472 #endif
1473  if (!did_execute) {
1474  return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
1475  " ML library to support PCA implementation.");
1476  }
1477  mgr.set_output_row_size(1);
1478  const TextEncodingDict model_name_str_id =
1479  output_model_name.getOrAddTransient(model_name);
1480  output_model_name[0] = model_name_str_id;
1481  return 1;
1482  } catch (std::runtime_error& e) {
1483  return mgr.ERROR_MESSAGE(e.what());
1484  }
1485 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::string getString() const
Definition: heavydbTypes.h:641
DEVICE int64_t numCols() const
MaskedData< T > denull_data(const ColumnList< T > &features)
MLFramework get_ml_framework(const std::string &ml_framework_str)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
void addModel(const std::string &model_name, std::shared_ptr< AbstractMLModel > model)
Definition: MLModel.h:37
MLModelMap g_ml_models
Definition: MLModel.h:124
DEVICE int64_t size() const
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)
ZStdNormalizationSummaryStats< T > z_std_normalize_data_with_summary_stats(const std::vector< T * > &input_data, const int64_t num_rows)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
std::vector<const T*> pluck_ptrs ( const std::vector< std::vector< T >> &  data,
const int64_t  start_idx,
const int64_t  end_idx 
)

Definition at line 42 of file MLTableFunctions.hpp.

References CHECK_GE, CHECK_GT, and CHECK_LE.

Referenced by dbscan__cpu_template(), decision_tree_reg_impl(), gbt_reg_fit_impl(), kmeans__cpu_template(), linear_reg_fit_impl(), ml_reg_predict_impl(), pca_fit_impl(), and random_forest_reg_fit_impl().

44  {
45  std::vector<const T*> raw_ptrs;
46  CHECK_GE(start_idx, 0L);
47  CHECK_GT(end_idx, start_idx);
48  CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
49  for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
50  raw_ptrs.emplace_back(data[col_idx].data());
51  }
52  return raw_ptrs;
53 }
#define CHECK_GE(x, y)
Definition: Logger.h:306
#define CHECK_GT(x, y)
Definition: Logger.h:305
#define CHECK_LE(x, y)
Definition: Logger.h:304

+ Here is the caller graph for this function:

template<typename T >
std::vector<const T*> pluck_ptrs ( const std::vector< T * > &  data,
const int64_t  start_idx,
const int64_t  end_idx 
)

Definition at line 56 of file MLTableFunctions.hpp.

References CHECK_GE, CHECK_GT, and CHECK_LE.

58  {
59  std::vector<const T*> raw_ptrs;
60  CHECK_GE(start_idx, 0L);
61  CHECK_GT(end_idx, start_idx);
62  CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
63  for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
64  raw_ptrs.emplace_back(data[col_idx]);
65  }
66  return raw_ptrs;
67 }
#define CHECK_GE(x, y)
Definition: Logger.h:306
#define CHECK_GT(x, y)
Definition: Logger.h:305
#define CHECK_LE(x, y)
Definition: Logger.h:304
template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
Column< double > &  output_r2 
)

Definition at line 2000 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), ColumnList< T >::numCols(), and r2_score_impl().

Referenced by r2_score__cpu_template().

2004  {
2005  try {
2006  const auto model = g_ml_models.getModel(model_name);
2007  check_model_params(model, 0, input_features.numCols());
2008  return r2_score_impl(mgr, model, input_labels, input_features, output_r2);
2009  } catch (std::runtime_error& e) {
2010  const std::string error_str(e.what());
2011  return mgr.ERROR_MESSAGE(error_str);
2012  }
2013 }
NEVER_INLINE HOST int32_t r2_score_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
DEVICE int64_t numCols() const
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:50
MLModelMap g_ml_models
Definition: MLModel.h:124

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template ( TableFunctionManager &  mgr,
const Column< TextEncodingDict > &  model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
Column< double > &  output_r2 
)

Definition at line 2026 of file MLTableFunctions.hpp.

References Column< TextEncodingDict >::getString(), r2_score__cpu_template(), and Column< TextEncodingDict >::size().

2030  {
2031  if (model_name.size() != 1) {
2032  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
2033  }
2034  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
2035  return r2_score__cpu_template(
2036  mgr, model_name_text_enc_none, input_labels, input_features, output_r2);
2037 }
DEVICE const std::string getString(int64_t index) const
DEVICE int64_t size() const
NEVER_INLINE HOST int32_t r2_score__cpu_template(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const ColumnList< T > &  input_numeric_features,
Column< double > &  output_r2 
)

Definition at line 2049 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), ColumnList< T >::numCols(), ColumnList< TextEncodingDict >::numCols(), and r2_score_impl().

2054  {
2055  try {
2056  const auto model = g_ml_models.getModel(model_name);
2057  check_model_params(
2058  model, input_cat_features.numCols(), input_numeric_features.numCols());
2059  CategoricalFeaturesBuilder<T> cat_features_builder(
2060  input_cat_features, input_numeric_features, model->getCatFeatureKeys());
2061  return r2_score_impl(
2062  mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2063  } catch (std::runtime_error& e) {
2064  const std::string error_str(e.what());
2065  return mgr.ERROR_MESSAGE(error_str);
2066  }
2067 }
NEVER_INLINE HOST int32_t r2_score_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
DEVICE int64_t numCols() const
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:50
DEVICE int64_t numCols() const
MLModelMap g_ml_models
Definition: MLModel.h:124

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
Column< double > &  output_r2 
)

Definition at line 2079 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), ColumnList< TextEncodingDict >::numCols(), and r2_score_impl().

2083  {
2084  try {
2085  const auto model = g_ml_models.getModel(model_name);
2086  check_model_params(model, input_cat_features.numCols(), 0);
2087  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
2088  model->getCatFeatureKeys());
2089  return r2_score_impl(
2090  mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2091  } catch (std::runtime_error& e) {
2092  const std::string error_str(e.what());
2093  return mgr.ERROR_MESSAGE(error_str);
2094  }
2095 }
NEVER_INLINE HOST int32_t r2_score_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:50
DEVICE int64_t numCols() const
MLModelMap g_ml_models
Definition: MLModel.h:124

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t r2_score__cpu_template ( TableFunctionManager &  mgr,
const Column< TextEncodingDict > &  model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const ColumnList< T > &  input_numeric_features,
Column< double > &  output_r2 
)

Definition at line 2107 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), Column< TextEncodingDict >::getString(), ColumnList< T >::numCols(), ColumnList< TextEncodingDict >::numCols(), r2_score_impl(), and Column< TextEncodingDict >::size().

2112  {
2113  if (model_name.size() != 1) {
2114  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
2115  }
2116  const std::string model_name_str{model_name.getString(0)};
2117  try {
2118  const auto model = g_ml_models.getModel(model_name_str);
2119  check_model_params(
2120  model, input_cat_features.numCols(), input_numeric_features.numCols());
2121  CategoricalFeaturesBuilder<T> cat_features_builder(
2122  input_cat_features, input_numeric_features, model->getCatFeatureKeys());
2123  return r2_score_impl(
2124  mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
2125  } catch (std::runtime_error& e) {
2126  const std::string error_str(e.what());
2127  return mgr.ERROR_MESSAGE(error_str);
2128  }
2129 }
DEVICE const std::string getString(int64_t index) const
NEVER_INLINE HOST int32_t r2_score_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)
DEVICE int64_t numCols() const
EXTENSION_NOINLINE_HOST void check_model_params(const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:50
DEVICE int64_t numCols() const
MLModelMap g_ml_models
Definition: MLModel.h:124
DEVICE int64_t size() const

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t r2_score_impl ( TableFunctionManager &  mgr,
const std::shared_ptr< AbstractMLModel > &  model,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
Column< double > &  output_r2 
)

Definition at line 1907 of file MLTableFunctions.hpp.

References TableFunctionManager::disable_output_allocations(), TableFunctionManager::enable_output_allocations(), get_column_mean(), max_inputs_per_thread, ml_reg_predict_impl(), threading_serial::parallel_for(), TableFunctionManager::set_output_row_size(), and Column< T >::size().

Referenced by r2_score__cpu_template().

1911  {
1912  const int64_t num_rows = input_labels.size();
1913  if (num_rows == 0) {
1914  return mgr.ERROR_MESSAGE(
1915  "No rows exist in evaluation data. Evaluation data must at least contain 1 row.");
1916  }
1917  std::vector<T> output_predictions_vec(num_rows);
1918  Column<T> output_predictions(output_predictions_vec);
1919  std::vector<int64_t> input_ids_vec(num_rows);
1920  std::vector<int64_t> output_ids_vec(num_rows);
1921  Column<int64_t> input_ids(input_ids_vec);
1922  Column<int64_t> output_ids(output_ids_vec);
1923  mgr.disable_output_allocations();
1924  TextEncodingNone ml_framework_encoding_none("DEFAULT");
1925 
1926  try {
1927  auto ret = ml_reg_predict_impl(mgr,
1928  model,
1929  input_ids,
1930  input_features,
1931  ml_framework_encoding_none,
1932  output_ids,
1933  output_predictions);
1934 
1935  if (ret < 0) {
1936  // A return of less than 0 symbolizes an error
1937  return ret;
1938  }
1939  } catch (std::runtime_error& e) {
1940  mgr.enable_output_allocations();
1941  return mgr.ERROR_MESSAGE(e.what());
1942  }
1943 
1944  mgr.enable_output_allocations();
1945  mgr.set_output_row_size(1);
1946 
1947  const auto labels_mean = get_column_mean(input_labels);
1948  const size_t max_thread_count = std::thread::hardware_concurrency();
1949  const size_t max_inputs_per_thread = 20000;
1950  const size_t num_threads = std::min(
1951  max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
1952 
1953  std::vector<double> local_sum_squared_regressions(num_threads, 0.0);
1954  std::vector<double> local_sum_squares(num_threads, 0.0);
1955 
1956  tbb::task_arena limited_arena(num_threads);
1957 
1958  limited_arena.execute([&] {
1959  tbb::parallel_for(
1960  tbb::blocked_range<int64_t>(0, num_rows),
1961  [&](const tbb::blocked_range<int64_t>& r) {
1962  const int64_t start_idx = r.begin();
1963  const int64_t end_idx = r.end();
1964  double local_sum_squared_regression{0.0};
1965  double local_sum_square{0.0};
1966  for (int64_t row_idx = start_idx; row_idx < end_idx; ++row_idx) {
1967  if (output_predictions[row_idx] != inline_null_value<T>()) {
1968  local_sum_squared_regression +=
1969  (input_labels[row_idx] - output_predictions[row_idx]) *
1970  (input_labels[row_idx] - output_predictions[row_idx]);
1971  local_sum_square += (input_labels[row_idx] - labels_mean) *
1972  (input_labels[row_idx] - labels_mean);
1973  }
1974  }
1975  const size_t thread_idx = tbb::this_task_arena::current_thread_index();
1976  local_sum_squared_regressions[thread_idx] += local_sum_squared_regression;
1977  local_sum_squares[thread_idx] += local_sum_square;
1978  });
1979  });
1980  double sum_squared_regression{0.0};
1981  double sum_squares{0.0};
1982  for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
1983  sum_squared_regression += local_sum_squared_regressions[thread_idx];
1984  sum_squares += local_sum_squares[thread_idx];
1985  }
1986  output_r2[0] = sum_squares == 0.0 ? 1.0 : 1.0 - (sum_squared_regression / sum_squares);
1987  return 1;
1988 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
DEVICE int64_t size() const
const size_t max_inputs_per_thread
void disable_output_allocations()
Definition: heavydbTypes.h:379
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
NEVER_INLINE HOST int32_t ml_reg_predict_impl(TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)
void enable_output_allocations()
Definition: heavydbTypes.h:381
NEVER_INLINE HOST double get_column_mean(const T *data, const int64_t num_rows)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
const int64_t  num_trees,
const double  obs_per_tree_fraction,
const int64_t  max_tree_depth,
const int64_t  features_per_node,
const double  impurity_threshold,
const bool  bootstrap,
const int64_t  min_obs_per_leaf_node,
const int64_t  min_obs_per_split_node,
const double  min_weight_fraction_in_leaf_node,
const double  min_impurity_decrease_in_split_node,
const int64_t  max_leaf_nodes,
const bool  use_histogram,
const TextEncodingNone &  var_importance_metric_str,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 1223 of file MLTableFunctions.hpp.

References random_forest_reg_fit_impl().

1242  {
1243  std::vector<std::vector<std::string>> empty_cat_feature_keys;
1244  return random_forest_reg_fit_impl(mgr,
1245  model_name,
1246  input_labels,
1247  input_features,
1248  empty_cat_feature_keys,
1249  num_trees,
1250  obs_per_tree_fraction,
1251  max_tree_depth,
1252  features_per_node,
1253  impurity_threshold,
1254  bootstrap,
1255  min_obs_per_leaf_node,
1256  min_obs_per_split_node,
1257  min_weight_fraction_in_leaf_node,
1258  min_impurity_decrease_in_split_node,
1259  max_leaf_nodes,
1260  use_histogram,
1261  var_importance_metric_str,
1262  preferred_ml_framework_str,
1263  model_metadata,
1264  output_model_name);
1265 }
NEVER_INLINE HOST int32_t random_forest_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const ColumnList< T > &  input_numeric_features,
const int64_t  num_trees,
const double  obs_per_tree_fraction,
const int64_t  max_tree_depth,
const int64_t  features_per_node,
const double  impurity_threshold,
const bool  bootstrap,
const int64_t  min_obs_per_leaf_node,
const int64_t  min_obs_per_split_node,
const double  min_weight_fraction_in_leaf_node,
const double  min_impurity_decrease_in_split_node,
const int64_t  max_leaf_nodes,
const bool  use_histogram,
const TextEncodingNone &  var_importance_metric_str,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 1294 of file MLTableFunctions.hpp.

References CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), CategoricalFeaturesBuilder< T >::getFeatures(), and random_forest_reg_fit_impl().

1317  {
1318  CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
1319  input_numeric_features,
1320  cat_top_k,
1321  cat_min_fraction,
1322  false /* cat_include_others */);
1323  return random_forest_reg_fit_impl(mgr,
1324  model_name,
1325  input_labels,
1326  cat_features_builder.getFeatures(),
1327  cat_features_builder.getCatFeatureKeys(),
1328  num_trees,
1329  obs_per_tree_fraction,
1330  max_tree_depth,
1331  features_per_node,
1332  impurity_threshold,
1333  bootstrap,
1334  min_obs_per_leaf_node,
1335  min_obs_per_split_node,
1336  min_weight_fraction_in_leaf_node,
1337  min_impurity_decrease_in_split_node,
1338  max_leaf_nodes,
1339  use_histogram,
1340  var_importance_metric_str,
1341  preferred_ml_framework_str,
1342  model_metadata,
1343  output_model_name);
1344 }
NEVER_INLINE HOST int32_t random_forest_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< TextEncodingDict > &  input_cat_features,
const int64_t  num_trees,
const double  obs_per_tree_fraction,
const int64_t  max_tree_depth,
const int64_t  features_per_node,
const double  impurity_threshold,
const bool  bootstrap,
const int64_t  min_obs_per_leaf_node,
const int64_t  min_obs_per_split_node,
const double  min_weight_fraction_in_leaf_node,
const double  min_impurity_decrease_in_split_node,
const int64_t  max_leaf_nodes,
const bool  use_histogram,
const TextEncodingNone &  var_importance_metric_str,
const int32_t  cat_top_k,
const float  cat_min_fraction,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 1373 of file MLTableFunctions.hpp.

References CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), CategoricalFeaturesBuilder< T >::getFeatures(), and random_forest_reg_fit_impl().

1395  {
1396  CategoricalFeaturesBuilder<T> cat_features_builder(
1397  input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
1398  return random_forest_reg_fit_impl(mgr,
1399  model_name,
1400  input_labels,
1401  cat_features_builder.getFeatures(),
1402  cat_features_builder.getCatFeatureKeys(),
1403  num_trees,
1404  obs_per_tree_fraction,
1405  max_tree_depth,
1406  features_per_node,
1407  impurity_threshold,
1408  bootstrap,
1409  min_obs_per_leaf_node,
1410  min_obs_per_split_node,
1411  min_weight_fraction_in_leaf_node,
1412  min_impurity_decrease_in_split_node,
1413  max_leaf_nodes,
1414  use_histogram,
1415  var_importance_metric_str,
1416  preferred_ml_framework_str,
1417  model_metadata,
1418  output_model_name);
1419 }
NEVER_INLINE HOST int32_t random_forest_reg_fit_impl(TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

+ Here is the call graph for this function:

template<typename T >
NEVER_INLINE HOST int32_t random_forest_reg_fit_impl ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
const Column< T > &  input_labels,
const ColumnList< T > &  input_features,
const std::vector< std::vector< std::string >> &  cat_feature_keys,
const int64_t  num_trees,
const double  obs_per_tree_fraction,
const int64_t  max_tree_depth,
const int64_t  features_per_node,
const double  impurity_threshold,
const bool  bootstrap,
const int64_t  min_obs_per_leaf_node,
const int64_t  min_obs_per_split_node,
const double  min_weight_fraction_in_leaf_node,
const double  min_impurity_decrease_in_split_node,
const int64_t  max_leaf_nodes,
const bool  use_histogram,
const TextEncodingNone &  var_importance_metric_str,
const TextEncodingNone &  preferred_ml_framework_str,
const TextEncodingNone &  model_metadata,
Column< TextEncodingDict > &  output_model_name 
)

Definition at line 1082 of file MLTableFunctions.hpp.

References DEFAULT, TableFunctions_Namespace::denull_data(), get_ml_framework(), get_var_importance_metric(), Column< TextEncodingDict >::getOrAddTransient(), TextEncodingNone::getString(), INVALID, MLPACK, ColumnList< T >::numCols(), ONEDAL, pluck_ptrs(), TableFunctionManager::set_output_row_size(), Column< T >::size(), and heavydb.dtypes::T.

Referenced by random_forest_reg_fit__cpu_template().

1102  {
1103  if (input_labels.size() == 0) {
1104  return mgr.ERROR_MESSAGE(
1105  "No rows exist in training data. Training data must at least contain 1 row.");
1106  }
1107  const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
1108  if (preferred_ml_framework == MLFramework::INVALID) {
1109  return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
1110  preferred_ml_framework_str.getString());
1111  }
1112  if (preferred_ml_framework == MLFramework::MLPACK) {
1113  return mgr.ERROR_MESSAGE(
1114  "Only OneDAL framework supported for random forest regression.");
1115  }
1116 #ifndef HAVE_ONEDAL
1117  return mgr.ERROR_MESSAGE(
1118  "Only OneDAL framework supported for random forest regression.");
1119 #endif
1120 
1121  const auto denulled_data = denull_data(input_labels, input_features);
1122  const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
1123  const auto features_ptrs =
1124  pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
1125  mgr.set_output_row_size(1);
1126  try {
1127  bool did_execute = false;
1128  const auto var_importance_metric =
1129  get_var_importance_metric(var_importance_metric_str);
1130  if (var_importance_metric == VarImportanceMetric::INVALID) {
1131  return mgr.ERROR_MESSAGE("Invalid variable importance metric: " +
1132  var_importance_metric_str.getString());
1133  }
1134 #ifdef HAVE_ONEDAL
1135  if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
1136  preferred_ml_framework == MLFramework::DEFAULT)) {
1137  if (use_histogram) {
1138  onedal_random_forest_reg_fit_impl<T, decision_forest::regression::training::hist>(
1139  model_name,
1140  labels_ptrs[0],
1141  features_ptrs,
1142  model_metadata,
1143  cat_feature_keys,
1144  denulled_data.masked_num_rows,
1145  num_trees,
1146  obs_per_tree_fraction,
1147  max_tree_depth,
1148  features_per_node,
1149  impurity_threshold,
1150  bootstrap,
1151  min_obs_per_leaf_node,
1152  min_obs_per_split_node,
1153  min_weight_fraction_in_leaf_node,
1154  min_impurity_decrease_in_split_node,
1155  max_leaf_nodes,
1156  var_importance_metric);
1157  } else {
1158  onedal_random_forest_reg_fit_impl<
1159  T,
1160  decision_forest::regression::training::defaultDense>(
1161  model_name,
1162  labels_ptrs[0],
1163  features_ptrs,
1164  model_metadata,
1165  cat_feature_keys,
1166  denulled_data.masked_num_rows,
1167  num_trees,
1168  obs_per_tree_fraction,
1169  max_tree_depth,
1170  features_per_node,
1171  impurity_threshold,
1172  bootstrap,
1173  min_obs_per_leaf_node,
1174  min_obs_per_split_node,
1175  min_weight_fraction_in_leaf_node,
1176  min_impurity_decrease_in_split_node,
1177  max_leaf_nodes,
1178  var_importance_metric);
1179  }
1180  const TextEncodingDict model_name_str_id =
1181  output_model_name.getOrAddTransient(model_name);
1182  output_model_name[0] = model_name_str_id;
1183  did_execute = true;
1184  }
1185 #endif
1186  if (!did_execute) {
1187  return mgr.ERROR_MESSAGE(
1188  "Cannot find " + preferred_ml_framework_str.getString() +
1189  " ML library to support random forest regression implementation.");
1190  }
1191  } catch (std::runtime_error& e) {
1192  return mgr.ERROR_MESSAGE(e.what());
1193  }
1194  return 1;
1195 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::string getString() const
Definition: heavydbTypes.h:641
DEVICE int64_t size() const
DEVICE int64_t numCols() const
MaskedData< T > denull_data(const ColumnList< T > &features)
MLFramework get_ml_framework(const std::string &ml_framework_str)
std::vector< const T * > pluck_ptrs(const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)
VarImportanceMetric get_var_importance_metric(const std::string &var_importance_metric_str)
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_1 ( TableFunctionManager &  mgr,
const TextEncodingNone &  model_name,
Column< int64_t > &  feature_id,
Column< TextEncodingDict > &  feature,
Column< int64_t > &  sub_feature_id,
Column< TextEncodingDict > &  sub_feature,
Column< double > &  importance_score 
)

Definition at line 173 of file MLTableFunctions.cpp.

References g_ml_models, get_model_features(), MLModelMap::getModel(), Column< TextEncodingDict >::getOrAddTransient(), and TableFunctionManager::set_output_row_size().

Referenced by random_forest_reg_var_importance__cpu_2().

179  {
180 #ifndef HAVE_ONEDAL
181  return mgr.ERROR_MESSAGE(
182  "Only OneDAL framework supported for random forest regression.");
183 #endif
184  try {
185 #ifdef HAVE_ONEDAL
186  const auto base_model = g_ml_models.getModel(model_name);
187  const auto rand_forest_model =
188  std::dynamic_pointer_cast<RandomForestRegressionModel>(base_model);
189  if (!rand_forest_model) {
190  throw std::runtime_error("Model is not of type random forest.");
191  }
192  const auto& variable_importance_scores =
193  onedal_random_forest_reg_var_importance_impl(rand_forest_model);
194  const int64_t num_features = variable_importance_scores.size();
195  mgr.set_output_row_size(num_features);
196  if (num_features == 0) {
197  return mgr.ERROR_MESSAGE("Variable importance not computed for this model.");
198  }
199  if (num_features != rand_forest_model->getNumFeatures()) {
200  return mgr.ERROR_MESSAGE(
201  "Mismatch in number of features and number of variable importance metrics.");
202  }
203  const auto num_logical_features = rand_forest_model->getNumLogicalFeatures();
204  std::vector<std::string> feature_names =
205  get_model_features(model_name, rand_forest_model);
206 
207  int64_t physical_feature_idx = 0;
208  const auto& cat_feature_keys = rand_forest_model->getCatFeatureKeys();
209  const auto num_cat_features = rand_forest_model->getNumCatFeatures();
210  for (int64_t feature_idx = 0; feature_idx < num_logical_features; ++feature_idx) {
211  // Make feature ids start at 1, not 0
212  if (feature_idx < num_cat_features) {
213  const auto& col_cat_feature_keys = cat_feature_keys[feature_idx];
214  int64_t sub_feature_idx = 1;
215  for (const auto& col_cat_feature_key : col_cat_feature_keys) {
216  feature_id[physical_feature_idx] = feature_idx + 1;
217  if (feature_names[feature_idx].empty()) {
218  feature[physical_feature_idx] = inline_null_value<TextEncodingDict>();
219  } else {
220  feature[physical_feature_idx] =
221  feature.getOrAddTransient(feature_names[feature_idx]);
222  }
223  sub_feature_id[physical_feature_idx] = sub_feature_idx++;
224  const TextEncodingDict feature_sub_key =
225  sub_feature.getOrAddTransient(col_cat_feature_key);
226  sub_feature[physical_feature_idx] = feature_sub_key;
227  importance_score[physical_feature_idx] =
228  variable_importance_scores[physical_feature_idx];
229  physical_feature_idx++;
230  }
231  } else {
232  feature_id[physical_feature_idx] = feature_idx + 1;
233  if (feature_names[feature_idx].empty()) {
234  feature[physical_feature_idx] = inline_null_value<TextEncodingDict>();
235  } else {
236  feature[physical_feature_idx] =
237  feature.getOrAddTransient(feature_names[feature_idx]);
238  }
239  sub_feature_id[physical_feature_idx] = 1;
240  sub_feature[physical_feature_idx] = inline_null_value<TextEncodingDict>();
241  importance_score[physical_feature_idx] =
242  variable_importance_scores[physical_feature_idx];
243  physical_feature_idx++;
244  }
245  }
246  return num_features;
247 #endif
248  } catch (std::runtime_error& e) {
249  return mgr.ERROR_MESSAGE(e.what());
250  }
251 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:50
MLModelMap g_ml_models
Definition: MLModel.h:124
std::vector< std::string > get_model_features(const std::string &model_name, const std::shared_ptr< AbstractMLModel > &model)
DEVICE const TextEncodingDict getOrAddTransient(const std::string &str)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_2 ( TableFunctionManager &  mgr,
const Column< TextEncodingDict > &  model_name,
Column< int64_t > &  feature_id,
Column< TextEncodingDict > &  feature,
Column< int64_t > &  sub_feature_id,
Column< TextEncodingDict > &  sub_feature,
Column< double > &  importance_score 
)

Definition at line 254 of file MLTableFunctions.cpp.

References Column< TextEncodingDict >::getString(), random_forest_reg_var_importance__cpu_1(), and Column< TextEncodingDict >::size().

260  {
261  if (model_name.size() != 1) {
262  return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
263  }
264  TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
265  return random_forest_reg_var_importance__cpu_1(mgr,
266  model_name_text_enc_none,
267  feature_id,
268  feature,
269  sub_feature_id,
270  sub_feature,
271  importance_score);
272 }
DEVICE const std::string getString(int64_t index) const
EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_1(TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)
DEVICE int64_t size() const

+ Here is the call graph for this function:

EXTENSION_NOINLINE_HOST int32_t supported_ml_frameworks__cpu_ ( TableFunctionManager &  mgr,
Column< TextEncodingDict > &  output_ml_frameworks,
Column< bool > &  output_availability,
Column< bool > &  output_default 
)

Definition at line 8 of file MLTableFunctions.cpp.

References StringDictionaryProxy::getOrAddTransientBulk(), TableFunctionManager::set_output_row_size(), and Column< TextEncodingDict >::string_dict_proxy_.

11  {
12  const std::vector<std::string> ml_frameworks = {"onedal", "mlpack"};
13  const int32_t num_frameworks = ml_frameworks.size();
14  mgr.set_output_row_size(num_frameworks);
15  const std::vector<int32_t> ml_framework_string_ids =
16  output_ml_frameworks.string_dict_proxy_->getOrAddTransientBulk(ml_frameworks);
17 
18 #if defined(HAVE_ONEDAL) || defined(HAVE_MLPACK)
19  bool found_available_framework = false;
20  auto framework_found_actions = [&output_availability,
21  &output_default,
22  &found_available_framework](const int64_t out_row_idx) {
23  output_availability[out_row_idx] = true;
24  if (!found_available_framework) {
25  output_default[out_row_idx] = true;
26  found_available_framework = true;
27  } else {
28  output_default[out_row_idx] = false;
29  }
30  };
31 #endif
32 
33 #if !defined(HAVE_ONEDAL) || !defined(HAVE_MLPACK)
34  auto framework_not_found_actions = [&output_availability,
35  &output_default](const int64_t out_row_idx) {
36  output_availability[out_row_idx] = false;
37  output_default[out_row_idx] = false;
38  };
39 #endif
40 
41  for (int32_t out_row_idx = 0; out_row_idx < num_frameworks; ++out_row_idx) {
42  output_ml_frameworks[out_row_idx] = ml_framework_string_ids[out_row_idx];
43  if (ml_frameworks[out_row_idx] == "onedal") {
44 #ifdef HAVE_ONEDAL
45  framework_found_actions(out_row_idx);
46 #else
47  framework_not_found_actions(out_row_idx);
48 #endif
49  } else if (ml_frameworks[out_row_idx] == "mlpack") {
50 #ifdef HAVE_MLPACK
51  framework_found_actions(out_row_idx);
52 #else
53  framework_not_found_actions(out_row_idx);
54 #endif
55  }
56  }
57  return num_frameworks;
58 }
void set_output_row_size(int64_t num_rows)
Definition: heavydbTypes.h:373
StringDictionaryProxy * string_dict_proxy_
std::vector< int32_t > getOrAddTransientBulk(const std::vector< std::string > &strings)

+ Here is the call graph for this function: