#include "QueryEngine/TableFunctions/SystemFunctions/os/ML/MLTableFunctionsCommon.h"
#include "QueryEngine/TableFunctions/SystemFunctions/os/Shared/NullRowsRemoval.h"
#include "QueryEngine/heavydbTypes.h"
#include "QueryEngine/TableFunctions/SystemFunctions/os/ML/MLModel.h"
#include "QueryEngine/TableFunctions/SystemFunctions/os/ML/OneHotEncoder.h"
#include <tbb/parallel_for.h>
#include <tbb/task_arena.h>

Include dependency graph for MLTableFunctions.hpp:

This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes
struct	CategoricalFeaturesBuilder< T >

Functions
template<typename T >
std::vector< const T * >	pluck_ptrs (const std::vector< std::vector< T >> &data, const int64_t start_idx, const int64_t end_idx)

template<typename T >
std::vector< const T * >	pluck_ptrs (const std::vector< T * > &data, const int64_t start_idx, const int64_t end_idx)

EXTENSION_NOINLINE_HOST int32_t	supported_ml_frameworks__cpu_ (TableFunctionManager &mgr, Column< TextEncodingDict > &output_ml_frameworks, Column< bool > &output_availability, Column< bool > &output_default)

EXTENSION_NOINLINE_HOST void	check_model_params (const std::shared_ptr< AbstractMLModel > &model, const int64_t num_cat_features, const int64_t num_numeric_features)

template<typename K , typename T >
NEVER_INLINE HOST int32_t	kmeans__cpu_template (TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const int num_clusters, const int num_iterations, const TextEncodingNone &init_type_str, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)

template<typename K , typename T >
NEVER_INLINE HOST int32_t	dbscan__cpu_template (TableFunctionManager &mgr, const Column< K > &input_ids, const ColumnList< T > &input_features, const double epsilon, const int32_t min_observations, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< int32_t > &output_clusters)

template<typename T >
NEVER_INLINE HOST int32_t	linear_reg_fit_impl (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T >
NEVER_INLINE HOST int32_t	linear_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T >
NEVER_INLINE HOST int32_t	linear_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T >
NEVER_INLINE HOST int32_t	linear_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T >
Column< T >	create_wrapper_col (std::vector< T > &col_vec)

EXTENSION_NOINLINE_HOST int32_t	linear_reg_coefs__cpu_1 (TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)

EXTENSION_NOINLINE_HOST int32_t	linear_reg_coefs__cpu_2 (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &output_coef_idx, Column< TextEncodingDict > &output_feature, Column< int64_t > &output_sub_coef_idx, Column< TextEncodingDict > &output_sub_feature, Column< double > &output_coef)

template<typename T >
NEVER_INLINE HOST int32_t	decision_tree_reg_impl (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T >
NEVER_INLINE HOST int32_t	decision_tree_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T >
NEVER_INLINE HOST int32_t	decision_tree_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T >
NEVER_INLINE HOST int32_t	decision_tree_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const int64_t max_tree_depth, const int64_t min_observations_per_leaf_node, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T >
NEVER_INLINE HOST int32_t	gbt_reg_fit_impl (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T >
NEVER_INLINE HOST int32_t	gbt_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T >
NEVER_INLINE HOST int32_t	gbt_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T >
NEVER_INLINE HOST int32_t	gbt_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const int64_t max_iterations, const int64_t max_tree_depth, const double shrinkage, const double min_split_loss, const double lambda, const double obs_per_tree_fraction, const int64_t features_per_node, const int64_t min_observations_per_leaf_node, const int64_t max_bins, const int64_t min_bin_size, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T >
NEVER_INLINE HOST int32_t	random_forest_reg_fit_impl (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T >
NEVER_INLINE HOST int32_t	random_forest_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T >
NEVER_INLINE HOST int32_t	random_forest_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T >
NEVER_INLINE HOST int32_t	random_forest_reg_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const int64_t num_trees, const double obs_per_tree_fraction, const int64_t max_tree_depth, const int64_t features_per_node, const double impurity_threshold, const bool bootstrap, const int64_t min_obs_per_leaf_node, const int64_t min_obs_per_split_node, const double min_weight_fraction_in_leaf_node, const double min_impurity_decrease_in_split_node, const int64_t max_leaf_nodes, const bool use_histogram, const TextEncodingNone &var_importance_metric_str, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T >
NEVER_INLINE HOST int32_t	pca_fit_impl (TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const std::vector< std::vector< std::string >> &cat_feature_keys, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T >
NEVER_INLINE HOST int32_t	pca_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T >
NEVER_INLINE HOST int32_t	pca_fit__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

EXTENSION_NOINLINE_HOST int32_t	pca_fit__cpu_1 (TableFunctionManager &mgr, const TextEncodingNone &model_name, const ColumnList< TextEncodingDict > &input_cat_features, const int32_t cat_top_k, const float cat_min_fraction, const TextEncodingNone &preferred_ml_framework_str, const TextEncodingNone &model_metadata, Column< TextEncodingDict > &output_model_name)

template<typename T , typename K >
NEVER_INLINE HOST int32_t	ml_reg_predict_impl (TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)

template<typename T , typename K >
NEVER_INLINE HOST int32_t	ml_reg_predict__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)

template<typename T , typename K >
NEVER_INLINE HOST int32_t	ml_reg_predict__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)

template<typename T , typename K >
NEVER_INLINE HOST int32_t	ml_reg_predict__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< K > &input_ids, const ColumnList< TextEncodingDict > &input_cat_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)

template<typename T , typename K >
NEVER_INLINE HOST int32_t	ml_reg_predict__cpu_template (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, const Column< K > &input_ids, const ColumnList< T > &input_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)

template<typename T , typename K >
NEVER_INLINE HOST int32_t	ml_reg_predict__cpu_template (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, const Column< K > &input_ids, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)

template<typename T , typename K >
NEVER_INLINE HOST int32_t	ml_reg_predict__cpu_template (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, const Column< K > &input_ids, const ColumnList< TextEncodingDict > &input_cat_features, const TextEncodingNone &preferred_ml_framework_str, Column< K > &output_ids, Column< T > &output_predictions)

template<typename T >
NEVER_INLINE HOST int32_t	r2_score_impl (TableFunctionManager &mgr, const std::shared_ptr< AbstractMLModel > &model, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)

template<typename T >
NEVER_INLINE HOST int32_t	r2_score__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)

template<typename T >
NEVER_INLINE HOST int32_t	r2_score__cpu_template (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, const Column< T > &input_labels, const ColumnList< T > &input_features, Column< double > &output_r2)

template<typename T >
NEVER_INLINE HOST int32_t	r2_score__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, Column< double > &output_r2)

template<typename T >
NEVER_INLINE HOST int32_t	r2_score__cpu_template (TableFunctionManager &mgr, const TextEncodingNone &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, Column< double > &output_r2)

template<typename T >
NEVER_INLINE HOST int32_t	r2_score__cpu_template (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, const Column< T > &input_labels, const ColumnList< TextEncodingDict > &input_cat_features, const ColumnList< T > &input_numeric_features, Column< double > &output_r2)

EXTENSION_NOINLINE_HOST int32_t	random_forest_reg_var_importance__cpu_1 (TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)

EXTENSION_NOINLINE_HOST int32_t	random_forest_reg_var_importance__cpu_2 (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &feature_id, Column< TextEncodingDict > &feature, Column< int64_t > &sub_feature_id, Column< TextEncodingDict > &sub_feature, Column< double > &importance_score)

EXTENSION_NOINLINE_HOST int32_t	get_decision_trees__cpu_1 (TableFunctionManager &mgr, const TextEncodingNone &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)

EXTENSION_NOINLINE_HOST int32_t	get_decision_trees__cpu_2 (TableFunctionManager &mgr, const Column< TextEncodingDict > &model_name, Column< int64_t > &tree_id, Column< int64_t > &entry_id, Column< bool > &is_split_node, Column< int64_t > &feature_id, Column< int64_t > &left_child, Column< int64_t > &right_child, Column< double > &value)

Function Documentation

EXTENSION_NOINLINE_HOST void check_model_params	(	const std::shared_ptr< AbstractMLModel > &	model,
		const int64_t	num_cat_features,
		const int64_t	num_numeric_features
	)

Definition at line 363 of file MLTableFunctions.cpp.

Referenced by ml_reg_predict__cpu_template() and r2_score__cpu_template().

                                                             {
   if (model->getNumLogicalFeatures() != num_cat_features + num_numeric_features) {
     std::ostringstream error_oss;
     error_oss << "Model expects " << model->getNumLogicalFeatures() << " features but "
               << num_cat_features + num_numeric_features << " were provided.";
     throw std::runtime_error(error_oss.str());
   }
   if (model->getNumCatFeatures() != num_cat_features) {
     std::ostringstream error_oss;
     error_oss << "Model expects " << model->getNumCatFeatures()
               << " categorical features but " << num_cat_features << " were provided.";
     throw std::runtime_error(error_oss.str());
   }
 }

Here is the caller graph for this function:

template<typename T >

Column<T> create_wrapper_col ( std::vector< T > & col_vec )

Definition at line 594 of file MLTableFunctions.hpp.

                                                     {
   Column<T> wrapper_col(col_vec.data(), static_cast<int64_t>(col_vec.size()));
   return wrapper_col;
 }

template<typename K , typename T >

NEVER_INLINE HOST int32_t dbscan__cpu_template	(	TableFunctionManager &	mgr,
		const Column< K > &	input_ids,
		const ColumnList< T > &	input_features,
		const double	epsilon,
		const int32_t	min_observations,
		const TextEncodingNone &	preferred_ml_framework_str,
		Column< K > &	output_ids,
		Column< int32_t > &	output_clusters
	)

Definition at line 204 of file MLTableFunctions.hpp.

References DEFAULT, TableFunctions_Namespace::denull_data(), get_ml_framework(), TextEncodingNone::getString(), INVALID, MLPACK, ONEAPI, ONEDAL, pluck_ptrs(), Column< T >::ptr_, TableFunctionManager::set_output_row_size(), Column< T >::size(), TableFunctions_Namespace::unmask_data(), and z_std_normalize_data().

                                                        {
   // Runs DBSCAN clustering over input_features and writes the resulting cluster
   // assignments into output_clusters; output_ids is assigned from input_ids.
   // Returns input_ids.size() on success. Returns the result of mgr.ERROR_MESSAGE(...)
   // when the framework string is invalid, when no compiled-in library matches the
   // requested framework, or when a std::runtime_error is thrown inside the try block.
   mgr.set_output_row_size(input_ids.size());
   output_ids = input_ids;
 
   const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
   if (preferred_ml_framework == MLFramework::INVALID) {
     return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
                              preferred_ml_framework_str.getString());
   }
 
   try {
     // NOTE(review): denull_data presumably strips rows that contain nulls; what is
     // visible here is that it reports a masked and an unmasked row count -- confirm.
     const auto denulled_data = denull_data(input_features);
     const int64_t num_rows = denulled_data.masked_num_rows;
     // The data counts as masked when fewer rows remain than were passed in.
     const bool data_is_masked =
         denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
     // When masked, the library writes into a temporary buffer of num_rows entries that
     // is expanded into output_clusters afterwards; otherwise it writes straight into
     // the output column's buffer and the temporary stays empty.
     std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
     int32_t* denulled_output =
         data_is_masked ? denulled_output_allocation.data() : output_clusters.ptr_;
 
     // z_std_normalize_data can throw if std dev is 0
     const auto normalized_data = z_std_normalize_data(denulled_data.data, num_rows);
     const auto normalized_ptrs = pluck_ptrs(normalized_data, 0L, normalized_data.size());
 
     // Framework dispatch: did_execute records whether any backend ran. Which branches
     // exist depends on the HAVE_ONEDAL / HAVE_MLPACK build flags.
     bool did_execute = false;
 #ifdef HAVE_ONEDAL
     // ONEAPI or DEFAULT selects the oneAPI implementation; an explicit ONEDAL request
     // selects the OneDAL implementation.
     if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI ||
                          preferred_ml_framework == MLFramework::DEFAULT)) {
       onedal_oneapi_dbscan_impl(
           normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
       did_execute = true;
     } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL)) {
       onedal_dbscan_impl(
           normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
       did_execute = true;
     }
 #endif
 #ifdef HAVE_MLPACK
     // mlpack runs for an explicit MLPACK request, or for DEFAULT when nothing above
     // has executed.
     if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
                          preferred_ml_framework == MLFramework::DEFAULT)) {
       mlpack_dbscan_impl(
           normalized_ptrs, denulled_output, num_rows, epsilon, min_observations);
       did_execute = true;
     }
 #endif
     if (!did_execute) {
       return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
                                " ML library to support dbscan implementation.");
     }
 
     if (data_is_masked) {
       // NOTE(review): presumably scatters the compacted results back to their original
       // row positions via reverse_index_map and fills the remaining rows with the
       // int32 null sentinel -- confirm against unmask_data.
       unmask_data(denulled_output,
                   denulled_data.reverse_index_map,
                   output_clusters.ptr_,
                   denulled_data.unmasked_num_rows,
                   inline_null_value<int32_t>());
     }
   } catch (std::runtime_error& e) {
     // Any std::runtime_error raised above is reported through the manager.
     return mgr.ERROR_MESSAGE(e.what());
   }
   return input_ids.size();
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< T > &	input_features,
		const int64_t	max_tree_depth,
		const int64_t	min_observations_per_leaf_node,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 717 of file MLTableFunctions.hpp.

References decision_tree_reg_impl().

                                                                                  {
   std::vector<std::vector<std::string>> empty_cat_feature_keys;
   return decision_tree_reg_impl(mgr,
                                 model_name,
                                 input_labels,
                                 input_features,
                                 empty_cat_feature_keys,
                                 max_tree_depth,
                                 min_observations_per_leaf_node,
                                 preferred_ml_framework_str,
                                 model_metadata,
                                 output_model_name);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< TextEncodingDict > &	input_cat_features,
		const ColumnList< T > &	input_numeric_features,
		const int64_t	max_tree_depth,
		const int64_t	min_observations_per_leaf_node,
		const int32_t	cat_top_k,
		const float	cat_min_fraction,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 755 of file MLTableFunctions.hpp.

References decision_tree_reg_impl(), CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), and CategoricalFeaturesBuilder< T >::getFeatures().

                                                  {
   std::vector<std::vector<std::string>> empty_cat_feature_keys;
   CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
                                                      input_numeric_features,
                                                      cat_top_k,
                                                      cat_min_fraction,
                                                      false /* cat_include_others */);
   return decision_tree_reg_impl(mgr,
                                 model_name,
                                 input_labels,
                                 cat_features_builder.getFeatures(),
                                 cat_features_builder.getCatFeatureKeys(),
                                 max_tree_depth,
                                 min_observations_per_leaf_node,
                                 preferred_ml_framework_str,
                                 model_metadata,
                                 output_model_name);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t decision_tree_reg_fit__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< TextEncodingDict > &	input_cat_features,
		const int64_t	max_tree_depth,
		const int64_t	min_observations_per_leaf_node,
		const int32_t	cat_top_k,
		const float	cat_min_fraction,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 802 of file MLTableFunctions.hpp.

References decision_tree_reg_impl(), CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), and CategoricalFeaturesBuilder< T >::getFeatures().

                                                  {
   std::vector<std::vector<std::string>> empty_cat_feature_keys;
   CategoricalFeaturesBuilder<T> cat_features_builder(
       input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
   return decision_tree_reg_impl(mgr,
                                 model_name,
                                 input_labels,
                                 cat_features_builder.getFeatures(),
                                 cat_features_builder.getCatFeatureKeys(),
                                 max_tree_depth,
                                 min_observations_per_leaf_node,
                                 preferred_ml_framework_str,
                                 model_metadata,
                                 output_model_name);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t decision_tree_reg_impl	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< T > &	input_features,
		const std::vector< std::vector< std::string >> &	cat_feature_keys,
		const int64_t	max_tree_depth,
		const int64_t	min_observations_per_leaf_node,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 639 of file MLTableFunctions.hpp.

References DEFAULT, TableFunctions_Namespace::denull_data(), get_ml_framework(), Column< TextEncodingDict >::getOrAddTransient(), TextEncodingNone::getString(), INVALID, MLPACK, ColumnList< T >::numCols(), ONEDAL, pluck_ptrs(), TableFunctionManager::set_output_row_size(), and Column< T >::size().

Referenced by decision_tree_reg_fit__cpu_template().

                                                                     {
   // Shared implementation behind the decision_tree_reg_fit overloads: fits a decision
   // tree regression model under the name model_name.
   // On success returns 1 and stores, in output_model_name[0], the id obtained from
   // output_model_name.getOrAddTransient(model_name). On failure returns the result of
   // mgr.ERROR_MESSAGE(...): empty input, invalid framework string, an MLPACK request,
   // a build without OneDAL, or a std::runtime_error thrown inside the try block.
   if (input_labels.size() == 0) {
     return mgr.ERROR_MESSAGE(
         "No rows exist in training data. Training data must at least contain 1 row.");
   }
   const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
   if (preferred_ml_framework == MLFramework::INVALID) {
     return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
                              preferred_ml_framework_str.getString());
   }
   if (preferred_ml_framework == MLFramework::MLPACK) {
     return mgr.ERROR_MESSAGE(
         "Only OneDAL framework supported for decision tree regression.");
   }
 #ifndef HAVE_ONEDAL
   // Without OneDAL compiled in, the function always returns here; the statements
   // below are unreachable in such a build.
   return mgr.ERROR_MESSAGE(
       "Only OneDAL framework supported for decision tree regression.");
 #endif
 
   // Labels and features are de-nulled together. Judging by the pluck_ptrs calls, the
   // label column sits at index 0 of denulled_data.data and the feature columns follow
   // at indices 1..numCols() -- TODO confirm that end_idx is exclusive.
   // NOTE(review): this runs outside the try block, so a std::runtime_error thrown by
   // denull_data or pluck_ptrs would not be converted into ERROR_MESSAGE -- confirm
   // they cannot throw.
   const auto denulled_data = denull_data(input_labels, input_features);
   const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
   const auto features_ptrs =
       pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
   // The output is a single row holding the model name.
   mgr.set_output_row_size(1);
   try {
     bool did_execute = false;
 #ifdef HAVE_ONEDAL
     // ONEDAL or DEFAULT both select the OneDAL implementation.
     if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
                          preferred_ml_framework == MLFramework::DEFAULT)) {
       onedal_decision_tree_reg_fit_impl<T>(model_name,
                                            labels_ptrs[0],
                                            features_ptrs,
                                            model_metadata,
                                            cat_feature_keys,
                                            denulled_data.masked_num_rows,
                                            max_tree_depth,
                                            min_observations_per_leaf_node);
       const TextEncodingDict model_name_str_id =
           output_model_name.getOrAddTransient(model_name);
       output_model_name[0] = model_name_str_id;
       did_execute = true;
     }
 #endif
     if (!did_execute) {
       return mgr.ERROR_MESSAGE(
           "Cannot find " + preferred_ml_framework_str.getString() +
           " ML library to support decision tree regression implementation.");
     }
   } catch (std::runtime_error& e) {
     // Any std::runtime_error raised above is reported through the manager.
     return mgr.ERROR_MESSAGE(e.what());
   }
   return 1;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< T > &	input_features,
		const int64_t	max_iterations,
		const int64_t	max_tree_depth,
		const double	shrinkage,
		const double	min_split_loss,
		const double	lambda,
		const double	obs_per_tree_fraction,
		const int64_t	features_per_node,
		const int64_t	min_observations_per_leaf_node,
		const int64_t	max_bins,
		const int64_t	min_bin_size,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 930 of file MLTableFunctions.hpp.

References gbt_reg_fit_impl().

                                                                        {
   std::vector<std::vector<std::string>> empty_cat_feature_keys;
   return gbt_reg_fit_impl(mgr,
                           model_name,
                           input_labels,
                           input_features,
                           empty_cat_feature_keys,
                           max_iterations,
                           max_tree_depth,
                           shrinkage,
                           min_split_loss,
                           lambda,
                           obs_per_tree_fraction,
                           features_per_node,
                           min_observations_per_leaf_node,
                           max_bins,
                           min_bin_size,
                           preferred_ml_framework_str,
                           model_metadata,
                           output_model_name);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< TextEncodingDict > &	input_cat_features,
		const ColumnList< T > &	input_numeric_features,
		const int64_t	max_iterations,
		const int64_t	max_tree_depth,
		const double	shrinkage,
		const double	min_split_loss,
		const double	lambda,
		const double	obs_per_tree_fraction,
		const int64_t	features_per_node,
		const int64_t	min_observations_per_leaf_node,
		const int64_t	max_bins,
		const int64_t	min_bin_size,
		const int32_t	cat_top_k,
		const float	cat_min_fraction,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 993 of file MLTableFunctions.hpp.

References gbt_reg_fit_impl(), CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), and CategoricalFeaturesBuilder< T >::getFeatures().

                                                                        {
   CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
                                                      input_numeric_features,
                                                      cat_top_k,
                                                      cat_min_fraction,
                                                      false /* cat_include_others */);
   return gbt_reg_fit_impl(mgr,
                           model_name,
                           input_labels,
                           cat_features_builder.getFeatures(),
                           cat_features_builder.getCatFeatureKeys(),
                           max_iterations,
                           max_tree_depth,
                           shrinkage,
                           min_split_loss,
                           lambda,
                           obs_per_tree_fraction,
                           features_per_node,
                           min_observations_per_leaf_node,
                           max_bins,
                           min_bin_size,
                           preferred_ml_framework_str,
                           model_metadata,
                           output_model_name);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t gbt_reg_fit__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< TextEncodingDict > &	input_cat_features,
		const int64_t	max_iterations,
		const int64_t	max_tree_depth,
		const double	shrinkage,
		const double	min_split_loss,
		const double	lambda,
		const double	obs_per_tree_fraction,
		const int64_t	features_per_node,
		const int64_t	min_observations_per_leaf_node,
		const int64_t	max_bins,
		const int64_t	min_bin_size,
		const int32_t	cat_top_k,
		const float	cat_min_fraction,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 1063 of file MLTableFunctions.hpp.

References gbt_reg_fit_impl(), CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), and CategoricalFeaturesBuilder< T >::getFeatures().

                                                                        {
   CategoricalFeaturesBuilder<T> cat_features_builder(
       input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
   return gbt_reg_fit_impl(mgr,
                           model_name,
                           input_labels,
                           cat_features_builder.getFeatures(),
                           cat_features_builder.getCatFeatureKeys(),
                           max_iterations,
                           max_tree_depth,
                           shrinkage,
                           min_split_loss,
                           lambda,
                           obs_per_tree_fraction,
                           features_per_node,
                           min_observations_per_leaf_node,
                           max_bins,
                           min_bin_size,
                           preferred_ml_framework_str,
                           model_metadata,
                           output_model_name);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t gbt_reg_fit_impl	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< T > &	input_features,
		const std::vector< std::vector< std::string >> &	cat_feature_keys,
		const int64_t	max_iterations,
		const int64_t	max_tree_depth,
		const double	shrinkage,
		const double	min_split_loss,
		const double	lambda,
		const double	obs_per_tree_fraction,
		const int64_t	features_per_node,
		const int64_t	min_observations_per_leaf_node,
		const int64_t	max_bins,
		const int64_t	min_bin_size,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 831 of file MLTableFunctions.hpp.

References DEFAULT, TableFunctions_Namespace::denull_data(), get_ml_framework(), Column< TextEncodingDict >::getOrAddTransient(), TextEncodingNone::getString(), INVALID, MLPACK, ColumnList< T >::numCols(), ONEDAL, pluck_ptrs(), TableFunctionManager::set_output_row_size(), and Column< T >::size().

Referenced by gbt_reg_fit__cpu_template().

                                                               {
   if (input_labels.size() == 0) {
     return mgr.ERROR_MESSAGE(
         "No rows exist in training data. Training data must at least contain 1 row.");
   }
   const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
   if (preferred_ml_framework == MLFramework::INVALID) {
     return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
                              preferred_ml_framework_str.getString());
   }
   if (preferred_ml_framework == MLFramework::MLPACK) {
     return mgr.ERROR_MESSAGE("Only OneDAL framework supported for GBT regression.");
   }
 #ifndef HAVE_ONEDAL
   return mgr.ERROR_MESSAGE("Only OneDAL framework supported for GBT regression.");
 #endif
 
   const auto denulled_data = denull_data(input_labels, input_features);
   const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
   const auto features_ptrs =
       pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
   mgr.set_output_row_size(1);
   try {
     bool did_execute = false;
 #ifdef HAVE_ONEDAL
     if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
                          preferred_ml_framework == MLFramework::DEFAULT)) {
       onedal_gbt_reg_fit_impl<T>(model_name,
                                  labels_ptrs[0],
                                  features_ptrs,
                                  model_metadata,
                                  cat_feature_keys,
                                  denulled_data.masked_num_rows,
                                  max_iterations,
                                  max_tree_depth,
                                  shrinkage,
                                  min_split_loss,
                                  lambda,
                                  obs_per_tree_fraction,
                                  features_per_node,
                                  min_observations_per_leaf_node,
                                  max_bins,
                                  min_bin_size);
       const TextEncodingDict model_name_str_id =
           output_model_name.getOrAddTransient(model_name);
       output_model_name[0] = model_name_str_id;
       did_execute = true;
     }
 #endif
     if (!did_execute) {
       return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
                                " ML library to support GBT regression implementation.");
     }
   } catch (std::runtime_error& e) {
     return mgr.ERROR_MESSAGE(e.what());
   }
   return 1;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_1	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		Column< int64_t > &	tree_id,
		Column< int64_t > &	entry_id,
		Column< bool > &	is_split_node,
		Column< int64_t > &	feature_id,
		Column< int64_t > &	left_child,
		Column< int64_t > &	right_child,
		Column< double > &	value
	)

Definition at line 276 of file MLTableFunctions.cpp.

References g_ml_models, MLModelMap::getModel(), and TableFunctionManager::set_output_row_size().

Referenced by get_decision_trees__cpu_2().

                                                          {
 #ifdef HAVE_ONEDAL
   try {
     const auto model = g_ml_models.getModel(model_name);
     const auto tree_model = std::dynamic_pointer_cast<AbstractTreeModel>(model);
     if (!tree_model) {
       throw std::runtime_error("Model not a tree-type model.");
     }
     const auto num_trees = tree_model->getNumTrees();
     std::vector<std::vector<DecisionTreeEntry>> decision_trees(num_trees);
     for (int64_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
       TreeModelVisitor tree_visitor(decision_trees[tree_idx]);
       tree_model->traverseDF(tree_idx, tree_visitor);
     }
     std::vector<int64_t> decision_tree_offsets(num_trees + 1);
     decision_tree_offsets[0] = 0;
     for (int64_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
       decision_tree_offsets[tree_idx + 1] =
           decision_tree_offsets[tree_idx] +
           static_cast<int64_t>(decision_trees[tree_idx].size());
     }
     const auto num_entries = decision_tree_offsets[num_trees];
     mgr.set_output_row_size(num_entries);
     for (int64_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
       const auto& decision_tree = decision_trees[tree_idx];
       const auto output_offset = decision_tree_offsets[tree_idx];
       const int64_t num_tree_entries = decision_tree.size();
       for (int64_t entry_idx = 0; entry_idx < num_tree_entries; ++entry_idx) {
         const int64_t output_idx = output_offset + entry_idx;
         const auto& tree_entry = decision_tree[entry_idx];
         const bool entry_is_split_node = tree_entry.isSplitNode();
         tree_id[output_idx] = tree_idx;
         entry_id[output_idx] = entry_idx;
         is_split_node[output_idx] = entry_is_split_node;
         feature_id[output_idx] = !entry_is_split_node ? inline_null_value<int64_t>()
                                                       : tree_entry.feature_index;
         left_child[output_idx] = !entry_is_split_node ? inline_null_value<int64_t>()
                                                       : tree_entry.left_child_row_idx;
         right_child[output_idx] = !entry_is_split_node ? inline_null_value<int64_t>()
                                                        : tree_entry.right_child_row_idx;
         value[output_idx] = tree_entry.value;
       }
     }
     return num_entries;
   } catch (std::runtime_error& e) {
     const std::string error_str(e.what());
     return mgr.ERROR_MESSAGE(error_str);
   }
 #else  // Not HAVE_ONEDAL
   return mgr.ERROR_MESSAGE("OneDAL library must be available for get_decision_trees.");
 #endif
 }

Here is the call graph for this function:

Here is the caller graph for this function:

EXTENSION_NOINLINE_HOST int32_t get_decision_trees__cpu_2	(	TableFunctionManager &	mgr,
		const Column< TextEncodingDict > &	model_name,
		Column< int64_t > &	tree_id,
		Column< int64_t > &	entry_id,
		Column< bool > &	is_split_node,
		Column< int64_t > &	feature_id,
		Column< int64_t > &	left_child,
		Column< int64_t > &	right_child,
		Column< double > &	value
	)

Definition at line 338 of file MLTableFunctions.cpp.

References get_decision_trees__cpu_1(), Column< TextEncodingDict >::getString(), and Column< TextEncodingDict >::size().

                                                          {
   if (model_name.size() != 1) {
     return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
   }
   TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
   return get_decision_trees__cpu_1(mgr,
                                    model_name_text_enc_none,
                                    tree_id,
                                    entry_id,
                                    is_split_node,
                                    feature_id,
                                    left_child,
                                    right_child,
                                    value);
 }

Here is the call graph for this function:

template<typename K , typename T >

NEVER_INLINE HOST int32_t kmeans__cpu_template	(	TableFunctionManager &	mgr,
		const Column< K > &	input_ids,
		const ColumnList< T > &	input_features,
		const int	num_clusters,
		const int	num_iterations,
		const TextEncodingNone &	init_type_str,
		const TextEncodingNone &	preferred_ml_framework_str,
		Column< K > &	output_ids,
		Column< int32_t > &	output_clusters
	)

Definition at line 103 of file MLTableFunctions.hpp.

References DEFAULT, TableFunctions_Namespace::denull_data(), get_kmeans_init_type(), get_ml_framework(), TextEncodingNone::getString(), INVALID, MLPACK, ONEAPI, ONEDAL, pluck_ptrs(), Column< T >::ptr_, TableFunctionManager::set_output_row_size(), Column< T >::size(), TableFunctions_Namespace::unmask_data(), and z_std_normalize_data().

                                                        {
   // K-means clustering over the feature columns.
   //
   // The output is sized to the number of input ids and output_ids is assigned from
   // input_ids before any validation happens. Returns input_ids.size() on success, or
   // mgr.ERROR_MESSAGE(...) for an invalid init strategy, an invalid framework name,
   // no usable ML library, or any std::runtime_error raised while clustering.
   mgr.set_output_row_size(input_ids.size());
   output_ids = input_ids;
   const auto kmeans_init_strategy = get_kmeans_init_type(init_type_str);
   if (kmeans_init_strategy == KMeansInitStrategy::INVALID) {
     return mgr.ERROR_MESSAGE("Invalid KMeans initialization strategy: " +
                              init_type_str.getString());
   }
 
   const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
   if (preferred_ml_framework == MLFramework::INVALID) {
     return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
                              preferred_ml_framework_str.getString());
   }
 
   try {
     // NOTE(review): denull_data presumably drops rows containing NULLs - confirm
     // against its definition. What is visible here: the data counts as "masked" when
     // masked_num_rows is smaller than unmasked_num_rows.
     const auto denulled_data = denull_data(input_features);
     const int64_t num_rows = denulled_data.masked_num_rows;
     const bool data_is_masked =
         denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
     // When masked, cluster ids are first written to a scratch vector of num_rows
     // entries and expanded into output_clusters afterwards; otherwise the scratch
     // vector is empty and the implementation writes straight into output_clusters.
     std::vector<int32_t> denulled_output_allocation(data_is_masked ? num_rows : 0);
     int32_t* denulled_output =
         data_is_masked ? denulled_output_allocation.data() : output_clusters.ptr_;
 
     // z_std_normalize_data can throw if std dev is 0
     const auto normalized_data = z_std_normalize_data(denulled_data.data, num_rows);
     const auto normalized_ptrs = pluck_ptrs(normalized_data, 0L, normalized_data.size());
 
     // Framework dispatch. did_execute records whether some implementation ran, so
     // that DEFAULT can fall through to MLPACK when oneDAL is not compiled in.
     bool did_execute = false;
 #ifdef HAVE_ONEDAL
     // With oneDAL available, ONEAPI and DEFAULT select the oneAPI implementation;
     // an explicit ONEDAL request selects onedal_kmeans_impl.
     if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI ||
                          preferred_ml_framework == MLFramework::DEFAULT)) {
       onedal_oneapi_kmeans_impl(normalized_ptrs,
                                 denulled_output,
                                 num_rows,
                                 num_clusters,
                                 num_iterations,
                                 kmeans_init_strategy);
       did_execute = true;
     } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL)) {
       onedal_kmeans_impl(normalized_ptrs,
                          denulled_output,
                          num_rows,
                          num_clusters,
                          num_iterations,
                          kmeans_init_strategy);
       did_execute = true;
     }
 #endif
 #ifdef HAVE_MLPACK
     // MLPACK runs for an explicit MLPACK request, or for DEFAULT when nothing above
     // has executed.
     if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
                          preferred_ml_framework == MLFramework::DEFAULT)) {
       mlpack_kmeans_impl(normalized_ptrs,
                          denulled_output,
                          num_rows,
                          num_clusters,
                          num_iterations,
                          kmeans_init_strategy);
       did_execute = true;
     }
 #endif
     if (!did_execute) {
       return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
                                " ML library to support kmeans implementation.");
     }
 
     // Expand the scratch results into output_clusters using reverse_index_map; the
     // int32 NULL sentinel is passed as the fill value.
     if (data_is_masked) {
       unmask_data(denulled_output,
                   denulled_data.reverse_index_map,
                   output_clusters.ptr_,
                   denulled_data.unmasked_num_rows,
                   inline_null_value<int32_t>());
     }
   } catch (std::runtime_error& e) {
     return mgr.ERROR_MESSAGE(e.what());
   }
   return input_ids.size();
 }

Here is the call graph for this function:

EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_1	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		Column< int64_t > &	output_coef_idx,
		Column< TextEncodingDict > &	output_feature,
		Column< int64_t > &	output_sub_coef_idx,
		Column< TextEncodingDict > &	output_sub_feature,
		Column< double > &	output_coef
	)

Definition at line 88 of file MLTableFunctions.cpp.

References g_ml_models, get_model_features(), MLModelMap::getModel(), Column< TextEncodingDict >::getOrAddTransient(), and TableFunctionManager::set_output_row_size().

Referenced by linear_reg_coefs__cpu_2().

                                                      {
   try {
     const auto linear_reg_model = std::dynamic_pointer_cast<LinearRegressionModel>(
         g_ml_models.getModel(model_name));
     if (!linear_reg_model) {
       throw std::runtime_error("Model is not of type linear regression.");
     }
 
     const auto& coefs = linear_reg_model->getCoefs();
     const auto& cat_feature_keys = linear_reg_model->getCatFeatureKeys();
     const int64_t num_sub_coefs = static_cast<int64_t>(coefs.size());
     const int64_t num_cat_features = static_cast<int64_t>(cat_feature_keys.size());
     mgr.set_output_row_size(num_sub_coefs);
 
     std::vector<std::string> feature_names =
         get_model_features(model_name, linear_reg_model);
     feature_names.insert(feature_names.begin(), "intercept");
 
     for (int64_t sub_coef_idx = 0, coef_idx = 0; sub_coef_idx < num_sub_coefs;
          ++coef_idx) {
       if (num_cat_features >= coef_idx && coef_idx > 0) {
         const auto& col_cat_feature_keys = cat_feature_keys[coef_idx - 1];
         int64_t col_cat_feature_idx = 1;
         for (const auto& col_cat_feature_key : col_cat_feature_keys) {
           output_coef_idx[sub_coef_idx] = coef_idx;
           if (feature_names[coef_idx].empty()) {
             output_feature[sub_coef_idx] = inline_null_value<TextEncodingDict>();
           } else {
             output_feature[sub_coef_idx] =
                 output_feature.getOrAddTransient(feature_names[coef_idx]);
           }
           output_sub_coef_idx[sub_coef_idx] = col_cat_feature_idx++;
           output_sub_feature[sub_coef_idx] =
               output_sub_feature.getOrAddTransient(col_cat_feature_key);
           output_coef[sub_coef_idx] = coefs[sub_coef_idx];
           ++sub_coef_idx;
         }
       } else {
         output_coef_idx[sub_coef_idx] = coef_idx;
         if (feature_names[coef_idx].empty()) {
           output_feature[sub_coef_idx] = inline_null_value<TextEncodingDict>();
         } else {
           output_feature[sub_coef_idx] =
               output_feature.getOrAddTransient(feature_names[coef_idx]);
         }
         output_sub_coef_idx[sub_coef_idx] = 1;
         output_sub_feature[sub_coef_idx] = inline_null_value<TextEncodingDict>();
         output_coef[sub_coef_idx] = coefs[sub_coef_idx];
         ++sub_coef_idx;
       }
     }
 
     return num_sub_coefs;
   } catch (std::runtime_error& e) {
     return mgr.ERROR_MESSAGE(e.what());
   }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

EXTENSION_NOINLINE_HOST int32_t linear_reg_coefs__cpu_2	(	TableFunctionManager &	mgr,
		const Column< TextEncodingDict > &	model_name,
		Column< int64_t > &	output_coef_idx,
		Column< TextEncodingDict > &	output_feature,
		Column< int64_t > &	output_sub_coef_idx,
		Column< TextEncodingDict > &	output_sub_feature,
		Column< double > &	output_coef
	)

Definition at line 153 of file MLTableFunctions.cpp.

References Column< TextEncodingDict >::getString(), linear_reg_coefs__cpu_1(), and Column< TextEncodingDict >::size().

                                                      {
   if (model_name.size() != 1) {
     return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
   }
   TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
   return linear_reg_coefs__cpu_1(mgr,
                                  model_name_text_enc_none,
                                  output_coef_idx,
                                  output_feature,
                                  output_sub_coef_idx,
                                  output_sub_feature,
                                  output_coef);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< T > &	input_features,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 366 of file MLTableFunctions.hpp.

References linear_reg_fit_impl().

                                                                           {
   std::vector<std::vector<std::string>> empty_cat_feature_keys;
   return linear_reg_fit_impl(mgr,
                              model_name,
                              input_labels,
                              input_features,
                              empty_cat_feature_keys,
                              preferred_ml_framework_str,
                              model_metadata,
                              output_model_name);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< TextEncodingDict > &	input_cat_features,
		const ColumnList< T > &	input_numeric_features,
		const int32_t	cat_top_k,
		const float	cat_min_fraction,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 530 of file MLTableFunctions.hpp.

References CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), CategoricalFeaturesBuilder< T >::getFeatures(), and linear_reg_fit_impl().

                                                                           {
   CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
                                                      input_numeric_features,
                                                      cat_top_k,
                                                      cat_min_fraction,
                                                      false /* cat_include_others */);
 
   return linear_reg_fit_impl(mgr,
                              model_name,
                              input_labels,
                              cat_features_builder.getFeatures(),
                              cat_features_builder.getCatFeatureKeys(),
                              preferred_ml_framework_str,
                              model_metadata,
                              output_model_name);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t linear_reg_fit__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< TextEncodingDict > &	input_cat_features,
		const int32_t	cat_top_k,
		const float	cat_min_fraction,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 571 of file MLTableFunctions.hpp.

References CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), CategoricalFeaturesBuilder< T >::getFeatures(), and linear_reg_fit_impl().

                                                                           {
   CategoricalFeaturesBuilder<T> cat_features_builder(
       input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
 
   return linear_reg_fit_impl(mgr,
                              model_name,
                              input_labels,
                              cat_features_builder.getFeatures(),
                              cat_features_builder.getCatFeatureKeys(),
                              preferred_ml_framework_str,
                              model_metadata,
                              output_model_name);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t linear_reg_fit_impl	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< T > &	input_features,
		const std::vector< std::vector< std::string >> &	cat_feature_keys,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 275 of file MLTableFunctions.hpp.

References MLModelMap::addModel(), DEFAULT, TableFunctions_Namespace::denull_data(), g_ml_models, get_ml_framework(), Column< TextEncodingDict >::getOrAddTransient(), TextEncodingNone::getString(), INVALID, MLPACK, ColumnList< T >::numCols(), ONEAPI, ONEDAL, pluck_ptrs(), TableFunctionManager::set_output_row_size(), and Column< T >::size().

Referenced by linear_reg_fit__cpu_template().

                                                                  {
   if (input_labels.size() == 0) {
     return mgr.ERROR_MESSAGE(
         "No rows exist in training data. Training data must at least contain 1 row.");
   }
   const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
   if (preferred_ml_framework == MLFramework::INVALID) {
     return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
                              preferred_ml_framework_str.getString());
   }
   const auto denulled_data = denull_data(input_labels, input_features);
   const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
   const auto features_ptrs =
       pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
   const int64_t num_coefs = input_features.numCols() + 1;
   mgr.set_output_row_size(num_coefs);
   std::vector<int64_t> coef_idxs(num_coefs);
   std::vector<double> coefs(num_coefs);
   try {
     bool did_execute = false;
 #ifdef HAVE_ONEDAL
     // FIXME: We default to legacy DAAL Linear Regression, as the oneAPI implementation
     // seems to be experimental. It crashes on a few small toy models (such as datasets
     // with 1 datapoint) and finds different coefficients for large models, when compared
     // with the DAAL implementation. This should be revisited when oneDAL is updated.
     if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
                          preferred_ml_framework == MLFramework::DEFAULT)) {
       onedal_linear_reg_fit_impl(labels_ptrs[0],
                                  features_ptrs,
                                  coef_idxs.data(),
                                  coefs.data(),
                                  denulled_data.masked_num_rows);
       did_execute = true;
     } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI)) {
       onedal_oneapi_linear_reg_fit_impl(labels_ptrs[0],
                                         features_ptrs,
                                         coef_idxs.data(),
                                         coefs.data(),
                                         denulled_data.masked_num_rows);
       did_execute = true;
     }
 #endif
 #ifdef HAVE_MLPACK
     if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
                          preferred_ml_framework == MLFramework::DEFAULT)) {
       mlpack_linear_reg_fit_impl(labels_ptrs[0],
                                  features_ptrs,
                                  coef_idxs.data(),
                                  coefs.data(),
                                  denulled_data.masked_num_rows);
       did_execute = true;
     }
 #endif
     if (!did_execute) {
       return mgr.ERROR_MESSAGE(
           "Cannot find " + preferred_ml_framework_str.getString() +
           " ML library to support linear regression implementation.");
     }
   } catch (std::runtime_error& e) {
     return mgr.ERROR_MESSAGE(e.what());
   }
   auto model =
       std::make_shared<LinearRegressionModel>(coefs, model_metadata, cat_feature_keys);
   g_ml_models.addModel(model_name, model);
   const std::string model_name_str = model_name.getString();
   const TextEncodingDict model_name_str_id =
       output_model_name.getOrAddTransient(model_name);
   output_model_name[0] = model_name_str_id;
   return 1;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename T , typename K >

NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< K > &	input_ids,
		const ColumnList< T > &	input_features,
		const TextEncodingNone &	preferred_ml_framework_str,
		Column< K > &	output_ids,
		Column< T > &	output_predictions
	)

Definition at line 1801 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), ml_reg_predict_impl(), and ColumnList< T >::numCols().

Referenced by ml_reg_predict__cpu_template().

                                                             {
   try {
     const auto model = g_ml_models.getModel(model_name);
     check_model_params(model, 0, input_features.numCols());
     return ml_reg_predict_impl(mgr,
                                model,
                                input_ids,
                                input_features,
                                preferred_ml_framework_str,
                                output_ids,
                                output_predictions);
   } catch (std::runtime_error& e) {
     const std::string error_str(e.what());
     return mgr.ERROR_MESSAGE(error_str);
   }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename T , typename K >

NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< K > &	input_ids,
		const ColumnList< TextEncodingDict > &	input_cat_features,
		const ColumnList< T > &	input_numeric_features,
		const TextEncodingNone &	preferred_ml_framework_str,
		Column< K > &	output_ids,
		Column< T > &	output_predictions
	)

Definition at line 1837 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), ml_reg_predict_impl(), ColumnList< T >::numCols(), and ColumnList< TextEncodingDict >::numCols().

                                                             {
   try {
     const auto model = g_ml_models.getModel(model_name);
     check_model_params(
         model, input_cat_features.numCols(), input_numeric_features.numCols());
     CategoricalFeaturesBuilder<T> cat_features_builder(
         input_cat_features, input_numeric_features, model->getCatFeatureKeys());
     return ml_reg_predict_impl(mgr,
                                model,
                                input_ids,
                                cat_features_builder.getFeatures(),
                                preferred_ml_framework_str,
                                output_ids,
                                output_predictions);
   } catch (std::runtime_error& e) {
     const std::string error_str(e.what());
     return mgr.ERROR_MESSAGE(error_str);
   }
 }

Here is the call graph for this function:

template<typename T , typename K >

NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< K > &	input_ids,
		const ColumnList< TextEncodingDict > &	input_cat_features,
		const TextEncodingNone &	preferred_ml_framework_str,
		Column< K > &	output_ids,
		Column< T > &	output_predictions
	)

Definition at line 1877 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), ml_reg_predict_impl(), and ColumnList< TextEncodingDict >::numCols().

                                                             {
   try {
     const auto model = g_ml_models.getModel(model_name);
     check_model_params(model, input_cat_features.numCols(), 0);
     CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
                                                        model->getCatFeatureKeys());
     return ml_reg_predict_impl(mgr,
                                model,
                                input_ids,
                                cat_features_builder.getFeatures(),
                                preferred_ml_framework_str,
                                output_ids,
                                output_predictions);
   } catch (std::runtime_error& e) {
     const std::string error_str(e.what());
     return mgr.ERROR_MESSAGE(error_str);
   }
 }

Here is the call graph for this function:

template<typename T , typename K >

NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template	(	TableFunctionManager &	mgr,
		const Column< TextEncodingDict > &	model_name,
		const Column< K > &	input_ids,
		const ColumnList< T > &	input_features,
		const TextEncodingNone &	preferred_ml_framework_str,
		Column< K > &	output_ids,
		Column< T > &	output_predictions
	)

Definition at line 1915 of file MLTableFunctions.hpp.

References Column< TextEncodingDict >::getString(), ml_reg_predict__cpu_template(), and Column< TextEncodingDict >::size().

                                                             {
   if (model_name.size() != 1) {
     return mgr.ERROR_MESSAGE("Expected only one row in model CURSOR.");
   }
   TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
   return ml_reg_predict__cpu_template(mgr,
                                       model_name_text_enc_none,
                                       input_ids,
                                       input_features,
                                       preferred_ml_framework_str,
                                       output_ids,
                                       output_predictions);
 }

Here is the call graph for this function:

template<typename T , typename K >

NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template	(	TableFunctionManager &	mgr,
		const Column< TextEncodingDict > &	model_name,
		const Column< K > &	input_ids,
		const ColumnList< TextEncodingDict > &	input_cat_features,
		const ColumnList< T > &	input_numeric_features,
		const TextEncodingNone &	preferred_ml_framework_str,
		Column< K > &	output_ids,
		Column< T > &	output_predictions
	)

Definition at line 1948 of file MLTableFunctions.hpp.

References Column< TextEncodingDict >::getString(), ml_reg_predict__cpu_template(), and Column< TextEncodingDict >::size().

                                                             {
   if (model_name.size() != 1) {
     return mgr.ERROR_MESSAGE("Expected only one row in model CURSOR.");
   }
   TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
   return ml_reg_predict__cpu_template(mgr,
                                       model_name_text_enc_none,
                                       input_ids,
                                       input_cat_features,
                                       input_numeric_features,
                                       preferred_ml_framework_str,
                                       output_ids,
                                       output_predictions);
 }

Here is the call graph for this function:

template<typename T , typename K >

NEVER_INLINE HOST int32_t ml_reg_predict__cpu_template	(	TableFunctionManager &	mgr,
		const Column< TextEncodingDict > &	model_name,
		const Column< K > &	input_ids,
		const ColumnList< TextEncodingDict > &	input_cat_features,
		const TextEncodingNone &	preferred_ml_framework_str,
		Column< K > &	output_ids,
		Column< T > &	output_predictions
	)

Definition at line 1983 of file MLTableFunctions.hpp.

References Column< TextEncodingDict >::getString(), ml_reg_predict__cpu_template(), and Column< TextEncodingDict >::size().

                                                             {
   if (model_name.size() != 1) {
     return mgr.ERROR_MESSAGE("Expected only one row in model CURSOR.");
   }
   TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
   return ml_reg_predict__cpu_template(mgr,
                                       model_name_text_enc_none,
                                       input_ids,
                                       input_cat_features,
                                       preferred_ml_framework_str,
                                       output_ids,
                                       output_predictions);
 }

Here is the call graph for this function:

template<typename T , typename K >

NEVER_INLINE HOST int32_t ml_reg_predict_impl	(	TableFunctionManager &	mgr,
		const std::shared_ptr< AbstractMLModel > &	model,
		const Column< K > &	input_ids,
		const ColumnList< T > &	input_features,
		const TextEncodingNone &	preferred_ml_framework_str,
		Column< K > &	output_ids,
		Column< T > &	output_predictions
	)

Definition at line 1664 of file MLTableFunctions.hpp.

References CHECK, DECISION_TREE_REG, DEFAULT, TableFunctions_Namespace::denull_data(), GBT_REG, get_ml_framework(), TextEncodingNone::getString(), INVALID, LINEAR_REG, MLPACK, ColumnList< T >::numCols(), ONEAPI, ONEDAL, pluck_ptrs(), Column< T >::ptr_, RANDOM_FOREST_REG, TableFunctionManager::set_output_row_size(), Column< T >::size(), heavydb.dtypes::T, and TableFunctions_Namespace::unmask_data().

Referenced by ml_reg_predict__cpu_template(), and r2_score_impl().

                                                    {
   // Runs regression inference for an already-resolved model. On success the
   // output row count is set to input_ids.size(), input_ids is assigned to
   // output_ids, predictions are written to output_predictions, and
   // input_ids.size() is returned. Failures are returned via mgr.ERROR_MESSAGE.
   //
   // An unrecognized framework string is rejected before any data is touched.
   const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
   if (preferred_ml_framework == MLFramework::INVALID) {
     return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
                              preferred_ml_framework_str.getString());
   }
   // NOTE(review): denull_data presumably strips rows containing nulls; confirm
   // against NullRowsRemoval.h. The data counts as "masked" when fewer rows
   // remain (masked_num_rows) than were supplied (unmasked_num_rows).
   const auto denulled_data = denull_data(input_features);
   const int64_t num_rows = denulled_data.masked_num_rows;
   const bool data_is_masked =
       denulled_data.masked_num_rows < denulled_data.unmasked_num_rows;
   // Scratch buffer for predictions; only sized when the data is masked.
   std::vector<T> denulled_output_allocation(data_is_masked ? num_rows : 0);
   // NOTE(review): the output row size is set before output_predictions.ptr_ is
   // read below; presumably this is what makes the output buffer valid. Confirm.
   mgr.set_output_row_size(input_ids.size());
   // Masked data predicts into the scratch buffer (scattered back by unmask_data
   // at the end); unmasked data predicts straight into the output column.
   T* denulled_output =
       data_is_masked ? denulled_output_allocation.data() : output_predictions.ptr_;
   const auto features_ptrs = pluck_ptrs(denulled_data.data, 0L, input_features.numCols());
 
   try {
     // Stays false if no compiled-in backend matches the requested framework for
     // this model type; that case is reported as an error after the switch.
     bool did_execute = false;
     const auto model_type = model->getModelType();
     switch (model_type) {
       case MLModelType::LINEAR_REG: {
         const auto linear_reg_model =
             std::dynamic_pointer_cast<LinearRegressionModel>(model);
         CHECK(linear_reg_model);
 #ifdef HAVE_ONEDAL
         // With oneDAL compiled in, DEFAULT and ONEAPI select the oneAPI
         // implementation; ONEDAL selects the oneDAL implementation.
         if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI ||
                              preferred_ml_framework == MLFramework::DEFAULT)) {
           onedal_oneapi_linear_reg_predict_impl(
               linear_reg_model, features_ptrs, denulled_output, num_rows);
           did_execute = true;
         } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL)) {
           onedal_linear_reg_predict_impl(
               linear_reg_model, features_ptrs, denulled_output, num_rows);
           did_execute = true;
         }
 #endif
 #ifdef HAVE_MLPACK
         // MLPACK runs when requested explicitly, or for DEFAULT when nothing
         // above has executed.
         if (!did_execute && (preferred_ml_framework == MLFramework::MLPACK ||
                              preferred_ml_framework == MLFramework::DEFAULT)) {
           mlpack_linear_reg_predict_impl(
               linear_reg_model, features_ptrs, denulled_output, num_rows);
           did_execute = true;
         }
 #endif
         break;
       }
       case MLModelType::DECISION_TREE_REG: {
 #ifdef HAVE_ONEDAL
         // Only handled when oneDAL is compiled in, for ONEDAL or DEFAULT.
         const auto decision_tree_reg_model =
             std::dynamic_pointer_cast<DecisionTreeRegressionModel>(model);
         CHECK(decision_tree_reg_model);
         if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
                              preferred_ml_framework == MLFramework::DEFAULT)) {
           onedal_decision_tree_reg_predict_impl(
               decision_tree_reg_model, features_ptrs, denulled_output, num_rows);
           did_execute = true;
         }
 #endif
         break;
       }
       case MLModelType::GBT_REG: {
 #ifdef HAVE_ONEDAL
         // Only handled when oneDAL is compiled in, for ONEDAL or DEFAULT.
         const auto gbt_reg_model = std::dynamic_pointer_cast<GbtRegressionModel>(model);
         CHECK(gbt_reg_model);
         if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL ||
                              preferred_ml_framework == MLFramework::DEFAULT)) {
           onedal_gbt_reg_predict_impl(
               gbt_reg_model, features_ptrs, denulled_output, num_rows);
           did_execute = true;
         }
 #endif
         break;
       }
       case MLModelType::RANDOM_FOREST_REG: {
 #ifdef HAVE_ONEDAL
         // The stored model may be either of two concrete classes; the one whose
         // downcast succeeds decides which predict routine is called,
         // independent of which of the three accepted frameworks was requested.
         const auto random_forest_reg_model =
             std::dynamic_pointer_cast<RandomForestRegressionModel>(model);
         const auto oneapi_random_forest_reg_model =
             std::dynamic_pointer_cast<OneAPIRandomForestRegressionModel>(model);
         CHECK(random_forest_reg_model || oneapi_random_forest_reg_model);
         if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI ||
                              preferred_ml_framework == MLFramework::ONEDAL ||
                              preferred_ml_framework == MLFramework::DEFAULT)) {
           if (random_forest_reg_model) {
             onedal_random_forest_reg_predict_impl(
                 random_forest_reg_model, features_ptrs, denulled_output, num_rows);
           } else {
             onedal_oneapi_random_forest_reg_predict_impl(
                 oneapi_random_forest_reg_model, features_ptrs, denulled_output, num_rows);
           }
           did_execute = true;
         }
 #endif
         break;
       }
       default: {
         // Caught by the handler below and surfaced as an error message.
         throw std::runtime_error("Unsupported model type");
       }
     }
     if (!did_execute) {
       return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
                                " ML library to support model implementation.");
     }
   } catch (std::runtime_error& e) {
     const std::string error_str(e.what());
     return mgr.ERROR_MESSAGE(error_str);
   }
   output_ids = input_ids;
   if (data_is_masked) {
     // Scatter the scratch-buffer predictions back into the full-length output
     // column via reverse_index_map. NOTE(review): inline_null_value<T>() is
     // presumably the fill for rows that were removed; confirm in unmask_data.
     unmask_data(denulled_output,
                 denulled_data.reverse_index_map,
                 output_predictions.ptr_,
                 denulled_data.unmasked_num_rows,
                 inline_null_value<T>());
   }
   return input_ids.size();
 }

Here is the call graph for this function:

Here is the caller graph for this function:

EXTENSION_NOINLINE_HOST int32_t pca_fit__cpu_1	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const ColumnList< TextEncodingDict > &	input_cat_features,
		const int32_t	cat_top_k,
		const float	cat_min_fraction,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 68 of file MLTableFunctions.cpp.

References CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), CategoricalFeaturesBuilder< T >::getFeatures(), and pca_fit_impl().

                                                             {
   CategoricalFeaturesBuilder<double> cat_features_builder(
       input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
   return pca_fit_impl(mgr,
                       model_name,
                       cat_features_builder.getFeatures(),
                       cat_features_builder.getCatFeatureKeys(),
                       preferred_ml_framework_str,
                       model_metadata,
                       output_model_name);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t pca_fit__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const ColumnList< T > &	input_features,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 1585 of file MLTableFunctions.hpp.

References pca_fit_impl().

                                                                    {
   std::vector<std::vector<std::string>> empty_cat_feature_keys;
   return pca_fit_impl(mgr,
                       model_name,
                       input_features,
                       empty_cat_feature_keys,
                       preferred_ml_framework_str,
                       model_metadata,
                       output_model_name);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t pca_fit__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const ColumnList< TextEncodingDict > &	input_cat_features,
		const ColumnList< T > &	input_numeric_features,
		const int32_t	cat_top_k,
		const float	cat_min_fraction,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 1616 of file MLTableFunctions.hpp.

References CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), CategoricalFeaturesBuilder< T >::getFeatures(), and pca_fit_impl().

                                                                    {
   CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
                                                      input_numeric_features,
                                                      cat_top_k,
                                                      cat_min_fraction,
                                                      false /* cat_include_others */);
   return pca_fit_impl(mgr,
                       model_name,
                       cat_features_builder.getFeatures(),
                       cat_features_builder.getCatFeatureKeys(),
                       preferred_ml_framework_str,
                       model_metadata,
                       output_model_name);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t pca_fit_impl	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const ColumnList< T > &	input_features,
		const std::vector< std::vector< std::string >> &	cat_feature_keys,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 1497 of file MLTableFunctions.hpp.

References MLModelMap::addModel(), DEFAULT, TableFunctions_Namespace::denull_data(), g_ml_models, get_ml_framework(), Column< TextEncodingDict >::getOrAddTransient(), TextEncodingNone::getString(), INVALID, ColumnList< T >::numCols(), ONEAPI, ONEDAL, pluck_ptrs(), TableFunctionManager::set_output_row_size(), ColumnList< T >::size(), and z_std_normalize_data_with_summary_stats().

Referenced by pca_fit__cpu_1(), and pca_fit__cpu_template().

                                                           {
   if (input_features.size() == 0) {
     return mgr.ERROR_MESSAGE(
         "No rows exist in training data. Training data must at least contain 1 row.");
   }
   const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
   if (preferred_ml_framework == MLFramework::INVALID) {
     return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
                              preferred_ml_framework_str.getString());
   }
   try {
     const auto denulled_data = denull_data(input_features);
     const int64_t num_rows = denulled_data.masked_num_rows;
     if (num_rows == 0) {
       return mgr.ERROR_MESSAGE(
           "No non-null rows exist in training data. Training data must at least contain "
           "1 "
           "non-null row.");
     }
     const auto features_ptrs =
         pluck_ptrs(denulled_data.data, 0L, input_features.numCols());
     // z_std_normalize_data_with_summary_stats can throw if std dev is 0
     const auto z_std_norm_summary_stats =
         z_std_normalize_data_with_summary_stats(denulled_data.data, num_rows);
     const auto normalized_ptrs =
         pluck_ptrs(z_std_norm_summary_stats.normalized_data,
                    0L,
                    z_std_norm_summary_stats.normalized_data.size());
     bool did_execute = false;
 #ifdef HAVE_ONEDAL
     if (preferred_ml_framework == MLFramework::ONEAPI ||
         preferred_ml_framework == MLFramework::DEFAULT) {
       const auto [eigenvectors, eigenvalues] =
           onedal_oneapi_pca_impl(normalized_ptrs, denulled_data.masked_num_rows);
       auto model = std::make_shared<PcaModel>(z_std_norm_summary_stats.means,
                                               z_std_norm_summary_stats.std_devs,
                                               eigenvectors,
                                               eigenvalues,
                                               model_metadata,
                                               cat_feature_keys);
       g_ml_models.addModel(model_name, model);
       did_execute = true;
     } else if (preferred_ml_framework == MLFramework::ONEDAL) {
       const auto [eigenvectors, eigenvalues] =
           onedal_pca_impl(normalized_ptrs, denulled_data.masked_num_rows);
       auto model = std::make_shared<PcaModel>(z_std_norm_summary_stats.means,
                                               z_std_norm_summary_stats.std_devs,
                                               eigenvectors,
                                               eigenvalues,
                                               model_metadata,
                                               cat_feature_keys);
       g_ml_models.addModel(model_name, model);
       did_execute = true;
     }
 #endif
     if (!did_execute) {
       return mgr.ERROR_MESSAGE("Cannot find " + preferred_ml_framework_str.getString() +
                                " ML library to support PCA implementation.");
     }
     mgr.set_output_row_size(1);
     const TextEncodingDict model_name_str_id =
         output_model_name.getOrAddTransient(model_name);
     output_model_name[0] = model_name_str_id;
     return 1;
   } catch (std::runtime_error& e) {
     return mgr.ERROR_MESSAGE(e.what());
   }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename T >

std::vector<const T*> pluck_ptrs	(	const std::vector< std::vector< T >> &	data,
		const int64_t	start_idx,
		const int64_t	end_idx
	)

Definition at line 43 of file MLTableFunctions.hpp.

References CHECK_GE, CHECK_GT, and CHECK_LE.

Referenced by dbscan__cpu_template(), decision_tree_reg_impl(), gbt_reg_fit_impl(), kmeans__cpu_template(), linear_reg_fit_impl(), ml_reg_predict_impl(), pca_fit_impl(), and random_forest_reg_fit_impl().

                                                         {
   std::vector<const T*> raw_ptrs;
   CHECK_GE(start_idx, 0L);
   CHECK_GT(end_idx, start_idx);
   CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
   for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
     raw_ptrs.emplace_back(data[col_idx].data());
   }
   return raw_ptrs;
 }

Here is the caller graph for this function:

template<typename T >

std::vector<const T*> pluck_ptrs	(	const std::vector< T * > &	data,
		const int64_t	start_idx,
		const int64_t	end_idx
	)

Definition at line 57 of file MLTableFunctions.hpp.

References CHECK_GE, CHECK_GT, and CHECK_LE.

                                                         {
   std::vector<const T*> raw_ptrs;
   CHECK_GE(start_idx, 0L);
   CHECK_GT(end_idx, start_idx);
   CHECK_LE(end_idx, static_cast<int64_t>(data.size()));
   for (int64_t col_idx = start_idx; col_idx < end_idx; ++col_idx) {
     raw_ptrs.emplace_back(data[col_idx]);
   }
   return raw_ptrs;
 }

template<typename T >

NEVER_INLINE HOST int32_t r2_score__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< T > &	input_features,
		Column< double > &	output_r2
	)

Definition at line 2097 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), ColumnList< T >::numCols(), and r2_score_impl().

Referenced by r2_score__cpu_template().

                                                                             {
   try {
     const auto model = g_ml_models.getModel(model_name);
     check_model_params(model, 0, input_features.numCols());
     return r2_score_impl(mgr, model, input_labels, input_features, output_r2);
   } catch (std::runtime_error& e) {
     const std::string error_str(e.what());
     return mgr.ERROR_MESSAGE(error_str);
   }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t r2_score__cpu_template	(	TableFunctionManager &	mgr,
		const Column< TextEncodingDict > &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< T > &	input_features,
		Column< double > &	output_r2
	)

Definition at line 2123 of file MLTableFunctions.hpp.

References Column< TextEncodingDict >::getString(), r2_score__cpu_template(), and Column< TextEncodingDict >::size().

                                                   {
   if (model_name.size() != 1) {
     return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
   }
   TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
   return r2_score__cpu_template(
       mgr, model_name_text_enc_none, input_labels, input_features, output_r2);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t r2_score__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< TextEncodingDict > &	input_cat_features,
		const ColumnList< T > &	input_numeric_features,
		Column< double > &	output_r2
	)

Definition at line 2146 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), ColumnList< T >::numCols(), ColumnList< TextEncodingDict >::numCols(), and r2_score_impl().

                                                   {
   try {
     const auto model = g_ml_models.getModel(model_name);
     check_model_params(
         model, input_cat_features.numCols(), input_numeric_features.numCols());
     CategoricalFeaturesBuilder<T> cat_features_builder(
         input_cat_features, input_numeric_features, model->getCatFeatureKeys());
     return r2_score_impl(
         mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
   } catch (std::runtime_error& e) {
     const std::string error_str(e.what());
     return mgr.ERROR_MESSAGE(error_str);
   }
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t r2_score__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< TextEncodingDict > &	input_cat_features,
		Column< double > &	output_r2
	)

Definition at line 2176 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), ColumnList< TextEncodingDict >::numCols(), and r2_score_impl().

                                                   {
   try {
     const auto model = g_ml_models.getModel(model_name);
     check_model_params(model, input_cat_features.numCols(), 0);
     CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
                                                        model->getCatFeatureKeys());
     return r2_score_impl(
         mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
   } catch (std::runtime_error& e) {
     const std::string error_str(e.what());
     return mgr.ERROR_MESSAGE(error_str);
   }
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t r2_score__cpu_template	(	TableFunctionManager &	mgr,
		const Column< TextEncodingDict > &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< TextEncodingDict > &	input_cat_features,
		const ColumnList< T > &	input_numeric_features,
		Column< double > &	output_r2
	)

Definition at line 2204 of file MLTableFunctions.hpp.

References check_model_params(), g_ml_models, MLModelMap::getModel(), Column< TextEncodingDict >::getString(), ColumnList< T >::numCols(), ColumnList< TextEncodingDict >::numCols(), r2_score_impl(), and Column< TextEncodingDict >::size().

                                                   {
   if (model_name.size() != 1) {
     return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
   }
   const std::string model_name_str{model_name.getString(0)};
   try {
     const auto model = g_ml_models.getModel(model_name_str);
     check_model_params(
         model, input_cat_features.numCols(), input_numeric_features.numCols());
     CategoricalFeaturesBuilder<T> cat_features_builder(
         input_cat_features, input_numeric_features, model->getCatFeatureKeys());
     return r2_score_impl(
         mgr, model, input_labels, cat_features_builder.getFeatures(), output_r2);
   } catch (std::runtime_error& e) {
     const std::string error_str(e.what());
     return mgr.ERROR_MESSAGE(error_str);
   }
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t r2_score_impl	(	TableFunctionManager &	mgr,
		const std::shared_ptr< AbstractMLModel > &	model,
		const Column< T > &	input_labels,
		const ColumnList< T > &	input_features,
		Column< double > &	output_r2
	)

Definition at line 2004 of file MLTableFunctions.hpp.

References TableFunctionManager::disable_output_allocations(), TableFunctionManager::enable_output_allocations(), get_column_mean(), max_inputs_per_thread, ml_reg_predict_impl(), threading_serial::parallel_for(), TableFunctionManager::set_output_row_size(), and Column< T >::size().

Referenced by r2_score__cpu_template().

                                                                    {
   const int64_t num_rows = input_labels.size();
   if (num_rows == 0) {
     return mgr.ERROR_MESSAGE(
         "No rows exist in evaluation data. Evaluation data must at least contain 1 row.");
   }
   std::vector<T> output_predictions_vec(num_rows);
   Column<T> output_predictions(output_predictions_vec);
   std::vector<int64_t> input_ids_vec(num_rows);
   std::vector<int64_t> output_ids_vec(num_rows);
   Column<int64_t> input_ids(input_ids_vec);
   Column<int64_t> output_ids(output_ids_vec);
   mgr.disable_output_allocations();
   TextEncodingNone ml_framework_encoding_none("DEFAULT");
 
   try {
     auto ret = ml_reg_predict_impl(mgr,
                                    model,
                                    input_ids,
                                    input_features,
                                    ml_framework_encoding_none,
                                    output_ids,
                                    output_predictions);
 
     if (ret < 0) {
       // A return of less than 0 symbolizes an error
       return ret;
     }
   } catch (std::runtime_error& e) {
     mgr.enable_output_allocations();
     return mgr.ERROR_MESSAGE(e.what());
   }
 
   mgr.enable_output_allocations();
   mgr.set_output_row_size(1);
 
   const auto labels_mean = get_column_mean(input_labels);
   const size_t max_thread_count = std::thread::hardware_concurrency();
   const size_t max_inputs_per_thread = 20000;
   const size_t num_threads = std::min(
       max_thread_count, ((num_rows + max_inputs_per_thread - 1) / max_inputs_per_thread));
 
   std::vector<double> local_sum_squared_regressions(num_threads, 0.0);
   std::vector<double> local_sum_squares(num_threads, 0.0);
 
   tbb::task_arena limited_arena(num_threads);
 
   limited_arena.execute([&] {
     tbb::parallel_for(
         tbb::blocked_range<int64_t>(0, num_rows),
         [&](const tbb::blocked_range<int64_t>& r) {
           const int64_t start_idx = r.begin();
           const int64_t end_idx = r.end();
           double local_sum_squared_regression{0.0};
           double local_sum_square{0.0};
           for (int64_t row_idx = start_idx; row_idx < end_idx; ++row_idx) {
             if (output_predictions[row_idx] != inline_null_value<T>()) {
               local_sum_squared_regression +=
                   (input_labels[row_idx] - output_predictions[row_idx]) *
                   (input_labels[row_idx] - output_predictions[row_idx]);
               local_sum_square += (input_labels[row_idx] - labels_mean) *
                                   (input_labels[row_idx] - labels_mean);
             }
           }
           const size_t thread_idx = tbb::this_task_arena::current_thread_index();
           local_sum_squared_regressions[thread_idx] += local_sum_squared_regression;
           local_sum_squares[thread_idx] += local_sum_square;
         });
   });
   double sum_squared_regression{0.0};
   double sum_squares{0.0};
   for (size_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
     sum_squared_regression += local_sum_squared_regressions[thread_idx];
     sum_squares += local_sum_squares[thread_idx];
   }
   output_r2[0] = sum_squares == 0.0 ? 1.0 : 1.0 - (sum_squared_regression / sum_squares);
   return 1;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< T > &	input_features,
		const int64_t	num_trees,
		const double	obs_per_tree_fraction,
		const int64_t	max_tree_depth,
		const int64_t	features_per_node,
		const double	impurity_threshold,
		const bool	bootstrap,
		const int64_t	min_obs_per_leaf_node,
		const int64_t	min_obs_per_split_node,
		const double	min_weight_fraction_in_leaf_node,
		const double	min_impurity_decrease_in_split_node,
		const int64_t	max_leaf_nodes,
		const bool	use_histogram,
		const TextEncodingNone &	var_importance_metric_str,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 1297 of file MLTableFunctions.hpp.

References random_forest_reg_fit_impl().

                                                                                  {
   std::vector<std::vector<std::string>> empty_cat_feature_keys;
   return random_forest_reg_fit_impl(mgr,
                                     model_name,
                                     input_labels,
                                     input_features,
                                     empty_cat_feature_keys,
                                     num_trees,
                                     obs_per_tree_fraction,
                                     max_tree_depth,
                                     features_per_node,
                                     impurity_threshold,
                                     bootstrap,
                                     min_obs_per_leaf_node,
                                     min_obs_per_split_node,
                                     min_weight_fraction_in_leaf_node,
                                     min_impurity_decrease_in_split_node,
                                     max_leaf_nodes,
                                     use_histogram,
                                     var_importance_metric_str,
                                     preferred_ml_framework_str,
                                     model_metadata,
                                     output_model_name);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< TextEncodingDict > &	input_cat_features,
		const ColumnList< T > &	input_numeric_features,
		const int64_t	num_trees,
		const double	obs_per_tree_fraction,
		const int64_t	max_tree_depth,
		const int64_t	features_per_node,
		const double	impurity_threshold,
		const bool	bootstrap,
		const int64_t	min_obs_per_leaf_node,
		const int64_t	min_obs_per_split_node,
		const double	min_weight_fraction_in_leaf_node,
		const double	min_impurity_decrease_in_split_node,
		const int64_t	max_leaf_nodes,
		const bool	use_histogram,
		const TextEncodingNone &	var_importance_metric_str,
		const int32_t	cat_top_k,
		const float	cat_min_fraction,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 1368 of file MLTableFunctions.hpp.

References CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), CategoricalFeaturesBuilder< T >::getFeatures(), and random_forest_reg_fit_impl().

                                                  {
   CategoricalFeaturesBuilder<T> cat_features_builder(input_cat_features,
                                                      input_numeric_features,
                                                      cat_top_k,
                                                      cat_min_fraction,
                                                      false /* cat_include_others */);
   return random_forest_reg_fit_impl(mgr,
                                     model_name,
                                     input_labels,
                                     cat_features_builder.getFeatures(),
                                     cat_features_builder.getCatFeatureKeys(),
                                     num_trees,
                                     obs_per_tree_fraction,
                                     max_tree_depth,
                                     features_per_node,
                                     impurity_threshold,
                                     bootstrap,
                                     min_obs_per_leaf_node,
                                     min_obs_per_split_node,
                                     min_weight_fraction_in_leaf_node,
                                     min_impurity_decrease_in_split_node,
                                     max_leaf_nodes,
                                     use_histogram,
                                     var_importance_metric_str,
                                     preferred_ml_framework_str,
                                     model_metadata,
                                     output_model_name);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t random_forest_reg_fit__cpu_template	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< TextEncodingDict > &	input_cat_features,
		const int64_t	num_trees,
		const double	obs_per_tree_fraction,
		const int64_t	max_tree_depth,
		const int64_t	features_per_node,
		const double	impurity_threshold,
		const bool	bootstrap,
		const int64_t	min_obs_per_leaf_node,
		const int64_t	min_obs_per_split_node,
		const double	min_weight_fraction_in_leaf_node,
		const double	min_impurity_decrease_in_split_node,
		const int64_t	max_leaf_nodes,
		const bool	use_histogram,
		const TextEncodingNone &	var_importance_metric_str,
		const int32_t	cat_top_k,
		const float	cat_min_fraction,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 1447 of file MLTableFunctions.hpp.

References CategoricalFeaturesBuilder< T >::getCatFeatureKeys(), CategoricalFeaturesBuilder< T >::getFeatures(), and random_forest_reg_fit_impl().

                                                  {
   CategoricalFeaturesBuilder<T> cat_features_builder(
       input_cat_features, cat_top_k, cat_min_fraction, false /* cat_include_others */);
   return random_forest_reg_fit_impl(mgr,
                                     model_name,
                                     input_labels,
                                     cat_features_builder.getFeatures(),
                                     cat_features_builder.getCatFeatureKeys(),
                                     num_trees,
                                     obs_per_tree_fraction,
                                     max_tree_depth,
                                     features_per_node,
                                     impurity_threshold,
                                     bootstrap,
                                     min_obs_per_leaf_node,
                                     min_obs_per_split_node,
                                     min_weight_fraction_in_leaf_node,
                                     min_impurity_decrease_in_split_node,
                                     max_leaf_nodes,
                                     use_histogram,
                                     var_importance_metric_str,
                                     preferred_ml_framework_str,
                                     model_metadata,
                                     output_model_name);
 }

Here is the call graph for this function:

template<typename T >

NEVER_INLINE HOST int32_t random_forest_reg_fit_impl	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		const Column< T > &	input_labels,
		const ColumnList< T > &	input_features,
		const std::vector< std::vector< std::string >> &	cat_feature_keys,
		const int64_t	num_trees,
		const double	obs_per_tree_fraction,
		const int64_t	max_tree_depth,
		const int64_t	features_per_node,
		const double	impurity_threshold,
		const bool	bootstrap,
		const int64_t	min_obs_per_leaf_node,
		const int64_t	min_obs_per_split_node,
		const double	min_weight_fraction_in_leaf_node,
		const double	min_impurity_decrease_in_split_node,
		const int64_t	max_leaf_nodes,
		const bool	use_histogram,
		const TextEncodingNone &	var_importance_metric_str,
		const TextEncodingNone &	preferred_ml_framework_str,
		const TextEncodingNone &	model_metadata,
		Column< TextEncodingDict > &	output_model_name
	)

Definition at line 1106 of file MLTableFunctions.hpp.

References DEFAULT, TableFunctions_Namespace::denull_data(), get_ml_framework(), get_var_importance_metric(), Column< TextEncodingDict >::getOrAddTransient(), TextEncodingNone::getString(), INVALID, MLPACK, ColumnList< T >::numCols(), ONEAPI, ONEDAL, pluck_ptrs(), TableFunctionManager::set_output_row_size(), Column< T >::size(), and heavydb.dtypes::T.

Referenced by random_forest_reg_fit__cpu_template().

                                                                         {
   if (input_labels.size() == 0) {
     return mgr.ERROR_MESSAGE(
         "No rows exist in training data. Training data must at least contain 1 row.");
   }
   const auto preferred_ml_framework = get_ml_framework(preferred_ml_framework_str);
   if (preferred_ml_framework == MLFramework::INVALID) {
     return mgr.ERROR_MESSAGE("Invalid ML Framework: " +
                              preferred_ml_framework_str.getString());
   }
   if (preferred_ml_framework == MLFramework::MLPACK) {
     return mgr.ERROR_MESSAGE(
         "Only OneDAL framework supported for random forest regression.");
   }
 #ifndef HAVE_ONEDAL
   return mgr.ERROR_MESSAGE(
       "Only OneDAL framework supported for random forest regression.");
 #endif
 
   const auto denulled_data = denull_data(input_labels, input_features);
   const auto labels_ptrs = pluck_ptrs(denulled_data.data, 0L, 1L);
   const auto features_ptrs =
       pluck_ptrs(denulled_data.data, 1L, input_features.numCols() + 1);
   mgr.set_output_row_size(1);
   try {
     bool did_execute = false;
     const auto var_importance_metric =
         get_var_importance_metric(var_importance_metric_str);
     if (var_importance_metric == VarImportanceMetric::INVALID) {
       return mgr.ERROR_MESSAGE("Invalid variable importance metric: " +
                                var_importance_metric_str.getString());
     }
 #ifdef HAVE_ONEDAL
     if (!did_execute && (preferred_ml_framework == MLFramework::ONEAPI ||
                          preferred_ml_framework == MLFramework::DEFAULT)) {
       if (use_histogram) {
         onedal_oneapi_random_forest_reg_fit_impl<
             T,
             oneapi::dal::decision_forest::method::hist>(
             model_name,
             labels_ptrs[0],
             features_ptrs,
             model_metadata,
             cat_feature_keys,
             denulled_data.masked_num_rows,
             num_trees,
             obs_per_tree_fraction,
             max_tree_depth,
             features_per_node,
             impurity_threshold,
             bootstrap,
             min_obs_per_leaf_node,
             min_obs_per_split_node,
             min_weight_fraction_in_leaf_node,
             min_impurity_decrease_in_split_node,
             max_leaf_nodes,
             var_importance_metric);
       } else {
         onedal_oneapi_random_forest_reg_fit_impl<
             T,
             oneapi::dal::decision_forest::method::dense>(
             model_name,
             labels_ptrs[0],
             features_ptrs,
             model_metadata,
             cat_feature_keys,
             denulled_data.masked_num_rows,
             num_trees,
             obs_per_tree_fraction,
             max_tree_depth,
             features_per_node,
             impurity_threshold,
             bootstrap,
             min_obs_per_leaf_node,
             min_obs_per_split_node,
             min_weight_fraction_in_leaf_node,
             min_impurity_decrease_in_split_node,
             max_leaf_nodes,
             var_importance_metric);
       }
       const TextEncodingDict model_name_str_id =
           output_model_name.getOrAddTransient(model_name);
       output_model_name[0] = model_name_str_id;
       did_execute = true;
     } else if (!did_execute && (preferred_ml_framework == MLFramework::ONEDAL)) {
       if (use_histogram) {
         onedal_random_forest_reg_fit_impl<T, decision_forest::regression::training::hist>(
             model_name,
             labels_ptrs[0],
             features_ptrs,
             model_metadata,
             cat_feature_keys,
             denulled_data.masked_num_rows,
             num_trees,
             obs_per_tree_fraction,
             max_tree_depth,
             features_per_node,
             impurity_threshold,
             bootstrap,
             min_obs_per_leaf_node,
             min_obs_per_split_node,
             min_weight_fraction_in_leaf_node,
             min_impurity_decrease_in_split_node,
             max_leaf_nodes,
             var_importance_metric);
       } else {
         onedal_random_forest_reg_fit_impl<
             T,
             decision_forest::regression::training::defaultDense>(
             model_name,
             labels_ptrs[0],
             features_ptrs,
             model_metadata,
             cat_feature_keys,
             denulled_data.masked_num_rows,
             num_trees,
             obs_per_tree_fraction,
             max_tree_depth,
             features_per_node,
             impurity_threshold,
             bootstrap,
             min_obs_per_leaf_node,
             min_obs_per_split_node,
             min_weight_fraction_in_leaf_node,
             min_impurity_decrease_in_split_node,
             max_leaf_nodes,
             var_importance_metric);
       }
       const TextEncodingDict model_name_str_id =
           output_model_name.getOrAddTransient(model_name);
       output_model_name[0] = model_name_str_id;
       did_execute = true;
     }
 #endif
     if (!did_execute) {
       return mgr.ERROR_MESSAGE(
           "Cannot find " + preferred_ml_framework_str.getString() +
           " ML library to support random forest regression implementation.");
     }
   } catch (std::runtime_error& e) {
     return mgr.ERROR_MESSAGE(e.what());
   }
   return 1;
 }

Here is the call graph for this function:

Here is the caller graph for this function:

EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_1	(	TableFunctionManager &	mgr,
		const TextEncodingNone &	model_name,
		Column< int64_t > &	feature_id,
		Column< TextEncodingDict > &	feature,
		Column< int64_t > &	sub_feature_id,
		Column< TextEncodingDict > &	sub_feature,
		Column< double > &	importance_score
	)

Definition at line 174 of file MLTableFunctions.cpp.

References g_ml_models, get_model_features(), MLModelMap::getModel(), Column< TextEncodingDict >::getOrAddTransient(), and TableFunctionManager::set_output_row_size().

Referenced by random_forest_reg_var_importance__cpu_2().

                                                                           {
 #ifndef HAVE_ONEDAL
   return mgr.ERROR_MESSAGE(
       "Only OneDAL framework supported for random forest regression.");
 #endif
   try {
 #ifdef HAVE_ONEDAL
     const auto base_model = g_ml_models.getModel(model_name);
     const auto rand_forest_model =
         std::dynamic_pointer_cast<AbstractRandomForestModel>(base_model);
     if (!rand_forest_model) {
       throw std::runtime_error("Model is not of type random forest.");
     }
     const auto& variable_importance_scores =
         rand_forest_model->getVariableImportanceScores();
     const int64_t num_features = variable_importance_scores.size();
     mgr.set_output_row_size(num_features);
     if (num_features == 0) {
       return mgr.ERROR_MESSAGE("Variable importance not computed for this model.");
     }
     if (num_features != rand_forest_model->getNumFeatures()) {
       return mgr.ERROR_MESSAGE(
           "Mismatch in number of features and number of variable importance metrics.");
     }
     const auto num_logical_features = rand_forest_model->getNumLogicalFeatures();
     std::vector<std::string> feature_names =
         get_model_features(model_name, rand_forest_model);
 
     int64_t physical_feature_idx = 0;
     const auto& cat_feature_keys = rand_forest_model->getCatFeatureKeys();
     const auto num_cat_features = rand_forest_model->getNumCatFeatures();
     for (int64_t feature_idx = 0; feature_idx < num_logical_features; ++feature_idx) {
       // Make feature ids start at 1, not 0
       if (feature_idx < num_cat_features) {
         const auto& col_cat_feature_keys = cat_feature_keys[feature_idx];
         int64_t sub_feature_idx = 1;
         for (const auto& col_cat_feature_key : col_cat_feature_keys) {
           feature_id[physical_feature_idx] = feature_idx + 1;
           if (feature_names[feature_idx].empty()) {
             feature[physical_feature_idx] = inline_null_value<TextEncodingDict>();
           } else {
             feature[physical_feature_idx] =
                 feature.getOrAddTransient(feature_names[feature_idx]);
           }
           sub_feature_id[physical_feature_idx] = sub_feature_idx++;
           const TextEncodingDict feature_sub_key =
               sub_feature.getOrAddTransient(col_cat_feature_key);
           sub_feature[physical_feature_idx] = feature_sub_key;
           importance_score[physical_feature_idx] =
               variable_importance_scores[physical_feature_idx];
           physical_feature_idx++;
         }
       } else {
         feature_id[physical_feature_idx] = feature_idx + 1;
         if (feature_names[feature_idx].empty()) {
           feature[physical_feature_idx] = inline_null_value<TextEncodingDict>();
         } else {
           feature[physical_feature_idx] =
               feature.getOrAddTransient(feature_names[feature_idx]);
         }
         sub_feature_id[physical_feature_idx] = 1;
         sub_feature[physical_feature_idx] = inline_null_value<TextEncodingDict>();
         importance_score[physical_feature_idx] =
             variable_importance_scores[physical_feature_idx];
         physical_feature_idx++;
       }
     }
     return num_features;
 #endif
   } catch (std::runtime_error& e) {
     return mgr.ERROR_MESSAGE(e.what());
   }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

EXTENSION_NOINLINE_HOST int32_t random_forest_reg_var_importance__cpu_2	(	TableFunctionManager &	mgr,
		const Column< TextEncodingDict > &	model_name,
		Column< int64_t > &	feature_id,
		Column< TextEncodingDict > &	feature,
		Column< int64_t > &	sub_feature_id,
		Column< TextEncodingDict > &	sub_feature,
		Column< double > &	importance_score
	)

Definition at line 255 of file MLTableFunctions.cpp.

References Column< TextEncodingDict >::getString(), random_forest_reg_var_importance__cpu_1(), and Column< TextEncodingDict >::size().

                                                                           {
   if (model_name.size() != 1) {
     return mgr.ERROR_MESSAGE("Expected only one row in model name CURSOR.");
   }
   TextEncodingNone model_name_text_enc_none(mgr, model_name.getString(0));
   return random_forest_reg_var_importance__cpu_1(mgr,
                                                  model_name_text_enc_none,
                                                  feature_id,
                                                  feature,
                                                  sub_feature_id,
                                                  sub_feature,
                                                  importance_score);
 }

Here is the call graph for this function:

EXTENSION_NOINLINE_HOST int32_t supported_ml_frameworks__cpu_	(	TableFunctionManager &	mgr,
		Column< TextEncodingDict > &	output_ml_frameworks,
		Column< bool > &	output_availability,
		Column< bool > &	output_default
	)

Definition at line 8 of file MLTableFunctions.cpp.

References StringDictionaryProxy::getOrAddTransientBulk(), TableFunctionManager::set_output_row_size(), and Column< TextEncodingDict >::string_dict_proxy_.

                                                                     {
   const std::vector<std::string> ml_frameworks = {"oneapi", "onedal", "mlpack"};
   const int32_t num_frameworks = ml_frameworks.size();
   mgr.set_output_row_size(num_frameworks);
   const std::vector<int32_t> ml_framework_string_ids =
       output_ml_frameworks.string_dict_proxy_->getOrAddTransientBulk(ml_frameworks);
 
 #if defined(HAVE_ONEDAL) || defined(HAVE_MLPACK)
   bool found_available_framework = false;
   auto framework_found_actions = [&output_availability,
                                   &output_default,
                                   &found_available_framework](const int64_t out_row_idx) {
     output_availability[out_row_idx] = true;
     if (!found_available_framework) {
       output_default[out_row_idx] = true;
       found_available_framework = true;
     } else {
       output_default[out_row_idx] = false;
     }
   };
 #endif
 
 #if !defined(HAVE_ONEDAL) || !defined(HAVE_MLPACK)
   auto framework_not_found_actions = [&output_availability,
                                       &output_default](const int64_t out_row_idx) {
     output_availability[out_row_idx] = false;
     output_default[out_row_idx] = false;
   };
 #endif
 
   for (int32_t out_row_idx = 0; out_row_idx < num_frameworks; ++out_row_idx) {
     output_ml_frameworks[out_row_idx] = ml_framework_string_ids[out_row_idx];
     if (ml_frameworks[out_row_idx] == "onedal" ||
         ml_frameworks[out_row_idx] == "oneapi") {
 #ifdef HAVE_ONEDAL
       framework_found_actions(out_row_idx);
 #else
       framework_not_found_actions(out_row_idx);
 #endif
     } else if (ml_frameworks[out_row_idx] == "mlpack") {
 #ifdef HAVE_MLPACK
       framework_found_actions(out_row_idx);
 #else
       framework_not_found_actions(out_row_idx);
 #endif
     }
   }
   return num_frameworks;
 }

Here is the call graph for this function:

Classes

Functions

Function Documentation