OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MLPredictCodegen.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2023 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CodeGenerator.h"
19 #include "TreeModelPredictionMgr.h"
20 
21 #ifdef HAVE_CUDA
23 #include "GpuMemUtils.h"
24 #endif // HAVE_CUDA
25 
26 #include <tbb/parallel_for.h>
27 #include <stack>
28 #include <vector>
29 
30 std::vector<std::shared_ptr<Analyzer::Expr>> generated_encoded_and_casted_features(
31  const std::vector<std::shared_ptr<Analyzer::Expr>>& feature_exprs,
32  const std::vector<std::vector<std::string>>& cat_feature_keys,
33  const std::vector<int64_t>& feature_permutations,
34  Executor* executor) {
35  std::vector<std::shared_ptr<Analyzer::Expr>> casted_feature_exprs;
36  const size_t num_feature_exprs = feature_exprs.size();
37  const size_t num_cat_features = cat_feature_keys.size();
38 
39  if (num_cat_features > num_feature_exprs) {
40  throw std::runtime_error("More categorical keys than features.");
41  }
42 
43  auto get_int_constant_expr = [](int32_t const_val) {
44  Datum d;
45  d.intval = const_val;
46  return makeExpr<Analyzer::Constant>(SQLTypeInfo(kINT, false), false, d);
47  };
48 
49  for (size_t original_feature_idx = 0; original_feature_idx < num_feature_exprs;
50  ++original_feature_idx) {
51  const auto feature_idx = feature_permutations.empty()
52  ? original_feature_idx
53  : feature_permutations[original_feature_idx];
54  auto& feature_expr = feature_exprs[feature_idx];
55  const auto& feature_ti = feature_expr->get_type_info();
56  if (feature_ti.is_number()) {
57  // Don't conditionally cast to double iff type is not double
58  // as this was causing issues for the random forest function with
59  // mixed types. Need to troubleshoot more but always casting to double
60  // regardless of the underlying type always seems to be safe
61  casted_feature_exprs.emplace_back(makeExpr<Analyzer::UOper>(
62  SQLTypeInfo(kDOUBLE, false), false, kCAST, feature_expr));
63  } else {
64  CHECK(feature_ti.is_string()) << "Expected text type";
65  if (!feature_ti.is_text_encoding_dict()) {
66  throw std::runtime_error("Expected dictionary-encoded text column.");
67  }
68  if (original_feature_idx >= num_cat_features) {
69  throw std::runtime_error("Model not trained on text type for column.");
70  }
71  const auto& str_dict_key = feature_ti.getStringDictKey();
72  const auto str_dict_proxy = executor->getStringDictionaryProxy(str_dict_key, true);
73  for (const auto& cat_feature_key : cat_feature_keys[original_feature_idx]) {
74  // For one-hot encoded columns, null values will translate as a 0.0 and not a null
75  // We are computing the following:
76  // CASE WHEN str_val is NULL then 0.0 ELSE
77  // CAST(str_id = one_hot_encoded_str_id AS DOUBLE) END
78 
79  // Check if the expression is null
80  auto is_null_expr = makeExpr<Analyzer::UOper>(
81  SQLTypeInfo(kBOOLEAN, false), false, kISNULL, feature_expr);
82  Datum zero_datum;
83  zero_datum.doubleval = 0.0;
84  // If null then emit a 0.0 double constant as the THEN expr
85  auto is_null_then_expr =
86  makeExpr<Analyzer::Constant>(SQLTypeInfo(kDOUBLE, false), false, zero_datum);
87  std::list<
88  std::pair<std::shared_ptr<Analyzer::Expr>, std::shared_ptr<Analyzer::Expr>>>
89  when_then_exprs;
90  when_then_exprs.emplace_back(std::make_pair(is_null_expr, is_null_then_expr));
91  // The rest of/core string test logic goes in the ELSE statement
92  // Get the string id of the one-hot feature
93  const auto str_id = str_dict_proxy->getIdOfString(cat_feature_key);
94  auto str_id_expr = get_int_constant_expr(str_id);
95  // Get integer id for this row's string
96  auto key_for_string_expr = makeExpr<Analyzer::KeyForStringExpr>(feature_expr);
97 
98  // Check if this row's string id is equal to the search one-hot encoded id
99  std::shared_ptr<Analyzer::Expr> str_equality_expr =
100  makeExpr<Analyzer::BinOper>(SQLTypeInfo(kBOOLEAN, false),
101  false,
102  kEQ,
103  kONE,
104  key_for_string_expr,
105  str_id_expr);
106  // Cast the above boolean results to a double, 0.0 or 1.0
107  auto cast_expr = makeExpr<Analyzer::UOper>(
108  SQLTypeInfo(kDOUBLE, false), false, kCAST, str_equality_expr);
109 
110  // Generate the full CASE statement and add to the casted feature exprssions
111  casted_feature_exprs.emplace_back(makeExpr<Analyzer::CaseExpr>(
112  SQLTypeInfo(kDOUBLE, false), false, when_then_exprs, cast_expr));
113  }
114  }
115  }
116  return casted_feature_exprs;
117 }
118 
// Code-generates ML_PREDICT for a linear regression model: builds an Analyzer
// expression tree for y = b0 + b1*x1 + ... + bn*xn over the encoded/casted
// regressors, then lowers the tree to LLVM IR via codegenArith.
// NOTE(review): this listing omits original source lines 119 and 124
// (presumably the `codegenLinRegPredict` signature and an
// AUTOMATIC_IR_METADATA guard) — consult the repository file before editing.
120  const Analyzer::MLPredictExpr* expr,
121  const std::string& model_name,
122  const std::shared_ptr<AbstractMLModel>& abstract_model,
123  const CompilationOptions& co) {
125  const auto linear_reg_model =
126  std::dynamic_pointer_cast<LinearRegressionModel>(abstract_model);
127  // The parent codegen function called this function `codegenLinRegPredict`
128  // iff we had MLModelType::LINEAR_REG_PREDICT, so below is just a sanity
129  // check
130  CHECK(linear_reg_model);
131  const auto& model_coefs = linear_reg_model->getCoefs();
132  const auto& cat_feature_keys = linear_reg_model->getCatFeatureKeys();
133 
134  const auto& regressor_exprs = expr->get_regressor_values();
135 
// Cast numeric regressors to DOUBLE and one-hot encode categorical ones,
// honoring any feature permutation recorded in the model metadata.
136  const auto casted_regressor_exprs = generated_encoded_and_casted_features(
137  regressor_exprs,
138  cat_feature_keys,
139  linear_reg_model->getModelMetadata().getFeaturePermutations(),
140  executor());
141 
// Helper: wraps a double literal in an Analyzer constant node.
142  auto get_double_constant_expr = [](double const_val) {
143  Datum d;
144  d.doubleval = const_val;
145  return makeExpr<Analyzer::Constant>(SQLTypeInfo(kDOUBLE, false), false, d);
146  };
147 
148  std::shared_ptr<Analyzer::Expr> result;
149 
150  // Linear regression models are of the form
151  // y = b0 + b1*x1 + b2*x2 + ... + bn*xn
152  // Where b0 is the constant y-intercept, x1..xn are the dependent
153  // varabiles (aka regressors or predictors), and b1..bn are the
154  // regression coefficients
155 
// NOTE(review): the loop indexes casted_regressor_exprs[model_coef_idx - 1],
// so it assumes model_coefs.size() == casted_regressor_exprs.size() + 1 —
// confirm the trained model guarantees this invariant.
156  for (size_t model_coef_idx = 0; model_coef_idx < model_coefs.size(); ++model_coef_idx) {
157  auto coef_value_expr = get_double_constant_expr(model_coefs[model_coef_idx]);
158  if (model_coef_idx == size_t(0)) {
159  // We have the y-intercept b0, this is not multiplied by any regressor
160  result = coef_value_expr;
161  } else {
162  // We have a term with a regressor (xi) and regression coefficient (bi)
163  const auto& casted_regressor_expr = casted_regressor_exprs[model_coef_idx - 1];
164  // Multiply regressor by coefficient
165  auto mul_expr = makeExpr<Analyzer::BinOper>(SQLTypeInfo(kDOUBLE, false),
166  false,
167  kMULTIPLY,
168  kONE,
169  coef_value_expr,
170  casted_regressor_expr);
171  // Add term to result
172  result = makeExpr<Analyzer::BinOper>(
173  SQLTypeInfo(kDOUBLE, false), false, kPLUS, kONE, result, mul_expr);
174  }
175  }
176 
177  // The following will codegen the expression tree we just created modeling
178  // the linear regression formula
// NOTE(review): if model_coefs has exactly one entry, `result` is a Constant
// and the dynamic_cast below yields nullptr — presumably codegenArith
// tolerates or CHECKs this; verify.
179  return codegenArith(dynamic_cast<Analyzer::BinOper*>(result.get()), co);
180 }
181 
// Code-generates ML_PREDICT for tree-ensemble regression models: codegens the
// casted regressors, flattens each decision tree into a vector of
// DecisionTreeEntry (in parallel via TBB), computes per-tree global offsets,
// rewrites child indices to be absolute, and delegates the traversal kernel
// codegen to TreeModelPredictionMgr. Requires a OneDAL build; otherwise throws.
// NOTE(review): this listing omits original source lines 182-183, 249, and
// 280-281 (the function signature, a parallel_for call line, and two
// TreeModelPredictionMgr constructor arguments) — consult the repository file
// before editing.
183  const Analyzer::MLPredictExpr* expr,
184  const std::string& model_name,
185  const std::shared_ptr<AbstractMLModel>& model,
186  const CompilationOptions& co) {
187 #ifdef HAVE_ONEDAL
188  const auto tree_model = std::dynamic_pointer_cast<AbstractTreeModel>(model);
189  // The parent codegen function called this function `codegenTreeRegPredict`
190  // iff we a tree reg MLModelType, so below is just a sanity
191  // check
192  CHECK(tree_model);
193  const int64_t num_trees = static_cast<int64_t>(tree_model->getNumTrees());
194  const auto& regressor_exprs = expr->get_regressor_values();
195  const auto& cat_feature_keys = tree_model->getCatFeatureKeys();
196  const auto casted_regressor_exprs = generated_encoded_and_casted_features(
197  regressor_exprs,
198  cat_feature_keys,
199  tree_model->getModelMetadata().getFeaturePermutations(),
200  executor());
201  // We cast all regressors to double for simplicity and to match
202  // how feature filters are stored in the tree model.
203  // Null checks are handled further down in the generated kernel
204  // in the runtime function itself
205 
// Lower each casted regressor to an llvm::Value for the prediction kernel.
206  std::vector<llvm::Value*> regressor_values;
207  for (const auto& casted_regressor_expr : casted_regressor_exprs) {
208  regressor_values.emplace_back(codegen(casted_regressor_expr.get(), false, co)[0]);
209  }
210 
211  // First build tables, i.e. vectors of DecisionTreeEntry, for each tree
212  std::vector<std::vector<DecisionTreeEntry>> decision_trees(num_trees);
213  {
214  auto tree_build_timer = DEBUG_TIMER("Tree Visitors Dispatched");
// Trees are independent, so flatten them in parallel; each iteration writes
// only its own decision_trees[tree_idx] slot.
215  tbb::parallel_for(tbb::blocked_range<int64_t>(0, num_trees),
216  [&](const tbb::blocked_range<int64_t>& r) {
217  const auto start_tree_idx = r.begin();
218  const auto end_tree_idx = r.end();
219  for (int64_t tree_idx = start_tree_idx; tree_idx < end_tree_idx;
220  ++tree_idx) {
221  TreeModelVisitor tree_visitor(decision_trees[tree_idx]);
222  tree_model->traverseDF(tree_idx, tree_visitor);
223  }
224  });
225  }
226 
227  // Next, compute prefix-sum offset such that decision_tree_offsets[k]
228  // specifies the starting offset of tree k relative to tree 0, and
229  // decision_tree_offsets[k+1] specifies the last entry + 1 of tree
230  // k relative to tree 0
231  std::vector<int64_t> decision_tree_offsets(num_trees + 1);
232  decision_tree_offsets[0] = 0;
233  for (int64_t tree_idx = 0; tree_idx < num_trees; ++tree_idx) {
234  decision_tree_offsets[tree_idx + 1] =
235  decision_tree_offsets[tree_idx] +
236  static_cast<int64_t>(decision_trees[tree_idx].size());
237  }
238 
239  VLOG(1) << tree_model->getModelTypeString() << " model has " << num_trees
240  << " trees and " << decision_tree_offsets[num_trees] << " total entries.";
241 
242  // Finally, go back through each tree and adjust all left and right child idx entries
243  // such that such values are global relative to the start of tree 0. This will allow
244  // the downstream code-generated kernel to be able treat these child idx entries as
245  // as absolute offsets from the base pointer for all trees, rather than computing such
246  // an offset on the fly
247  {
248  auto tree_offset_correction_timer = DEBUG_TIMER("Tree Offsets Corrected");
// Range starts at 1: tree 0 has offset 0, so its indices need no correction.
250  tbb::blocked_range<int64_t>(1, num_trees),
251  [&](const tbb::blocked_range<int64_t>& r) {
252  const auto start_tree_idx = r.begin();
253  const auto end_tree_idx = r.end();
254  for (int64_t tree_idx = start_tree_idx; tree_idx < end_tree_idx; ++tree_idx) {
255  const int64_t start_offset = decision_tree_offsets[tree_idx];
256  auto& decision_tree = decision_trees[tree_idx];
257  const int64_t num_tree_entries = static_cast<int64_t>(decision_tree.size());
258  CHECK_EQ(num_tree_entries,
259  decision_tree_offsets[tree_idx + 1] - start_offset);
260  for (int64_t decision_entry_idx = 0; decision_entry_idx < num_tree_entries;
261  ++decision_entry_idx) {
// Only split (non-leaf) nodes carry child indices to rebase.
262  if (decision_tree[decision_entry_idx].isSplitNode()) {
263  decision_tree[decision_entry_idx].left_child_row_idx += start_offset;
264  decision_tree[decision_entry_idx].right_child_row_idx += start_offset;
265  }
266  }
267  }
268  });
269  }
270 
271  {
272  auto tree_model_prediction_mgr_timer =
273  DEBUG_TIMER("TreeModelPredictionMgr generation and codegen");
274  // TreeModelPredictionMgr copies the decision trees and offsets to host
275  // buffers in RowSetMemoryOwner and onto each GPU if the query is running
276  // on GPU, and takes care of the tree traversal codegen itself
277 
// Random forests average per-tree outputs; presumably other tree models
// (e.g. GBTs) sum them instead — confirm in TreeModelPredictionMgr.
278  const bool compute_avg = tree_model->getModelType() == MLModelType::RANDOM_FOREST_REG;
279  auto tree_model_prediction_mgr = std::make_unique<TreeModelPredictionMgr>(
282  executor(),
283  decision_trees,
284  decision_tree_offsets,
285  compute_avg);
286 
// Ownership of the manager moves to cgen_state_ so it outlives compilation.
287  return cgen_state_->moveTreeModelPredictionMgr(std::move(tree_model_prediction_mgr))
288  ->codegen(regressor_values, co);
289  }
290 #else
291  throw std::runtime_error("OneDAL not available.");
292 #endif
293 }
294 
// Entry point for ML_PREDICT codegen: extracts the model name from the
// constant model expression, resolves the model via g_ml_models, validates
// that the number of supplied regressors matches the model's logical feature
// count, and dispatches to the model-type-specific codegen routine.
// NOTE(review): this listing omits original source lines 295 (the function
// signature) and 320/323-325 (the switch `case` labels for the linear
// regression and tree regression model types) — consult the repository file
// before editing.
296  const CompilationOptions& co) {
297  auto timer = DEBUG_TIMER(__func__);
// The model is referenced by name, carried as a constant string expression.
298  const auto& model_expr = expr->get_model_value();
299  CHECK(model_expr);
300  auto model_constant_expr = dynamic_cast<const Analyzer::Constant*>(model_expr);
301  CHECK(model_constant_expr);
302  const auto model_datum = model_constant_expr->get_constval();
303  const auto model_name_ptr = model_datum.stringval;
304  CHECK(model_name_ptr);
305  const auto model_name = *model_name_ptr;
306  const auto abstract_model = g_ml_models.getModel(model_name);
307  const auto model_type = abstract_model->getModelType();
308  const auto& regressor_exprs = expr->get_regressor_values();
// Reject calls whose argument count does not match the trained model.
309  if (abstract_model->getNumLogicalFeatures() !=
310  static_cast<int64_t>(regressor_exprs.size())) {
311  std::ostringstream error_oss;
312  error_oss << "ML_PREDICT: Model '" << model_name
313  << "' expects different number of predictor variables ("
314  << abstract_model->getNumLogicalFeatures() << ") than provided ("
315  << regressor_exprs.size() << ").";
316  throw std::runtime_error(error_oss.str());
317  }
318 
// Dispatch on model type; unsupported types are rejected below.
319  switch (model_type) {
321  return codegenLinRegPredict(expr, model_name, abstract_model, co);
322  }
326  return codegenTreeRegPredict(expr, model_name, abstract_model, co);
327  }
328  default: {
329  throw std::runtime_error("Unsupported model type.");
330  }
331  }
332 }
333 
// Code-generates PCA_PROJECT: resolves and validates the PCA model and the
// requested principal-component dimension, then builds an expression tree
// that z-scores each feature ((x - mean) / std_dev), scales it by the
// matching eigenvector component, and sums the terms; the tree is lowered
// via codegenArith.
// NOTE(review): this listing omits the original signature lines (334-335) —
// consult the repository file before editing.
336  auto timer = DEBUG_TIMER(__func__);
// The model is referenced by name, carried as a constant string expression.
337  const auto& model_expr = expr->get_model_value();
338  CHECK(model_expr);
339  auto model_constant_expr = dynamic_cast<const Analyzer::Constant*>(model_expr);
340  CHECK(model_constant_expr);
341  const auto model_datum = model_constant_expr->get_constval();
342  const auto model_name_ptr = model_datum.stringval;
343  CHECK(model_name_ptr);
344  const auto model_name = *model_name_ptr;
345  const auto abstract_model = g_ml_models.getModel(model_name);
346  const auto model_type = abstract_model->getModelType();
347  if (model_type != MLModelType::PCA) {
348  throw std::runtime_error("PCA_PROJECT: Model '" + model_name +
349  "' is not a PCA model.");
350  }
351  const auto pca_model = std::dynamic_pointer_cast<PcaModel>(abstract_model);
352  const auto& feature_exprs = expr->get_feature_values();
// Reject calls whose argument count does not match the trained model.
353  if (pca_model->getNumLogicalFeatures() != static_cast<int64_t>(feature_exprs.size())) {
354  std::ostringstream error_oss;
355  error_oss << "PCA_PROJECT: Model '" << model_name
356  << "' expects different number of predictor variables ("
357  << pca_model->getNumLogicalFeatures() << ") than provided ("
358  << feature_exprs.size() << ").";
359  throw std::runtime_error(error_oss.str());
360  }
361 
362  const auto& pc_dimension_expr = expr->get_pc_dimension_value();
363  auto pc_dimension_const_expr =
364  dynamic_cast<const Analyzer::Constant*>(pc_dimension_expr);
// NOTE(review): unlike model_constant_expr above, pc_dimension_const_expr is
// dereferenced without a CHECK — a non-constant dimension argument would
// null-deref here; consider adding CHECK(pc_dimension_const_expr).
365  const auto pc_dimension_datum = pc_dimension_const_expr->get_constval();
// The user-facing PC dimension is 1-based; convert to a 0-based index.
366  const auto pc_dimension = pc_dimension_datum.intval - 1;
367  if (pc_dimension < 0 || pc_dimension >= pca_model->getNumFeatures()) {
368  std::ostringstream error_oss;
369  error_oss << "PCA_PROJECT: Invalid PC dimension (" << pc_dimension + 1
370  << ") provided. Valid range is [1, " << pca_model->getNumFeatures() << "].";
371  throw std::runtime_error(error_oss.str());
372  }
373 
// Per-feature standardization statistics and the eigenvector matrix learned
// at training time.
374  const auto& column_means = pca_model->getColumnMeans();
375  const auto& column_std_devs = pca_model->getColumnStdDevs();
376  const auto& eigenvectors = pca_model->getEigenvectors();
377 
378  const auto& cat_feature_keys = pca_model->getCatFeatureKeys();
379 
// Cast numeric features to DOUBLE and one-hot encode categorical ones,
// honoring any feature permutation recorded in the model metadata.
380  const auto casted_feature_exprs = generated_encoded_and_casted_features(
381  feature_exprs,
382  cat_feature_keys,
383  pca_model->getModelMetadata().getFeaturePermutations(),
384  executor());
385 
// Helper: wraps a double literal in an Analyzer constant node.
386  auto get_double_constant_expr = [](double const_val) {
387  Datum d;
388  d.doubleval = const_val;
389  return makeExpr<Analyzer::Constant>(SQLTypeInfo(kDOUBLE, false), false, d);
390  };
391 
392  std::shared_ptr<Analyzer::Expr> result;
393 
// Accumulate sum over features of ((x_i - mean_i) / std_dev_i) * ev_i, where
// ev_i is the pc_dimension-th eigenvector's i-th component.
394  for (size_t feature_idx = 0; feature_idx < feature_exprs.size(); ++feature_idx) {
395  auto mean_expr = get_double_constant_expr(column_means[feature_idx]);
396  const auto& casted_feature_expr = casted_feature_exprs[feature_idx];
397  // Subtract column mean from feature
398  auto mean_diff_expr = makeExpr<Analyzer::BinOper>(
399  SQLTypeInfo(kDOUBLE, false), false, kMINUS, kONE, casted_feature_expr, mean_expr);
400  auto std_dev_expr = get_double_constant_expr(column_std_devs[feature_idx]);
401  auto z_score_expr = makeExpr<Analyzer::BinOper>(
402  SQLTypeInfo(kDOUBLE, false), false, kDIVIDE, kONE, mean_diff_expr, std_dev_expr);
403  auto pc_term_expr = get_double_constant_expr(eigenvectors[pc_dimension][feature_idx]);
404  auto pca_mul_expr = makeExpr<Analyzer::BinOper>(
405  SQLTypeInfo(kDOUBLE, false), false, kMULTIPLY, kONE, z_score_expr, pc_term_expr);
406  if (feature_idx == 0) {
407  // There is no result yet, so set the result to the first term
408  result = pca_mul_expr;
409  } else {
410  // Add the term to the result
411  result = makeExpr<Analyzer::BinOper>(
412  SQLTypeInfo(kDOUBLE, false), false, kPLUS, kONE, result, pca_mul_expr);
413  }
414  }
415 
416  // The following will codegen the expression tree we just created modeling
417  // the PCA projection onto the requested principal component
418  return codegenArith(dynamic_cast<Analyzer::BinOper*>(result.get()), co);
419 }
llvm::Value * codegenLinRegPredict(const Analyzer::MLPredictExpr *, const std::string &model_name, const std::shared_ptr< AbstractMLModel > &model, const CompilationOptions &)
#define CHECK_EQ(x, y)
Definition: Logger.h:301
llvm::Value * codegenArith(const Analyzer::BinOper *, const CompilationOptions &)
CgenState * cgen_state_
Definition: sqldefs.h:48
Definition: sqldefs.h:29
llvm::Value * codegen(const std::vector< llvm::Value * > &regressor_inputs, const CompilationOptions &co) const
Definition: sqldefs.h:40
int32_t intval
Definition: Datum.h:73
std::vector< std::shared_ptr< Analyzer::Expr > > generated_encoded_and_casted_features(const std::vector< std::shared_ptr< Analyzer::Expr >> &feature_exprs, const std::vector< std::vector< std::string >> &cat_feature_keys, const std::vector< int64_t > &feature_permutations, Executor *executor)
const Expr * get_pc_dimension_value() const
Definition: Analyzer.h:792
const TreeModelPredictionMgr * moveTreeModelPredictionMgr(std::unique_ptr< const TreeModelPredictionMgr > &&tree_model_prediction_mgr)
Definition: CgenState.h:205
#define AUTOMATIC_IR_METADATA(CGENSTATE)
std::shared_ptr< AbstractMLModel > getModel(const std::string &model_name) const
Definition: MLModel.h:50
ExecutorDeviceType device_type
MLModelMap g_ml_models
Definition: MLModel.h:124
std::vector< llvm::Value * > codegen(const Analyzer::Expr *, const bool fetch_columns, const CompilationOptions &)
Definition: IRCodegen.cpp:30
Definition: sqldefs.h:39
Definition: sqldefs.h:71
Datum get_constval() const
Definition: Analyzer.h:348
const Expr * get_model_value() const
Definition: Analyzer.h:788
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
llvm::Value * codegenTreeRegPredict(const Analyzer::MLPredictExpr *, const std::string &model_name, const std::shared_ptr< AbstractMLModel > &model, const CompilationOptions &)
const std::vector< std::shared_ptr< Analyzer::Expr > > & get_feature_values() const
Definition: Analyzer.h:789
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:412
const Expr * get_model_value() const
Definition: Analyzer.h:713
Definition: sqltypes.h:72
Allocate GPU memory using GpuBuffers via DataMgr.
const std::vector< std::shared_ptr< Analyzer::Expr > > & get_regressor_values() const
Definition: Analyzer.h:714
Definition: Datum.h:69
#define VLOG(n)
Definition: Logger.h:388
double doubleval
Definition: Datum.h:76
Executor * executor() const