OmniSciDB  72c90bc290
TreeModelPredictionMgr.cpp
/*
 * Copyright 2023 HEAVY.AI, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "TreeModelPredictionMgr.h"
#include "CodeGenerator.h"

#ifdef HAVE_CUDA
#include "DataMgr/Allocators/CudaAllocator.h"  // CudaAllocator::allocGpuAbstractBuffer
#include "GpuMemUtils.h"
#endif  // HAVE_CUDA
#include "Parser/ParserNode.h"

#include <tbb/parallel_for.h>

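// TreeModelPredictionMgr owns the flattened decision-tree table and its
// per-tree offsets buffer, materializes them on the host (and, for GPU
// execution, on every device), and generates the LLVM IR that invokes the
// row-level prediction function at query time.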
TreeModelPredictionMgr::TreeModelPredictionMgr(
    const Data_Namespace::MemoryLevel memory_level,
    Executor* executor,
    const std::vector<std::vector<DecisionTreeEntry>>& decision_trees,
    const std::vector<int64_t>& decision_tree_offsets,
    const bool compute_avg)
    : memory_level_(memory_level)
    , executor_(executor)
    , data_mgr_(executor->getDataMgr())
    , device_count_(executor->deviceCount(memory_level == Data_Namespace::GPU_LEVEL
                                              ? ExecutorDeviceType::GPU
                                              : ExecutorDeviceType::CPU))
    , num_trees_(decision_trees.size())
    , compute_avg_(compute_avg) {
#ifdef HAVE_CUDA
  CHECK(memory_level_ == Data_Namespace::CPU_LEVEL ||
        memory_level_ == Data_Namespace::GPU_LEVEL);
#else
  CHECK_EQ(memory_level_, Data_Namespace::CPU_LEVEL);
#endif  // HAVE_CUDA
  allocateAndPopulateHostBuffers(decision_trees, decision_tree_offsets);
  createKernelBuffers();
}

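// The device buffers are allocated through DataMgr in createKernelBuffers(),
// so they must be returned to DataMgr here. The host buffers are owned by the
// executor's RowSetMemoryOwner and need no explicit release.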
TreeModelPredictionMgr::~TreeModelPredictionMgr() {
  for (auto* buffer : decision_tree_table_device_buffers_) {
    CHECK(buffer);
    data_mgr_->free(buffer);
  }
  for (auto* buffer : decision_tree_offsets_device_buffers_) {
    CHECK(buffer);
    data_mgr_->free(buffer);
  }
}

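// Lays all trees out in one contiguous host table, with a companion offsets
// buffer of num_trees + 1 entries marking where each tree starts (in
// DecisionTreeEntry units) and, in its last slot, the total entry count.
// For example, two trees with 3 and 2 entries give offsets {0, 3, 5} and a
// table of 5 * sizeof(DecisionTreeEntry) bytes, with tree 1 beginning at
// byte offset 3 * sizeof(DecisionTreeEntry).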
void TreeModelPredictionMgr::allocateAndPopulateHostBuffers(
    const std::vector<std::vector<DecisionTreeEntry>>& decision_trees,
    const std::vector<int64_t>& decision_tree_offsets) {
  auto timer = DEBUG_TIMER(__func__);
  const size_t num_trees = decision_trees.size();
  CHECK_EQ(num_trees, static_cast<size_t>(num_trees_));
  CHECK_EQ(num_trees, decision_tree_offsets.size() - 1);
  const size_t num_tree_entries = decision_tree_offsets[num_trees];
  decision_tree_table_size_bytes_ = num_tree_entries * sizeof(DecisionTreeEntry);
  decision_tree_offsets_size_bytes_ = decision_tree_offsets.size() * sizeof(size_t);
  host_decision_tree_table_ =
      executor_->getRowSetMemoryOwner()->allocate(decision_tree_table_size_bytes_);
  host_decision_tree_offsets_ =
      executor_->getRowSetMemoryOwner()->allocate(decision_tree_offsets_size_bytes_);
  // Take this opportunity to copy the offsets buffer over
  std::memcpy(host_decision_tree_offsets_,
              reinterpret_cast<const int8_t*>(decision_tree_offsets.data()),
              decision_tree_offsets_size_bytes_);

  // Copy each tree's entries into its slice of the flattened table in parallel
  tbb::parallel_for(
      tbb::blocked_range<size_t>(0, num_trees),
      [&](const tbb::blocked_range<size_t>& r) {
        const auto start_tree_idx = r.begin();
        const auto end_tree_idx = r.end();
        for (size_t tree_idx = start_tree_idx; tree_idx < end_tree_idx; ++tree_idx) {
          std::memcpy(host_decision_tree_table_ +
                          decision_tree_offsets[tree_idx] * sizeof(DecisionTreeEntry),
                      reinterpret_cast<const int8_t*>(decision_trees[tree_idx].data()),
                      decision_trees[tree_idx].size() * sizeof(DecisionTreeEntry));
        }
      });
}

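// Publishes the host buffers to each execution device. On GPU, per-device
// copies of the table and offsets buffers are allocated and transferred;
// on CPU, the kernel buffers simply alias the host buffers.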
void TreeModelPredictionMgr::createKernelBuffers() {
  auto timer = DEBUG_TIMER(__func__);
#ifdef HAVE_CUDA
  if (memory_level_ == Data_Namespace::GPU_LEVEL) {
    for (int device_id = 0; device_id < device_count_; ++device_id) {
      decision_tree_table_device_buffers_.emplace_back(
          CudaAllocator::allocGpuAbstractBuffer(
              data_mgr_, decision_tree_table_size_bytes_, device_id));
      decision_tree_offsets_device_buffers_.emplace_back(
          CudaAllocator::allocGpuAbstractBuffer(
              data_mgr_, decision_tree_offsets_size_bytes_, device_id));
      auto decision_tree_table_device_buffer = reinterpret_cast<const int8_t*>(
          decision_tree_table_device_buffers_.back()->getMemoryPtr());
      auto decision_tree_offsets_device_buffer = reinterpret_cast<const int8_t*>(
          decision_tree_offsets_device_buffers_.back()->getMemoryPtr());
      copy_to_nvidia_gpu(
          data_mgr_,
          reinterpret_cast<CUdeviceptr>(decision_tree_table_device_buffer),
          reinterpret_cast<const int8_t*>(host_decision_tree_table_),
          decision_tree_table_size_bytes_,
          device_id);
      copy_to_nvidia_gpu(
          data_mgr_,
          reinterpret_cast<CUdeviceptr>(decision_tree_offsets_device_buffer),
          reinterpret_cast<const int8_t*>(host_decision_tree_offsets_),
          decision_tree_offsets_size_bytes_,
          device_id);
      kernel_decision_tree_tables_.push_back(decision_tree_table_device_buffer);
      kernel_decision_tree_offsets_.push_back(decision_tree_offsets_device_buffer);
    }
  }
#else
  CHECK_EQ(1, device_count_);
#endif
  if (memory_level_ == Data_Namespace::CPU_LEVEL) {
    kernel_decision_tree_tables_.push_back(host_decision_tree_table_);
    kernel_decision_tree_offsets_.push_back(host_decision_tree_offsets_);
  }
}

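// Wraps each raw kernel buffer pointer in an Analyzer integer Constant so the
// code generator can either hoist it into the per-device literal buffer or,
// in the single-device case, embed it directly in the generated code.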
std::pair<std::vector<std::shared_ptr<const Analyzer::Constant>>,
          std::vector<const Analyzer::Constant*>>
generate_kernel_buffer_constants(CgenState* cgen_state_ptr,
                                 const std::vector<const int8_t*>& kernel_buffers,
                                 const bool hoist_literals) {
  std::vector<std::shared_ptr<const Analyzer::Constant>> kernel_buffer_constants_owned;
  std::vector<const Analyzer::Constant*> kernel_buffer_constants;
  for (const auto kernel_buffer : kernel_buffers) {
    const int64_t kernel_buffer_handle = reinterpret_cast<int64_t>(kernel_buffer);
    const auto kernel_buffer_handle_literal =
        std::dynamic_pointer_cast<Analyzer::Constant>(
            Parser::IntLiteral::analyzeValue(kernel_buffer_handle));
    CHECK_EQ(kENCODING_NONE,
             kernel_buffer_handle_literal->get_type_info().get_compression());
    kernel_buffer_constants_owned.push_back(kernel_buffer_handle_literal);
    kernel_buffer_constants.push_back(kernel_buffer_handle_literal.get());
  }
  CHECK_GE(kernel_buffer_constants.size(), 1UL);
  // Without literal hoisting, only a single shared constant can be embedded
  CHECK(hoist_literals || kernel_buffer_constants.size() == 1UL);

  return std::make_pair(kernel_buffer_constants_owned, kernel_buffer_constants);
}

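// Generates the IR for one row's prediction: the regressor values are stored
// into a stack-allocated double array, then a call to the runtime function
// tree_model_reg_predict is emitted with the (per-device) table and offsets
// buffer handles, the regressor and tree counts, the averaging flag, and the
// double null sentinel (inline_fp_null_value<double>()).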
llvm::Value* TreeModelPredictionMgr::codegen(
    const std::vector<llvm::Value*>& regressor_inputs,
    const CompilationOptions& co) const {
  CHECK(kernel_decision_tree_tables_.size() == static_cast<size_t>(device_count_));
  if (!co.hoist_literals && kernel_decision_tree_tables_.size() > 1UL) {
    // Without hoisted literals only one buffer handle can be embedded in the
    // generated code, which cannot cover multiple devices
    throw QueryMustRunOnCpu();
  }

  auto cgen_state_ptr = executor_->getCgenStatePtr();
  AUTOMATIC_IR_METADATA(cgen_state_ptr);

  const auto [decision_tree_table_constants_owned, decision_tree_table_constants] =
      generate_kernel_buffer_constants(
          cgen_state_ptr, kernel_decision_tree_tables_, co.hoist_literals);

  const auto [decision_tree_offsets_constants_owned, decision_tree_offsets_constants] =
      generate_kernel_buffer_constants(
          cgen_state_ptr, kernel_decision_tree_offsets_, co.hoist_literals);

  CodeGenerator code_generator(executor_);

  const auto decision_tree_table_handle_lvs =
      co.hoist_literals
          ? code_generator.codegenHoistedConstants(
                decision_tree_table_constants, kENCODING_NONE, {})
          : code_generator.codegen(decision_tree_table_constants[0], false, co);

  const auto decision_tree_offsets_handle_lvs =
      co.hoist_literals
          ? code_generator.codegenHoistedConstants(
                decision_tree_offsets_constants, kENCODING_NONE, {})
          : code_generator.codegen(decision_tree_offsets_constants[0], false, co);

  auto& builder = cgen_state_ptr->ir_builder_;
  const int32_t num_regressors = static_cast<int32_t>(regressor_inputs.size());
  auto regressor_ty = llvm::Type::getDoubleTy(cgen_state_ptr->context_);
  llvm::ArrayType* regressor_arr_type =
      llvm::ArrayType::get(regressor_ty, num_regressors);
  auto regressor_local_storage_lv =
      builder.CreateAlloca(regressor_arr_type, nullptr, "Regressor_Local_Storage");
  auto idx_lv = cgen_state_ptr->llInt(0);
  auto regressor_local_storage_gep = llvm::GetElementPtrInst::CreateInBounds(
      regressor_local_storage_lv->getType()->getScalarType()->getPointerElementType(),
      regressor_local_storage_lv,
      {idx_lv, idx_lv},
      "",
      builder.GetInsertBlock());
  // Store each regressor input into its slot of the local array
  for (int32_t reg_idx = 0; reg_idx < num_regressors; ++reg_idx) {
    auto reg_ptr = builder.CreateGEP(
        regressor_local_storage_lv->getType()->getScalarType()->getPointerElementType(),
        regressor_local_storage_lv,
        {cgen_state_ptr->llInt(0), cgen_state_ptr->llInt(reg_idx)},
        "");
    builder.CreateStore(regressor_inputs[reg_idx], reg_ptr);
  }
  const double translated_null_value = inline_fp_null_value<double>();

  return cgen_state_ptr->emitCall(
      "tree_model_reg_predict",
      {regressor_local_storage_gep,
       cgen_state_ptr->castToTypeIn(decision_tree_table_handle_lvs.front(), 64),
       cgen_state_ptr->castToTypeIn(decision_tree_offsets_handle_lvs.front(), 64),
       cgen_state_ptr->llInt(num_regressors),
       cgen_state_ptr->llInt(num_trees_),
       cgen_state_ptr->llBool(compute_avg_),
       cgen_state_ptr->llFp(translated_null_value)});
}