OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
GpuSharedMemoryUtils.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <llvm/Transforms/Utils/Cloning.h>
18 
19 #include "GpuSharedMemoryUtils.h"
20 #include "ResultSetReductionJIT.h"
21 #include "RuntimeFunctions.h"
22 
24  llvm::Module* llvm_module,
25  llvm::LLVMContext& context,
26  const QueryMemoryDescriptor& qmd,
27  const std::vector<TargetInfo>& targets,
28  const std::vector<int64_t>& init_agg_values,
29  const size_t executor_id)
30  : executor_id_(executor_id)
31  , module_(llvm_module)
32  , context_(context)
33  , reduction_func_(nullptr)
34  , init_func_(nullptr)
35  , query_mem_desc_(qmd)
36  , targets_(targets)
37  , init_agg_values_(init_agg_values) {
52 }
53 
55  auto timer = DEBUG_TIMER(__func__);
56 
57  // codegen the init function
62 
63  // codegen the reduction function:
68 }
69 
99  // adding names to input arguments:
100  auto arg_it = reduction_func_->arg_begin();
101  auto dest_buffer_ptr = &*arg_it;
102  dest_buffer_ptr->setName("dest_buffer_ptr");
103  arg_it++;
104  auto src_buffer_ptr = &*arg_it;
105  src_buffer_ptr->setName("src_buffer_ptr");
106  arg_it++;
107  auto buffer_size = &*arg_it;
108  buffer_size->setName("buffer_size");
109 
110  auto bb_entry = llvm::BasicBlock::Create(context_, ".entry", reduction_func_);
111  auto bb_body = llvm::BasicBlock::Create(context_, ".body", reduction_func_);
112  auto bb_exit = llvm::BasicBlock::Create(context_, ".exit", reduction_func_);
113  llvm::IRBuilder<> ir_builder(bb_entry);
114 
115  // synchronize all threads within a threadblock:
116  const auto sync_threadblock = getFunction("sync_threadblock");
117  ir_builder.CreateCall(sync_threadblock, {});
118 
119  const auto func_thread_index = getFunction("get_thread_index");
120  const auto thread_idx = ir_builder.CreateCall(func_thread_index, {}, "thread_index");
121 
122  // branching out of out of bound:
123  const auto entry_count = ll_int(query_mem_desc_.getEntryCount(), context_);
124  const auto entry_count_i32 =
125  ll_int(static_cast<int32_t>(query_mem_desc_.getEntryCount()), context_);
126  const auto is_thread_inbound =
127  ir_builder.CreateICmpSLT(thread_idx, entry_count, "is_thread_inbound");
128  ir_builder.CreateCondBr(is_thread_inbound, bb_body, bb_exit);
129 
130  ir_builder.SetInsertPoint(bb_body);
131 
132  // cast src/dest buffers into byte streams:
133  auto src_byte_stream = ir_builder.CreatePointerCast(
134  src_buffer_ptr, llvm::Type::getInt8PtrTy(context_, 0), "src_byte_stream");
135  const auto dest_byte_stream = ir_builder.CreatePointerCast(
136  dest_buffer_ptr, llvm::Type::getInt8PtrTy(context_, 0), "dest_byte_stream");
137 
138  // running the result set reduction JIT code to get reduce_one_entry_idx function
139  auto fixup_query_mem_desc = ResultSet::fixupQueryMemoryDescriptor(query_mem_desc_);
140  auto rs_reduction_jit = std::make_unique<GpuReductionHelperJIT>(
141  fixup_query_mem_desc,
142  targets_,
144  executor_id_);
145  auto reduction_code = rs_reduction_jit->codegen();
146  CHECK(reduction_code.module);
147  reduction_code.module->setDataLayout(
148  "e-p:64:64:64-i1:8:8-i8:8:8-"
149  "i16:16:16-i32:32:32-i64:64:64-"
150  "f32:32:32-f64:64:64-v16:16:16-"
151  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
152  reduction_code.module->setTargetTriple("nvptx64-nvidia-cuda");
153  llvm::Linker linker(*module_);
154  std::unique_ptr<llvm::Module> owner(reduction_code.module);
155  bool link_error = linker.linkInModule(std::move(owner));
156  CHECK(!link_error);
157 
158  // go through the reduction code and replace all occurances of agg functions
159  // with their _shared counterparts, which are specifically used in GPUs
160  auto reduce_one_entry_func = getFunction("reduce_one_entry");
161  bool agg_func_found = true;
162  while (agg_func_found) {
163  agg_func_found = false;
164  for (auto it = llvm::inst_begin(reduce_one_entry_func);
165  it != llvm::inst_end(reduce_one_entry_func);
166  it++) {
167  if (!llvm::isa<llvm::CallInst>(*it)) {
168  continue;
169  }
170  auto& func_call = llvm::cast<llvm::CallInst>(*it);
171  std::string func_name = func_call.getCalledFunction()->getName().str();
172  if (func_name.length() > 4 && func_name.substr(0, 4) == "agg_") {
173  if (func_name.length() > 7 &&
174  func_name.substr(func_name.length() - 7) == "_shared") {
175  continue;
176  }
177  agg_func_found = true;
178  std::vector<llvm::Value*> args;
179  for (size_t i = 0; i < func_call.getNumOperands() - 1; ++i) {
180  args.push_back(func_call.getArgOperand(i));
181  }
182  auto gpu_agg_func = getFunction(func_name + "_shared");
183  llvm::ReplaceInstWithInst(&func_call,
184  llvm::CallInst::Create(gpu_agg_func, args, ""));
185  break;
186  }
187  }
188  }
189  const auto reduce_one_entry_idx_func = getFunction("reduce_one_entry_idx");
190  CHECK(reduce_one_entry_idx_func);
191 
192  // qmd_handles are only used with count distinct and baseline group by
193  // serialized varlen buffer is only used with SAMPLE on varlen types, which we will
194  // disable for current shared memory support.
195  const auto null_ptr_ll =
196  llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(context_, 0));
197  const auto thread_idx_i32 = ir_builder.CreateCast(
198  llvm::Instruction::CastOps::Trunc, thread_idx, get_int_type(32, context_));
199  ir_builder.CreateCall(reduce_one_entry_idx_func,
200  {dest_byte_stream,
201  src_byte_stream,
202  thread_idx_i32,
203  entry_count_i32,
204  null_ptr_ll,
205  null_ptr_ll,
206  null_ptr_ll},
207  "");
208  ir_builder.CreateBr(bb_exit);
209  llvm::ReturnInst::Create(context_, bb_exit);
210 }
211 
212 namespace {
213 // given a particular destination ptr to the beginning of an entry, this function creates
214 // proper cast for a specific slot index.
215 // it also assumes these pointers are within shared memory address space (3)
216 llvm::Value* codegen_smem_dest_slot_ptr(llvm::LLVMContext& context,
218  llvm::IRBuilder<>& ir_builder,
219  const size_t slot_idx,
220  const TargetInfo& target_info,
221  llvm::Value* dest_byte_stream,
222  llvm::Value* byte_offset) {
223  const auto sql_type = get_compact_type(target_info);
224  const auto slot_bytes = query_mem_desc.getPaddedSlotWidthBytes(slot_idx);
225  auto ptr_type = [&context](const size_t slot_bytes, const SQLTypeInfo& sql_type) {
226  if (slot_bytes == sizeof(int32_t)) {
227  return llvm::Type::getInt32PtrTy(context, /*address_space=*/3);
228  } else {
229  CHECK(slot_bytes == sizeof(int64_t));
230  return llvm::Type::getInt64PtrTy(context, /*address_space=*/3);
231  }
232  UNREACHABLE() << "Invalid slot size encountered: " << std::to_string(slot_bytes);
233  return llvm::Type::getInt32PtrTy(context, /*address_space=*/3);
234  };
235 
236  const auto casted_dest_slot_address = ir_builder.CreatePointerCast(
237  ir_builder.CreateGEP(
238  dest_byte_stream->getType()->getScalarType()->getPointerElementType(),
239  dest_byte_stream,
240  byte_offset),
241  ptr_type(slot_bytes, sql_type),
242  "dest_slot_adr_" + std::to_string(slot_idx));
243  return casted_dest_slot_address;
244 }
245 } // namespace
246 
255  CHECK(init_func_);
256  // similar to the rest of the system, we used fixup QMD to be able to handle reductions
257  // it should be removed in the future.
258  auto fixup_query_mem_desc = ResultSet::fixupQueryMemoryDescriptor(query_mem_desc_);
259  CHECK(!fixup_query_mem_desc.didOutputColumnar());
260  CHECK(fixup_query_mem_desc.hasKeylessHash());
261  CHECK_GE(init_agg_values_.size(), targets_.size());
262 
263  auto bb_entry = llvm::BasicBlock::Create(context_, ".entry", init_func_);
264  auto bb_body = llvm::BasicBlock::Create(context_, ".body", init_func_);
265  auto bb_exit = llvm::BasicBlock::Create(context_, ".exit", init_func_);
266 
267  llvm::IRBuilder<> ir_builder(bb_entry);
268  const auto func_thread_index = getFunction("get_thread_index");
269  const auto thread_idx = ir_builder.CreateCall(func_thread_index, {}, "thread_index");
270 
271  // declare dynamic shared memory:
272  const auto declare_smem_func = getFunction("declare_dynamic_shared_memory");
273  const auto shared_mem_buffer =
274  ir_builder.CreateCall(declare_smem_func, {}, "shared_mem_buffer");
275 
276  const auto entry_count = ll_int(fixup_query_mem_desc.getEntryCount(), context_);
277  const auto is_thread_inbound =
278  ir_builder.CreateICmpSLT(thread_idx, entry_count, "is_thread_inbound");
279  ir_builder.CreateCondBr(is_thread_inbound, bb_body, bb_exit);
280 
281  ir_builder.SetInsertPoint(bb_body);
282  // compute byte offset assigned to this thread:
283  const auto row_size_bytes = ll_int(fixup_query_mem_desc.getRowWidth(), context_);
284  auto byte_offset_ll = ir_builder.CreateMul(row_size_bytes, thread_idx, "byte_offset");
285 
286  const auto dest_byte_stream = ir_builder.CreatePointerCast(
287  shared_mem_buffer, llvm::Type::getInt8PtrTy(context_), "dest_byte_stream");
288 
289  // each thread will be responsible for one
290  const auto& col_slot_context = fixup_query_mem_desc.getColSlotContext();
291  size_t init_agg_idx = 0;
292  for (size_t target_logical_idx = 0; target_logical_idx < targets_.size();
293  ++target_logical_idx) {
294  const auto& target_info = targets_[target_logical_idx];
295  const auto& slots_for_target = col_slot_context.getSlotsForCol(target_logical_idx);
296  for (size_t slot_idx = slots_for_target.front(); slot_idx <= slots_for_target.back();
297  slot_idx++) {
298  const auto slot_size = fixup_query_mem_desc.getPaddedSlotWidthBytes(slot_idx);
299 
300  auto casted_dest_slot_address = codegen_smem_dest_slot_ptr(context_,
301  fixup_query_mem_desc,
302  ir_builder,
303  slot_idx,
304  target_info,
305  dest_byte_stream,
306  byte_offset_ll);
307 
308  llvm::Value* init_value_ll = nullptr;
309  if (slot_size == sizeof(int32_t)) {
310  init_value_ll =
311  ll_int(static_cast<int32_t>(init_agg_values_[init_agg_idx++]), context_);
312  } else if (slot_size == sizeof(int64_t)) {
313  init_value_ll =
314  ll_int(static_cast<int64_t>(init_agg_values_[init_agg_idx++]), context_);
315  } else {
316  UNREACHABLE() << "Invalid slot size encountered.";
317  }
318  ir_builder.CreateStore(init_value_ll, casted_dest_slot_address);
319 
320  // if not the last loop, we compute the next offset:
321  if (slot_idx != (col_slot_context.getSlotCount() - 1)) {
322  byte_offset_ll = ir_builder.CreateAdd(
323  byte_offset_ll, ll_int(static_cast<size_t>(slot_size), context_));
324  }
325  }
326  }
327 
328  ir_builder.CreateBr(bb_exit);
329 
330  ir_builder.SetInsertPoint(bb_exit);
331  // synchronize all threads within a threadblock:
332  const auto sync_threadblock = getFunction("sync_threadblock");
333  ir_builder.CreateCall(sync_threadblock, {});
334  ir_builder.CreateRet(shared_mem_buffer);
335 }
336 
338  std::vector<llvm::Type*> input_arguments;
339  input_arguments.push_back(llvm::Type::getInt64PtrTy(context_));
340  input_arguments.push_back(llvm::Type::getInt64PtrTy(context_));
341  input_arguments.push_back(llvm::Type::getInt32Ty(context_));
342 
343  llvm::FunctionType* ft =
344  llvm::FunctionType::get(llvm::Type::getVoidTy(context_), input_arguments, false);
345  const auto reduction_function = llvm::Function::Create(
346  ft, llvm::Function::ExternalLinkage, "reduce_from_smem_to_gmem", module_);
347  return reduction_function;
348 }
349 
351  std::vector<llvm::Type*> input_arguments;
352  input_arguments.push_back(
353  llvm::Type::getInt64PtrTy(context_)); // a pointer to the buffer
354  input_arguments.push_back(llvm::Type::getInt32Ty(context_)); // buffer size in bytes
355 
356  llvm::FunctionType* ft = llvm::FunctionType::get(
357  llvm::Type::getInt64PtrTy(context_), input_arguments, false);
358  const auto init_function = llvm::Function::Create(
359  ft, llvm::Function::ExternalLinkage, "init_smem_func", module_);
360  return init_function;
361 }
362 
363 llvm::Function* GpuSharedMemCodeBuilder::getFunction(const std::string& func_name) const {
364  const auto function = module_->getFunction(func_name);
365  CHECK(function) << func_name << " is not found in the module.";
366  return function;
367 }
368 
369 namespace {
375 void replace_called_function_with(llvm::Function* main_func,
376  const std::string& target_func_name,
377  llvm::Function* replace_func) {
378  for (auto it = llvm::inst_begin(main_func), e = llvm::inst_end(main_func); it != e;
379  ++it) {
380  if (!llvm::isa<llvm::CallInst>(*it)) {
381  continue;
382  }
383  auto& instruction = llvm::cast<llvm::CallInst>(*it);
384  if (std::string(instruction.getCalledFunction()->getName()) == target_func_name) {
385  std::vector<llvm::Value*> args;
386  for (size_t i = 0; i < instruction.getNumOperands() - 1; ++i) {
387  args.push_back(instruction.getArgOperand(i));
388  }
389  llvm::ReplaceInstWithInst(&instruction,
390  llvm::CallInst::Create(replace_func, args, ""));
391  return;
392  }
393  }
394  UNREACHABLE() << "Target function " << target_func_name << " was not found in "
395  << replace_func->getName().str();
396 }
397 
398 } // namespace
399 
400 void GpuSharedMemCodeBuilder::injectFunctionsInto(llvm::Function* query_func) {
402  CHECK(init_func_);
403  replace_called_function_with(query_func, "init_shared_mem", init_func_);
404  replace_called_function_with(query_func, "write_back_nop", reduction_func_);
405 }
406 
409  CHECK(init_func_);
411 }
llvm::ConstantInt * ll_int(const T v, llvm::LLVMContext &context)
#define UNREACHABLE()
Definition: Logger.h:337
#define CHECK_GE(x, y)
Definition: Logger.h:306
const QueryMemoryDescriptor query_mem_desc_
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
std::string to_string(char const *&&v)
llvm::Function * createInitFunction() const
const SQLTypeInfo get_compact_type(const TargetInfo &target)
void verify_function_ir(const llvm::Function *func)
llvm::LLVMContext & context_
GpuSharedMemCodeBuilder(llvm::Module *module, llvm::LLVMContext &context, const QueryMemoryDescriptor &qmd, const std::vector< TargetInfo > &targets, const std::vector< int64_t > &init_agg_values, const size_t executor_id)
const int8_t getPaddedSlotWidthBytes(const size_t slot_idx) const
std::string toString() const
QueryDescriptionType getQueryDescriptionType() const
std::vector< int64_t > initialize_target_values_for_storage(const std::vector< TargetInfo > &targets)
const std::vector< int64_t > init_agg_values_
llvm::Value * codegen_smem_dest_slot_ptr(llvm::LLVMContext &context, const QueryMemoryDescriptor &query_mem_desc, llvm::IRBuilder<> &ir_builder, const size_t slot_idx, const TargetInfo &target_info, llvm::Value *dest_byte_stream, llvm::Value *byte_offset)
std::string serialize_llvm_object(const T *llvm_obj)
static QueryMemoryDescriptor fixupQueryMemoryDescriptor(const QueryMemoryDescriptor &)
Definition: ResultSet.cpp:756
const std::vector< TargetInfo > targets_
void replace_called_function_with(llvm::Function *main_func, const std::string &target_func_name, llvm::Function *replace_func)
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:411
__device__ void sync_threadblock()
llvm::Function * getFunction(const std::string &func_name) const
void injectFunctionsInto(llvm::Function *query_func)
llvm::Function * reduction_func_
llvm::Function * createReductionFunction() const