/*
 * Copyright 2019 OmniSci, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "GpuSharedMemoryUtils.h"
#include "ResultSetReductionJIT.h"
#include "RuntimeFunctions.h"

// LLVM headers for the linker and instruction-rewriting utilities used below:
#include <llvm/IR/InstIterator.h>
#include <llvm/Linker/Linker.h>
#include <llvm/Transforms/Utils/BasicBlockUtils.h>

GpuSharedMemCodeBuilder::GpuSharedMemCodeBuilder(
    llvm::Module* module,
    llvm::LLVMContext& context,
    const QueryMemoryDescriptor& qmd,
    const std::vector<TargetInfo>& targets,
    const std::vector<int64_t>& init_agg_values)
    : module_(module)
    , context_(context)
    , reduction_func_(nullptr)
    , init_func_(nullptr)
    , query_mem_desc_(qmd)
    , targets_(targets)
    , init_agg_values_(init_agg_values) {
  // shared memory code generation currently assumes a row-wise, keyless
  // perfect-hash group-by output layout:
  CHECK(query_mem_desc_.hasKeylessHash());
  CHECK(query_mem_desc_.getQueryDescriptionType() ==
        QueryDescriptionType::GroupByPerfectHash);
}

void GpuSharedMemCodeBuilder::codegen() {
  auto timer = DEBUG_TIMER(__func__);

  // codegen the init function:
  init_func_ = createInitFunction();
  CHECK(init_func_);
  codegenInitialization();
  verify_function_ir(init_func_);

  // codegen the reduction function:
  reduction_func_ = createReductionFunction();
  CHECK(reduction_func_);
  codegenReduction();
  verify_function_ir(reduction_func_);
}
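
// the init and reduction functions generated above are later spliced into the
// query kernel by injectFunctionsInto() below, replacing its placeholder calls
// to init_shared_mem and write_back_nop.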

void GpuSharedMemCodeBuilder::codegenReduction() {
  CHECK(reduction_func_);
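  // a sketch of what this function emits (illustrative pseudo-IR, not the
  // verbatim output):
  //
  //   reduce_from_smem_to_gmem(dest_buffer_ptr, src_buffer_ptr, buffer_size):
  //     sync_threadblock()
  //     if (thread_index < entry_count):                        ; .body
  //       reduce_one_entry_idx(dest, src, thread_index, ...)
  //     return                                                  ; .exit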
  // adding names to input arguments:
  auto arg_it = reduction_func_->arg_begin();
  auto dest_buffer_ptr = &*arg_it;
  dest_buffer_ptr->setName("dest_buffer_ptr");
  arg_it++;
  auto src_buffer_ptr = &*arg_it;
  src_buffer_ptr->setName("src_buffer_ptr");
  arg_it++;
  auto buffer_size = &*arg_it;
  buffer_size->setName("buffer_size");

  auto bb_entry = llvm::BasicBlock::Create(context_, ".entry", reduction_func_);
  auto bb_body = llvm::BasicBlock::Create(context_, ".body", reduction_func_);
  auto bb_exit = llvm::BasicBlock::Create(context_, ".exit", reduction_func_);
  llvm::IRBuilder<> ir_builder(bb_entry);

  // synchronize all threads within a threadblock, so that all per-thread
  // aggregation results are visible in shared memory before reduction begins:
  const auto sync_threadblock = getFunction("sync_threadblock");
  ir_builder.CreateCall(sync_threadblock, {});

  const auto func_thread_index = getFunction("get_thread_index");
  const auto thread_idx = ir_builder.CreateCall(func_thread_index, {}, "thread_index");

  // branch out if this thread is out of bounds:
  const auto entry_count = ll_int(query_mem_desc_.getEntryCount(), context_);
  const auto entry_count_i32 =
      ll_int(static_cast<int32_t>(query_mem_desc_.getEntryCount()), context_);
  const auto is_thread_inbound =
      ir_builder.CreateICmpSLT(thread_idx, entry_count, "is_thread_inbound");
  ir_builder.CreateCondBr(is_thread_inbound, bb_body, bb_exit);

  ir_builder.SetInsertPoint(bb_body);

  // cast src/dest buffers into byte streams:
  auto src_byte_stream = ir_builder.CreatePointerCast(
      src_buffer_ptr, llvm::Type::getInt8PtrTy(context_, 0), "src_byte_stream");
  const auto dest_byte_stream = ir_builder.CreatePointerCast(
      dest_buffer_ptr, llvm::Type::getInt8PtrTy(context_, 0), "dest_byte_stream");

  // running the result set reduction JIT code to get the reduce_one_entry_idx
  // function:
  auto rs_reduction_jit = std::make_unique<GpuReductionHelperJIT>(
      query_mem_desc_,
      targets_,
      initialize_target_values_for_storage(targets_));
  auto reduction_code = rs_reduction_jit->codegen();
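  // the reduction module is destined for the GPU: give it the NVPTX data
  // layout and target triple so it links cleanly into this device module: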
  reduction_code.module->setDataLayout(
      "e-p:64:64:64-i1:8:8-i8:8:8-"
      "i16:16:16-i32:32:32-i64:64:64-"
      "f32:32:32-f64:64:64-v16:16:16-"
      "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
  reduction_code.module->setTargetTriple("nvptx64-nvidia-cuda");

  llvm::Linker linker(*module_);
  bool link_error = linker.linkInModule(std::move(reduction_code.module));
  CHECK(!link_error);

  // go through the reduction code and replace all occurrences of agg functions
  // with their _shared counterparts, which are specifically used on GPUs:
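  // note: ReplaceInstWithInst invalidates the instruction iterator, so after
  // each rewrite we break out and rescan the function until no plain agg_*
  // call remains.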
  auto reduce_one_entry_func = getFunction("reduce_one_entry");
  bool agg_func_found = true;
  while (agg_func_found) {
    agg_func_found = false;
    for (auto it = llvm::inst_begin(reduce_one_entry_func);
         it != llvm::inst_end(reduce_one_entry_func);
         it++) {
      if (!llvm::isa<llvm::CallInst>(*it)) {
        continue;
      }
      auto& func_call = llvm::cast<llvm::CallInst>(*it);
      std::string func_name = func_call.getCalledFunction()->getName().str();
      if (func_name.length() > 4 && func_name.substr(0, 4) == "agg_") {
        if (func_name.length() > 7 &&
            func_name.substr(func_name.length() - 7) == "_shared") {
          continue;
        }
        agg_func_found = true;
        std::vector<llvm::Value*> args;
        for (size_t i = 0; i < func_call.getNumArgOperands(); ++i) {
          args.push_back(func_call.getArgOperand(i));
        }
        auto gpu_agg_func = getFunction(func_name + "_shared");
        llvm::ReplaceInstWithInst(&func_call,
                                  llvm::CallInst::Create(gpu_agg_func, args, ""));
        break;
      }
    }
  }
  const auto reduce_one_entry_idx_func = getFunction("reduce_one_entry_idx");
  CHECK(reduce_one_entry_idx_func);

  // the qmd_handles are only used with count distinct and baseline group by;
  // the serialized varlen buffer is only used with SAMPLE on varlen types,
  // which we disable for the current shared memory support. Hence, we pass
  // null pointers for all of them:
  const auto null_ptr_ll =
      llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(context_, 0));
  const auto thread_idx_i32 = ir_builder.CreateCast(
      llvm::Instruction::CastOps::Trunc, thread_idx, get_int_type(32, context_));
  ir_builder.CreateCall(reduce_one_entry_idx_func,
                        {dest_byte_stream,
                         src_byte_stream,
                         thread_idx_i32,
                         entry_count_i32,
                         null_ptr_ll,
                         null_ptr_ll,
                         null_ptr_ll},
                        "");
  ir_builder.CreateBr(bb_exit);
  llvm::ReturnInst::Create(context_, bb_exit);
}

namespace {
// given a pointer to the beginning of an entry in the destination buffer, this
// function creates a properly typed pointer for a specific slot index.
// it also assumes these pointers are within the shared memory address space (3).
llvm::Value* codegen_smem_dest_slot_ptr(llvm::LLVMContext& context,
                                        const QueryMemoryDescriptor& query_mem_desc,
                                        llvm::IRBuilder<>& ir_builder,
                                        const size_t slot_idx,
                                        const TargetInfo& target_info,
                                        llvm::Value* dest_byte_stream,
                                        llvm::Value* byte_offset) {
  const auto sql_type = get_compact_type(target_info);
  const auto slot_bytes = query_mem_desc.getPaddedSlotWidthBytes(slot_idx);
  auto ptr_type = [&context](const size_t slot_bytes, const SQLTypeInfo& sql_type) {
    if (slot_bytes == sizeof(int32_t)) {
      return llvm::Type::getInt32PtrTy(context, /*address_space=*/3);
    } else {
      CHECK(slot_bytes == sizeof(int64_t));
      return llvm::Type::getInt64PtrTy(context, /*address_space=*/3);
    }
    UNREACHABLE() << "Invalid slot size encountered: " << std::to_string(slot_bytes);
    return llvm::Type::getInt32PtrTy(context, /*address_space=*/3);
  };

  const auto casted_dest_slot_address =
      ir_builder.CreatePointerCast(ir_builder.CreateGEP(dest_byte_stream, byte_offset),
                                   ptr_type(slot_bytes, sql_type),
                                   "dest_slot_adr_" + std::to_string(slot_idx));
  return casted_dest_slot_address;
}
}  // namespace

void GpuSharedMemCodeBuilder::codegenInitialization() {
  CHECK(init_func_);
  // similar to the rest of the system, we use the fixed-up QMD to be able to
  // handle reductions; this should be removed in the future.
  auto fixup_query_mem_desc = ResultSet::fixupQueryMemoryDescriptor(query_mem_desc_);
  CHECK(!fixup_query_mem_desc.didOutputColumnar());
  CHECK(fixup_query_mem_desc.hasKeylessHash());
  CHECK_GE(init_agg_values_.size(), targets_.size());

  auto bb_entry = llvm::BasicBlock::Create(context_, ".entry", init_func_);
  auto bb_body = llvm::BasicBlock::Create(context_, ".body", init_func_);
  auto bb_exit = llvm::BasicBlock::Create(context_, ".exit", init_func_);

  llvm::IRBuilder<> ir_builder(bb_entry);
  const auto func_thread_index = getFunction("get_thread_index");
  const auto thread_idx = ir_builder.CreateCall(func_thread_index, {}, "thread_index");

  // declare dynamic shared memory:
  const auto declare_smem_func = getFunction("declare_dynamic_shared_memory");
  const auto shared_mem_buffer =
      ir_builder.CreateCall(declare_smem_func, {}, "shared_mem_buffer");
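  // note: this call only yields the base address of the dynamic shared memory
  // buffer; the buffer's size itself is fixed at kernel launch time.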

  const auto entry_count = ll_int(fixup_query_mem_desc.getEntryCount(), context_);
  const auto is_thread_inbound =
      ir_builder.CreateICmpSLT(thread_idx, entry_count, "is_thread_inbound");
  ir_builder.CreateCondBr(is_thread_inbound, bb_body, bb_exit);

  ir_builder.SetInsertPoint(bb_body);
  // compute byte offset assigned to this thread:
  const auto row_size_bytes = ll_int(fixup_query_mem_desc.getRowWidth(), context_);
  auto byte_offset_ll = ir_builder.CreateMul(row_size_bytes, thread_idx, "byte_offset");

  const auto dest_byte_stream = ir_builder.CreatePointerCast(
      shared_mem_buffer, llvm::Type::getInt8PtrTy(context_), "dest_byte_stream");

  // each thread will be responsible for initializing one entry:
  const auto& col_slot_context = fixup_query_mem_desc.getColSlotContext();
  size_t init_agg_idx = 0;
  for (size_t target_logical_idx = 0; target_logical_idx < targets_.size();
       ++target_logical_idx) {
    const auto& target_info = targets_[target_logical_idx];
    const auto& slots_for_target = col_slot_context.getSlotsForCol(target_logical_idx);
    for (size_t slot_idx = slots_for_target.front(); slot_idx <= slots_for_target.back();
         slot_idx++) {
      const auto slot_size = fixup_query_mem_desc.getPaddedSlotWidthBytes(slot_idx);

      auto casted_dest_slot_address = codegen_smem_dest_slot_ptr(context_,
                                                                 fixup_query_mem_desc,
                                                                 ir_builder,
                                                                 slot_idx,
                                                                 target_info,
                                                                 dest_byte_stream,
                                                                 byte_offset_ll);

      llvm::Value* init_value_ll = nullptr;
      if (slot_size == sizeof(int32_t)) {
        init_value_ll =
            ll_int(static_cast<int32_t>(init_agg_values_[init_agg_idx++]), context_);
      } else if (slot_size == sizeof(int64_t)) {
        init_value_ll =
            ll_int(static_cast<int64_t>(init_agg_values_[init_agg_idx++]), context_);
      } else {
        UNREACHABLE() << "Invalid slot size encountered.";
      }
      ir_builder.CreateStore(init_value_ll, casted_dest_slot_address);

      // if this is not the last slot, compute the next slot's offset:
      if (slot_idx != (col_slot_context.getSlotCount() - 1)) {
        byte_offset_ll = ir_builder.CreateAdd(
            byte_offset_ll, ll_int(static_cast<size_t>(slot_size), context_));
      }
    }
  }

  ir_builder.CreateBr(bb_exit);

  ir_builder.SetInsertPoint(bb_exit);
  // synchronize all threads within a threadblock, so that the buffer is fully
  // initialized before any thread starts aggregating into it:
  const auto sync_threadblock = getFunction("sync_threadblock");
  ir_builder.CreateCall(sync_threadblock, {});
  ir_builder.CreateRet(shared_mem_buffer);
}
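
// illustrative shape of the generated init function (pseudo-code, not the
// verbatim IR):
//
//   int64_t* init_smem_func(int64_t* buffer, int32_t buffer_size):
//     smem = declare_dynamic_shared_memory()
//     if (thread_index < entry_count):
//       <store init_agg_values_ into this thread's row of slots in smem>
//     sync_threadblock()
//     return smem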

llvm::Function* GpuSharedMemCodeBuilder::createReductionFunction() const {
  std::vector<llvm::Type*> input_arguments;
  input_arguments.push_back(llvm::Type::getInt64PtrTy(context_));  // dest_buffer_ptr
  input_arguments.push_back(llvm::Type::getInt64PtrTy(context_));  // src_buffer_ptr
  input_arguments.push_back(llvm::Type::getInt32Ty(context_));     // buffer_size

  llvm::FunctionType* ft =
      llvm::FunctionType::get(llvm::Type::getVoidTy(context_), input_arguments, false);
  const auto reduction_function = llvm::Function::Create(
      ft, llvm::Function::ExternalLinkage, "reduce_from_smem_to_gmem", module_);
  return reduction_function;
}

llvm::Function* GpuSharedMemCodeBuilder::createInitFunction() const {
  std::vector<llvm::Type*> input_arguments;
  input_arguments.push_back(
      llvm::Type::getInt64PtrTy(context_));  // a pointer to the buffer
  input_arguments.push_back(llvm::Type::getInt32Ty(context_));  // buffer size in bytes

  llvm::FunctionType* ft = llvm::FunctionType::get(
      llvm::Type::getInt64PtrTy(context_), input_arguments, false);
  const auto init_function = llvm::Function::Create(
      ft, llvm::Function::ExternalLinkage, "init_smem_func", module_);
  return init_function;
}

llvm::Function* GpuSharedMemCodeBuilder::getFunction(const std::string& func_name) const {
  const auto function = module_->getFunction(func_name);
  CHECK(function) << func_name << " is not found in the module.";
  return function;
}

namespace {
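// searches through the main function and replaces the first call to
// target_func_name with a call to replace_func, forwarding the original
// call's arguments.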
void replace_called_function_with(llvm::Function* main_func,
                                  const std::string& target_func_name,
                                  llvm::Function* replace_func) {
  for (auto it = llvm::inst_begin(main_func), e = llvm::inst_end(main_func); it != e;
       ++it) {
    if (!llvm::isa<llvm::CallInst>(*it)) {
      continue;
    }
    auto& instruction = llvm::cast<llvm::CallInst>(*it);
    if (std::string(instruction.getCalledFunction()->getName()) == target_func_name) {
      std::vector<llvm::Value*> args;
      for (size_t i = 0; i < instruction.getNumArgOperands(); ++i) {
        args.push_back(instruction.getArgOperand(i));
      }
      llvm::ReplaceInstWithInst(&instruction,
                                llvm::CallInst::Create(replace_func, args, ""));
      return;
    }
  }
  UNREACHABLE() << "Target function " << target_func_name << " was not found in "
                << main_func->getName().str();
}

}  // namespace

void GpuSharedMemCodeBuilder::injectFunctionsInto(llvm::Function* query_func) {
  CHECK(reduction_func_);
  CHECK(init_func_);
  replace_called_function_with(query_func, "init_shared_mem", init_func_);
  replace_called_function_with(query_func, "write_back_nop", reduction_func_);
}

std::string GpuSharedMemCodeBuilder::toString() const {
  CHECK(reduction_func_);
  CHECK(init_func_);
  return serialize_llvm_object(init_func_) + serialize_llvm_object(reduction_func_);
}