17 #include <llvm/Transforms/Utils/Cloning.h>
24 llvm::Module* llvm_module,
25 llvm::LLVMContext& context,
27 const std::vector<TargetInfo>& targets,
28 const std::vector<int64_t>& init_agg_values,
29 const size_t executor_id)
30 : executor_id_(executor_id)
31 , module_(llvm_module)
33 , reduction_func_(nullptr)
35 , query_mem_desc_(qmd)
37 , init_agg_values_(init_agg_values) {
101 auto dest_buffer_ptr = &*arg_it;
102 dest_buffer_ptr->setName(
"dest_buffer_ptr");
104 auto src_buffer_ptr = &*arg_it;
105 src_buffer_ptr->setName(
"src_buffer_ptr");
107 auto buffer_size = &*arg_it;
108 buffer_size->setName(
"buffer_size");
113 llvm::IRBuilder<> ir_builder(bb_entry);
119 const auto func_thread_index =
getFunction(
"get_thread_index");
120 const auto thread_idx = ir_builder.CreateCall(func_thread_index, {},
"thread_index");
124 const auto entry_count_i32 =
126 const auto is_thread_inbound =
127 ir_builder.CreateICmpSLT(thread_idx, entry_count,
"is_thread_inbound");
128 ir_builder.CreateCondBr(is_thread_inbound, bb_body, bb_exit);
130 ir_builder.SetInsertPoint(bb_body);
133 auto src_byte_stream = ir_builder.CreatePointerCast(
134 src_buffer_ptr, llvm::Type::getInt8PtrTy(
context_, 0),
"src_byte_stream");
135 const auto dest_byte_stream = ir_builder.CreatePointerCast(
136 dest_buffer_ptr, llvm::Type::getInt8PtrTy(
context_, 0),
"dest_byte_stream");
140 auto rs_reduction_jit = std::make_unique<GpuReductionHelperJIT>(
141 fixup_query_mem_desc,
145 auto reduction_code = rs_reduction_jit->codegen();
146 CHECK(reduction_code.module);
147 reduction_code.module->setDataLayout(
148 "e-p:64:64:64-i1:8:8-i8:8:8-"
149 "i16:16:16-i32:32:32-i64:64:64-"
150 "f32:32:32-f64:64:64-v16:16:16-"
151 "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
152 reduction_code.module->setTargetTriple(
"nvptx64-nvidia-cuda");
154 std::unique_ptr<llvm::Module> owner(reduction_code.module);
155 bool link_error = linker.linkInModule(std::move(owner));
160 auto reduce_one_entry_func =
getFunction(
"reduce_one_entry");
161 bool agg_func_found =
true;
162 while (agg_func_found) {
163 agg_func_found =
false;
164 for (
auto it = llvm::inst_begin(reduce_one_entry_func);
165 it != llvm::inst_end(reduce_one_entry_func);
167 if (!llvm::isa<llvm::CallInst>(*it)) {
170 auto& func_call = llvm::cast<llvm::CallInst>(*it);
171 std::string func_name = func_call.getCalledFunction()->getName().str();
172 if (func_name.length() > 4 && func_name.substr(0, 4) ==
"agg_") {
173 if (func_name.length() > 7 &&
174 func_name.substr(func_name.length() - 7) ==
"_shared") {
177 agg_func_found =
true;
178 std::vector<llvm::Value*>
args;
179 for (
size_t i = 0; i < func_call.getNumOperands() - 1; ++i) {
180 args.push_back(func_call.getArgOperand(i));
182 auto gpu_agg_func =
getFunction(func_name +
"_shared");
183 llvm::ReplaceInstWithInst(&func_call,
184 llvm::CallInst::Create(gpu_agg_func, args,
""));
189 const auto reduce_one_entry_idx_func =
getFunction(
"reduce_one_entry_idx");
190 CHECK(reduce_one_entry_idx_func);
195 const auto null_ptr_ll =
196 llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(
context_, 0));
197 const auto thread_idx_i32 = ir_builder.CreateCast(
199 ir_builder.CreateCall(reduce_one_entry_idx_func,
208 ir_builder.CreateBr(bb_exit);
209 llvm::ReturnInst::Create(
context_, bb_exit);
218 llvm::IRBuilder<>& ir_builder,
219 const size_t slot_idx,
221 llvm::Value* dest_byte_stream,
222 llvm::Value* byte_offset) {
225 auto ptr_type = [&context](
const size_t slot_bytes,
const SQLTypeInfo& sql_type) {
226 if (slot_bytes ==
sizeof(int32_t)) {
227 return llvm::Type::getInt32PtrTy(context, 3);
229 CHECK(slot_bytes ==
sizeof(int64_t));
230 return llvm::Type::getInt64PtrTy(context, 3);
233 return llvm::Type::getInt32PtrTy(context, 3);
236 const auto casted_dest_slot_address = ir_builder.CreatePointerCast(
237 ir_builder.CreateGEP(
238 dest_byte_stream->getType()->getScalarType()->getPointerElementType(),
241 ptr_type(slot_bytes, sql_type),
243 return casted_dest_slot_address;
259 CHECK(!fixup_query_mem_desc.didOutputColumnar());
260 CHECK(fixup_query_mem_desc.hasKeylessHash());
267 llvm::IRBuilder<> ir_builder(bb_entry);
268 const auto func_thread_index =
getFunction(
"get_thread_index");
269 const auto thread_idx = ir_builder.CreateCall(func_thread_index, {},
"thread_index");
272 const auto declare_smem_func =
getFunction(
"declare_dynamic_shared_memory");
273 const auto shared_mem_buffer =
274 ir_builder.CreateCall(declare_smem_func, {},
"shared_mem_buffer");
276 const auto entry_count =
ll_int(fixup_query_mem_desc.getEntryCount(),
context_);
277 const auto is_thread_inbound =
278 ir_builder.CreateICmpSLT(thread_idx, entry_count,
"is_thread_inbound");
279 ir_builder.CreateCondBr(is_thread_inbound, bb_body, bb_exit);
281 ir_builder.SetInsertPoint(bb_body);
283 const auto row_size_bytes =
ll_int(fixup_query_mem_desc.getRowWidth(),
context_);
284 auto byte_offset_ll = ir_builder.CreateMul(row_size_bytes, thread_idx,
"byte_offset");
286 const auto dest_byte_stream = ir_builder.CreatePointerCast(
287 shared_mem_buffer, llvm::Type::getInt8PtrTy(
context_),
"dest_byte_stream");
290 const auto& col_slot_context = fixup_query_mem_desc.getColSlotContext();
291 size_t init_agg_idx = 0;
292 for (
size_t target_logical_idx = 0; target_logical_idx <
targets_.size();
293 ++target_logical_idx) {
294 const auto& target_info =
targets_[target_logical_idx];
295 const auto& slots_for_target = col_slot_context.getSlotsForCol(target_logical_idx);
296 for (
size_t slot_idx = slots_for_target.front(); slot_idx <= slots_for_target.back();
298 const auto slot_size = fixup_query_mem_desc.getPaddedSlotWidthBytes(slot_idx);
301 fixup_query_mem_desc,
308 llvm::Value* init_value_ll =
nullptr;
309 if (slot_size ==
sizeof(int32_t)) {
312 }
else if (slot_size ==
sizeof(int64_t)) {
318 ir_builder.CreateStore(init_value_ll, casted_dest_slot_address);
321 if (slot_idx != (col_slot_context.getSlotCount() - 1)) {
322 byte_offset_ll = ir_builder.CreateAdd(
323 byte_offset_ll,
ll_int(static_cast<size_t>(slot_size),
context_));
328 ir_builder.CreateBr(bb_exit);
330 ir_builder.SetInsertPoint(bb_exit);
334 ir_builder.CreateRet(shared_mem_buffer);
338 std::vector<llvm::Type*> input_arguments;
339 input_arguments.push_back(llvm::Type::getInt64PtrTy(
context_));
340 input_arguments.push_back(llvm::Type::getInt64PtrTy(
context_));
341 input_arguments.push_back(llvm::Type::getInt32Ty(
context_));
343 llvm::FunctionType* ft =
344 llvm::FunctionType::get(llvm::Type::getVoidTy(
context_), input_arguments,
false);
345 const auto reduction_function = llvm::Function::Create(
346 ft, llvm::Function::ExternalLinkage,
"reduce_from_smem_to_gmem",
module_);
347 return reduction_function;
351 std::vector<llvm::Type*> input_arguments;
352 input_arguments.push_back(
353 llvm::Type::getInt64PtrTy(
context_));
354 input_arguments.push_back(llvm::Type::getInt32Ty(
context_));
356 llvm::FunctionType* ft = llvm::FunctionType::get(
357 llvm::Type::getInt64PtrTy(
context_), input_arguments,
false);
358 const auto init_function = llvm::Function::Create(
359 ft, llvm::Function::ExternalLinkage,
"init_smem_func",
module_);
360 return init_function;
364 const auto function =
module_->getFunction(func_name);
365 CHECK(
function) << func_name <<
" is not found in the module.";
376 const std::string& target_func_name,
377 llvm::Function* replace_func) {
378 for (
auto it = llvm::inst_begin(main_func), e = llvm::inst_end(main_func); it != e;
380 if (!llvm::isa<llvm::CallInst>(*it)) {
383 auto& instruction = llvm::cast<llvm::CallInst>(*it);
384 if (std::string(instruction.getCalledFunction()->getName()) == target_func_name) {
385 std::vector<llvm::Value*>
args;
386 for (
size_t i = 0; i < instruction.getNumOperands() - 1; ++i) {
387 args.push_back(instruction.getArgOperand(i));
389 llvm::ReplaceInstWithInst(&instruction,
390 llvm::CallInst::Create(replace_func, args,
""));
394 UNREACHABLE() <<
"Target function " << target_func_name <<
" was not found in "
395 << replace_func->getName().str();
size_t getEntryCount() const
llvm::ConstantInt * ll_int(const T v, llvm::LLVMContext &context)
bool hasKeylessHash() const
const QueryMemoryDescriptor query_mem_desc_
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
llvm::Function * createInitFunction() const
const SQLTypeInfo get_compact_type(const TargetInfo &target)
void verify_function_ir(const llvm::Function *func)
llvm::LLVMContext & context_
void codegenInitialization()
GpuSharedMemCodeBuilder(llvm::Module *module, llvm::LLVMContext &context, const QueryMemoryDescriptor &qmd, const std::vector< TargetInfo > &targets, const std::vector< int64_t > &init_agg_values, const size_t executor_id)
const int8_t getPaddedSlotWidthBytes(const size_t slot_idx) const
std::string toString() const
QueryDescriptionType getQueryDescriptionType() const
std::vector< int64_t > initialize_target_values_for_storage(const std::vector< TargetInfo > &targets)
const std::vector< int64_t > init_agg_values_
llvm::Value * codegen_smem_dest_slot_ptr(llvm::LLVMContext &context, const QueryMemoryDescriptor &query_mem_desc, llvm::IRBuilder<> &ir_builder, const size_t slot_idx, const TargetInfo &target_info, llvm::Value *dest_byte_stream, llvm::Value *byte_offset)
std::string serialize_llvm_object(const T *llvm_obj)
llvm::Function * init_func_
static QueryMemoryDescriptor fixupQueryMemoryDescriptor(const QueryMemoryDescriptor &)
bool didOutputColumnar() const
const std::vector< TargetInfo > targets_
void replace_called_function_with(llvm::Function *main_func, const std::string &target_func_name, llvm::Function *replace_func)
#define DEBUG_TIMER(name)
__device__ void sync_threadblock()
llvm::Function * getFunction(const std::string &func_name) const
void injectFunctionsInto(llvm::Function *query_func)
llvm::Function * reduction_func_
llvm::Function * createReductionFunction() const