23 llvm::LLVMContext& context,
25 const std::vector<TargetInfo>& targets,
26 const std::vector<int64_t>& init_agg_values)
29 , reduction_func_(nullptr)
31 , query_mem_desc_(qmd)
33 , init_agg_values_(init_agg_values) {
97 auto dest_buffer_ptr = &*arg_it;
98 dest_buffer_ptr->setName(
"dest_buffer_ptr");
100 auto src_buffer_ptr = &*arg_it;
101 src_buffer_ptr->setName(
"src_buffer_ptr");
103 auto buffer_size = &*arg_it;
104 buffer_size->setName(
"buffer_size");
109 llvm::IRBuilder<> ir_builder(bb_entry);
115 const auto func_thread_index =
getFunction(
"get_thread_index");
116 const auto thread_idx = ir_builder.CreateCall(func_thread_index, {},
"thread_index");
120 const auto entry_count_i32 =
122 const auto is_thread_inbound =
123 ir_builder.CreateICmpSLT(thread_idx, entry_count,
"is_thread_inbound");
124 ir_builder.CreateCondBr(is_thread_inbound, bb_body, bb_exit);
126 ir_builder.SetInsertPoint(bb_body);
129 auto src_byte_stream = ir_builder.CreatePointerCast(
130 src_buffer_ptr, llvm::Type::getInt8PtrTy(
context_, 0),
"src_byte_stream");
131 const auto dest_byte_stream = ir_builder.CreatePointerCast(
132 dest_buffer_ptr, llvm::Type::getInt8PtrTy(
context_, 0),
"dest_byte_stream");
135 auto rs_reduction_jit = std::make_unique<GpuReductionHelperJIT>(
139 auto reduction_code = rs_reduction_jit->codegen();
140 reduction_code.module->setDataLayout(
141 "e-p:64:64:64-i1:8:8-i8:8:8-"
142 "i16:16:16-i32:32:32-i64:64:64-"
143 "f32:32:32-f64:64:64-v16:16:16-"
144 "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
145 reduction_code.module->setTargetTriple(
"nvptx64-nvidia-cuda");
148 bool link_error = linker.linkInModule(std::move(reduction_code.module));
153 auto reduce_one_entry_func =
getFunction(
"reduce_one_entry");
154 bool agg_func_found =
true;
155 while (agg_func_found) {
156 agg_func_found =
false;
157 for (
auto it = llvm::inst_begin(reduce_one_entry_func);
158 it != llvm::inst_end(reduce_one_entry_func);
160 if (!llvm::isa<llvm::CallInst>(*it)) {
163 auto& func_call = llvm::cast<llvm::CallInst>(*it);
164 std::string func_name = func_call.getCalledFunction()->getName().str();
165 if (func_name.length() > 4 && func_name.substr(0, 4) ==
"agg_") {
166 if (func_name.length() > 7 &&
167 func_name.substr(func_name.length() - 7) ==
"_shared") {
170 agg_func_found =
true;
171 std::vector<llvm::Value*>
args;
172 for (
size_t i = 0;
i < func_call.getNumArgOperands(); ++
i) {
173 args.push_back(func_call.getArgOperand(
i));
175 auto gpu_agg_func =
getFunction(func_name +
"_shared");
176 llvm::ReplaceInstWithInst(&func_call,
177 llvm::CallInst::Create(gpu_agg_func, args,
""));
182 const auto reduce_one_entry_idx_func =
getFunction(
"reduce_one_entry_idx");
183 CHECK(reduce_one_entry_idx_func);
188 const auto null_ptr_ll =
189 llvm::ConstantPointerNull::get(llvm::Type::getInt8PtrTy(
context_, 0));
190 const auto thread_idx_i32 = ir_builder.CreateCast(
192 ir_builder.CreateCall(reduce_one_entry_idx_func,
201 ir_builder.CreateBr(bb_exit);
202 llvm::ReturnInst::Create(
context_, bb_exit);
211 llvm::IRBuilder<>& ir_builder,
212 const size_t slot_idx,
214 llvm::Value* dest_byte_stream,
215 llvm::Value* byte_offset) {
218 auto ptr_type = [&context](
const size_t slot_bytes,
const SQLTypeInfo& sql_type) {
219 if (slot_bytes ==
sizeof(int32_t)) {
220 return llvm::Type::getInt32PtrTy(context, 3);
222 CHECK(slot_bytes ==
sizeof(int64_t));
223 return llvm::Type::getInt64PtrTy(context, 3);
226 return llvm::Type::getInt32PtrTy(context, 3);
229 const auto casted_dest_slot_address =
230 ir_builder.CreatePointerCast(ir_builder.CreateGEP(dest_byte_stream, byte_offset),
231 ptr_type(slot_bytes, sql_type),
233 return casted_dest_slot_address;
249 CHECK(!fixup_query_mem_desc.didOutputColumnar());
250 CHECK(fixup_query_mem_desc.hasKeylessHash());
257 llvm::IRBuilder<> ir_builder(bb_entry);
258 const auto func_thread_index =
getFunction(
"get_thread_index");
259 const auto thread_idx = ir_builder.CreateCall(func_thread_index, {},
"thread_index");
262 const auto declare_smem_func =
getFunction(
"declare_dynamic_shared_memory");
263 const auto shared_mem_buffer =
264 ir_builder.CreateCall(declare_smem_func, {},
"shared_mem_buffer");
266 const auto entry_count =
ll_int(fixup_query_mem_desc.getEntryCount(),
context_);
267 const auto is_thread_inbound =
268 ir_builder.CreateICmpSLT(thread_idx, entry_count,
"is_thread_inbound");
269 ir_builder.CreateCondBr(is_thread_inbound, bb_body, bb_exit);
271 ir_builder.SetInsertPoint(bb_body);
273 const auto row_size_bytes =
ll_int(fixup_query_mem_desc.getRowWidth(),
context_);
274 auto byte_offset_ll = ir_builder.CreateMul(row_size_bytes, thread_idx,
"byte_offset");
276 const auto dest_byte_stream = ir_builder.CreatePointerCast(
277 shared_mem_buffer, llvm::Type::getInt8PtrTy(
context_),
"dest_byte_stream");
280 const auto& col_slot_context = fixup_query_mem_desc.getColSlotContext();
281 size_t init_agg_idx = 0;
282 for (
size_t target_logical_idx = 0; target_logical_idx <
targets_.size();
283 ++target_logical_idx) {
284 const auto& target_info =
targets_[target_logical_idx];
285 const auto& slots_for_target = col_slot_context.getSlotsForCol(target_logical_idx);
286 for (
size_t slot_idx = slots_for_target.front(); slot_idx <= slots_for_target.back();
288 const auto slot_size = fixup_query_mem_desc.getPaddedSlotWidthBytes(slot_idx);
291 fixup_query_mem_desc,
298 llvm::Value* init_value_ll =
nullptr;
299 if (slot_size ==
sizeof(int32_t)) {
302 }
else if (slot_size ==
sizeof(int64_t)) {
308 ir_builder.CreateStore(init_value_ll, casted_dest_slot_address);
311 if (slot_idx != (col_slot_context.getSlotCount() - 1)) {
312 byte_offset_ll = ir_builder.CreateAdd(
313 byte_offset_ll,
ll_int(static_cast<size_t>(slot_size),
context_));
318 ir_builder.CreateBr(bb_exit);
320 ir_builder.SetInsertPoint(bb_exit);
324 ir_builder.CreateRet(shared_mem_buffer);
328 std::vector<llvm::Type*> input_arguments;
329 input_arguments.push_back(llvm::Type::getInt64PtrTy(
context_));
330 input_arguments.push_back(llvm::Type::getInt64PtrTy(
context_));
331 input_arguments.push_back(llvm::Type::getInt32Ty(
context_));
333 llvm::FunctionType* ft =
334 llvm::FunctionType::get(llvm::Type::getVoidTy(
context_), input_arguments,
false);
335 const auto reduction_function = llvm::Function::Create(
336 ft, llvm::Function::ExternalLinkage,
"reduce_from_smem_to_gmem",
module_);
337 return reduction_function;
341 std::vector<llvm::Type*> input_arguments;
342 input_arguments.push_back(
343 llvm::Type::getInt64PtrTy(
context_));
344 input_arguments.push_back(llvm::Type::getInt32Ty(
context_));
346 llvm::FunctionType* ft = llvm::FunctionType::get(
347 llvm::Type::getInt64PtrTy(
context_), input_arguments,
false);
348 const auto init_function = llvm::Function::Create(
349 ft, llvm::Function::ExternalLinkage,
"init_smem_func",
module_);
350 return init_function;
354 const auto function =
module_->getFunction(func_name);
355 CHECK(
function) << func_name <<
" is not found in the module.";
366 const std::string& target_func_name,
367 llvm::Function* replace_func) {
368 for (
auto it = llvm::inst_begin(main_func), e = llvm::inst_end(main_func); it != e;
370 if (!llvm::isa<llvm::CallInst>(*it)) {
373 auto& instruction = llvm::cast<llvm::CallInst>(*it);
374 if (std::string(instruction.getCalledFunction()->getName()) == target_func_name) {
375 std::vector<llvm::Value*>
args;
376 for (
size_t i = 0;
i < instruction.getNumArgOperands(); ++
i) {
377 args.push_back(instruction.getArgOperand(
i));
379 llvm::ReplaceInstWithInst(&instruction,
380 llvm::CallInst::Create(replace_func, args,
""));
384 UNREACHABLE() <<
"Target function " << target_func_name <<
" was not found in "
385 << replace_func->getName().str();
size_t getEntryCount() const
llvm::ConstantInt * ll_int(const T v, llvm::LLVMContext &context)
bool hasKeylessHash() const
const QueryMemoryDescriptor query_mem_desc_
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
llvm::Function * createInitFunction() const
const SQLTypeInfo get_compact_type(const TargetInfo &target)
void verify_function_ir(const llvm::Function *func)
llvm::LLVMContext & context_
void codegenInitialization()
const int8_t getPaddedSlotWidthBytes(const size_t slot_idx) const
std::string toString() const
QueryDescriptionType getQueryDescriptionType() const
std::vector< int64_t > initialize_target_values_for_storage(const std::vector< TargetInfo > &targets)
const std::vector< int64_t > init_agg_values_
llvm::Value * codegen_smem_dest_slot_ptr(llvm::LLVMContext &context, const QueryMemoryDescriptor &query_mem_desc, llvm::IRBuilder<> &ir_builder, const size_t slot_idx, const TargetInfo &target_info, llvm::Value *dest_byte_stream, llvm::Value *byte_offset)
std::string serialize_llvm_object(const T *llvm_obj)
llvm::Function * init_func_
static QueryMemoryDescriptor fixupQueryMemoryDescriptor(const QueryMemoryDescriptor &)
bool didOutputColumnar() const
const std::vector< TargetInfo > targets_
void replace_called_function_with(llvm::Function *main_func, const std::string &target_func_name, llvm::Function *replace_func)
#define DEBUG_TIMER(name)
__device__ void sync_threadblock()
llvm::Function * getFunction(const std::string &func_name) const
void injectFunctionsInto(llvm::Function *query_func)
llvm::Function * reduction_func_
llvm::Function * createReductionFunction() const
GpuSharedMemCodeBuilder(llvm::Module *module, llvm::LLVMContext &context, const QueryMemoryDescriptor &qmd, const std::vector< TargetInfo > &targets, const std::vector< int64_t > &init_agg_values)