OmniSciDB  d2f719934e
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
QueryTemplateGenerator.cpp File Reference
#include "QueryTemplateGenerator.h"
#include "IRCodegenUtils.h"
#include "Logger/Logger.h"
#include <llvm/IR/Constants.h>
#include <llvm/IR/IRBuilder.h>
#include <llvm/IR/Instructions.h>
#include <llvm/IR/Verifier.h>
+ Include dependency graph for QueryTemplateGenerator.cpp:

Go to the source code of this file.

Namespaces

 anonymous_namespace{QueryTemplateGenerator.cpp}
 

Functions

llvm::Type * anonymous_namespace{QueryTemplateGenerator.cpp}::get_pointer_element_type (llvm::Value *value)
 
template<class Attributes >
llvm::Function * anonymous_namespace{QueryTemplateGenerator.cpp}::default_func_builder (llvm::Module *mod, const std::string &name)
 
template<class Attributes >
llvm::Function * anonymous_namespace{QueryTemplateGenerator.cpp}::pos_start (llvm::Module *mod)
 
template<class Attributes >
llvm::Function * anonymous_namespace{QueryTemplateGenerator.cpp}::group_buff_idx (llvm::Module *mod)
 
template<class Attributes >
llvm::Function * anonymous_namespace{QueryTemplateGenerator.cpp}::pos_step (llvm::Module *mod)
 
template<class Attributes >
llvm::Function * anonymous_namespace{QueryTemplateGenerator.cpp}::row_process (llvm::Module *mod, const size_t aggr_col_count, const bool hoist_literals)
 
template<class Attributes >
std::tuple< llvm::Function
*, llvm::CallInst * > 
query_template_impl (llvm::Module *mod, const size_t aggr_col_count, const bool hoist_literals, const bool is_estimate_query, const GpuSharedMemoryContext &gpu_smem_context)
 
template<class Attributes >
std::tuple< llvm::Function
*, llvm::CallInst * > 
query_group_by_template_impl (llvm::Module *mod, const bool hoist_literals, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const bool check_scan_limit, const GpuSharedMemoryContext &gpu_smem_context)
 
std::tuple< llvm::Function
*, llvm::CallInst * > 
query_template (llvm::Module *module, const size_t aggr_col_count, const bool hoist_literals, const bool is_estimate_query, const GpuSharedMemoryContext &gpu_smem_context)
 
std::tuple< llvm::Function
*, llvm::CallInst * > 
query_group_by_template (llvm::Module *module, const bool hoist_literals, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const bool check_scan_limit, const GpuSharedMemoryContext &gpu_smem_context)
 

Function Documentation

std::tuple<llvm::Function*, llvm::CallInst*> query_group_by_template ( llvm::Module *  module,
const bool  hoist_literals,
const QueryMemoryDescriptor query_mem_desc,
const ExecutorDeviceType  device_type,
const bool  check_scan_limit,
const GpuSharedMemoryContext gpu_smem_context 
)

Definition at line 907 of file QueryTemplateGenerator.cpp.

References query_mem_desc.

913  {
914  return query_group_by_template_impl<llvm::AttributeList>(module,
915  hoist_literals,
917  device_type,
918  check_scan_limit,
919  gpu_smem_context);
920 }
template<class Attributes >
std::tuple<llvm::Function*, llvm::CallInst*> query_group_by_template_impl ( llvm::Module *  mod,
const bool  hoist_literals,
const QueryMemoryDescriptor query_mem_desc,
const ExecutorDeviceType  device_type,
const bool  check_scan_limit,
const GpuSharedMemoryContext gpu_smem_context 
)

Definition at line 544 of file QueryTemplateGenerator.cpp.

References CHECK, logger::FATAL, anonymous_namespace{QueryTemplateGenerator.cpp}::get_pointer_element_type(), GpuSharedMemoryContext::getSharedMemorySize(), GPU, anonymous_namespace{QueryTemplateGenerator.cpp}::group_buff_idx(), QueryMemoryDescriptor::hasVarlenOutput(), GpuSharedMemoryContext::isSharedMemoryUsed(), QueryMemoryDescriptor::isWarpSyncRequired(), LLVM_ALIGN, LOG, anonymous_namespace{QueryTemplateGenerator.cpp}::pos_start(), anonymous_namespace{QueryTemplateGenerator.cpp}::pos_step(), and anonymous_namespace{QueryTemplateGenerator.cpp}::row_process().

550  {
551  if (gpu_smem_context.isSharedMemoryUsed()) {
552  CHECK(device_type == ExecutorDeviceType::GPU);
553  }
554  using namespace llvm;
555 
556  auto func_pos_start = pos_start<Attributes>(mod);
557  CHECK(func_pos_start);
558  auto func_pos_step = pos_step<Attributes>(mod);
559  CHECK(func_pos_step);
560  auto func_group_buff_idx = group_buff_idx<Attributes>(mod);
561  CHECK(func_group_buff_idx);
562  auto func_row_process = row_process<Attributes>(mod, 0, hoist_literals);
563  CHECK(func_row_process);
564  auto func_init_shared_mem = gpu_smem_context.isSharedMemoryUsed()
565  ? mod->getFunction("init_shared_mem")
566  : mod->getFunction("init_shared_mem_nop");
567  CHECK(func_init_shared_mem);
568 
569  auto func_write_back = mod->getFunction("write_back_nop");
570  CHECK(func_write_back);
571 
572  auto i32_type = IntegerType::get(mod->getContext(), 32);
573  auto i64_type = IntegerType::get(mod->getContext(), 64);
574  auto pi8_type = PointerType::get(IntegerType::get(mod->getContext(), 8), 0);
575  auto pi32_type = PointerType::get(i32_type, 0);
576  auto pi64_type = PointerType::get(i64_type, 0);
577  auto ppi64_type = PointerType::get(pi64_type, 0);
578  auto ppi8_type = PointerType::get(pi8_type, 0);
579 
580  std::vector<Type*> query_args;
581  query_args.push_back(ppi8_type);
582  if (hoist_literals) {
583  query_args.push_back(pi8_type);
584  }
585  query_args.push_back(pi64_type);
586  query_args.push_back(pi64_type);
587  query_args.push_back(pi32_type);
588  query_args.push_back(pi64_type);
589 
590  query_args.push_back(ppi64_type);
591  query_args.push_back(i32_type);
592  query_args.push_back(pi64_type);
593  query_args.push_back(pi32_type);
594  query_args.push_back(pi32_type);
595 
596  FunctionType* query_func_type = FunctionType::get(
597  /*Result=*/Type::getVoidTy(mod->getContext()),
598  /*Params=*/query_args,
599  /*isVarArg=*/false);
600 
601  std::string query_name{"query_group_by_template"};
602  auto query_func_ptr = mod->getFunction(query_name);
603  CHECK(!query_func_ptr);
604 
605  query_func_ptr = Function::Create(
606  /*Type=*/query_func_type,
607  /*Linkage=*/GlobalValue::ExternalLinkage,
608  /*Name=*/"query_group_by_template",
609  mod);
610 
611  query_func_ptr->setCallingConv(CallingConv::C);
612 
613  Attributes query_func_pal;
614  {
615  SmallVector<Attributes, 4> Attrs;
616  Attributes PAS;
617  {
618  AttrBuilder B;
619  B.addAttribute(Attribute::ReadNone);
620  B.addAttribute(Attribute::NoCapture);
621  PAS = Attributes::get(mod->getContext(), 1U, B);
622  }
623 
624  Attrs.push_back(PAS);
625  {
626  AttrBuilder B;
627  B.addAttribute(Attribute::ReadOnly);
628  B.addAttribute(Attribute::NoCapture);
629  PAS = Attributes::get(mod->getContext(), 2U, B);
630  }
631 
632  Attrs.push_back(PAS);
633  {
634  AttrBuilder B;
635  B.addAttribute(Attribute::ReadNone);
636  B.addAttribute(Attribute::NoCapture);
637  PAS = Attributes::get(mod->getContext(), 3U, B);
638  }
639 
640  Attrs.push_back(PAS);
641  {
642  AttrBuilder B;
643  B.addAttribute(Attribute::ReadOnly);
644  B.addAttribute(Attribute::NoCapture);
645  PAS = Attributes::get(mod->getContext(), 4U, B);
646  }
647 
648  Attrs.push_back(PAS);
649  {
650  AttrBuilder B;
651  B.addAttribute(Attribute::UWTable);
652  PAS = Attributes::get(mod->getContext(), ~0U, B);
653  }
654 
655  Attrs.push_back(PAS);
656 
657  query_func_pal = Attributes::get(mod->getContext(), Attrs);
658  }
659  query_func_ptr->setAttributes(query_func_pal);
660 
661  Function::arg_iterator query_arg_it = query_func_ptr->arg_begin();
662  Value* byte_stream = &*query_arg_it;
663  byte_stream->setName("byte_stream");
664  Value* literals{nullptr};
665  if (hoist_literals) {
666  literals = &*(++query_arg_it);
667  ;
668  literals->setName("literals");
669  }
670  Value* row_count_ptr = &*(++query_arg_it);
671  row_count_ptr->setName("row_count_ptr");
672  Value* frag_row_off_ptr = &*(++query_arg_it);
673  frag_row_off_ptr->setName("frag_row_off_ptr");
674  Value* max_matched_ptr = &*(++query_arg_it);
675  max_matched_ptr->setName("max_matched_ptr");
676  Value* agg_init_val = &*(++query_arg_it);
677  agg_init_val->setName("agg_init_val");
678  Value* group_by_buffers = &*(++query_arg_it);
679  group_by_buffers->setName("group_by_buffers");
680  Value* frag_idx = &*(++query_arg_it);
681  frag_idx->setName("frag_idx");
682  Value* join_hash_tables = &*(++query_arg_it);
683  join_hash_tables->setName("join_hash_tables");
684  Value* total_matched = &*(++query_arg_it);
685  total_matched->setName("total_matched");
686  Value* error_code = &*(++query_arg_it);
687  error_code->setName("error_code");
688 
689  auto bb_entry = BasicBlock::Create(mod->getContext(), ".entry", query_func_ptr, 0);
690  auto bb_preheader =
691  BasicBlock::Create(mod->getContext(), ".loop.preheader", query_func_ptr, 0);
692  auto bb_forbody = BasicBlock::Create(mod->getContext(), ".forbody", query_func_ptr, 0);
693  auto bb_crit_edge =
694  BasicBlock::Create(mod->getContext(), "._crit_edge", query_func_ptr, 0);
695  auto bb_exit = BasicBlock::Create(mod->getContext(), ".exit", query_func_ptr, 0);
696 
697  // Block .entry
698  LoadInst* row_count = new LoadInst(
699  get_pointer_element_type(row_count_ptr), row_count_ptr, "", false, bb_entry);
700  row_count->setAlignment(LLVM_ALIGN(8));
701  row_count->setName("row_count");
702 
703  LoadInst* max_matched = new LoadInst(
704  get_pointer_element_type(max_matched_ptr), max_matched_ptr, "", false, bb_entry);
705  max_matched->setAlignment(LLVM_ALIGN(8));
706 
707  auto crt_matched_ptr = new AllocaInst(i32_type, 0, "crt_matched", bb_entry);
708  auto old_total_matched_ptr = new AllocaInst(i32_type, 0, "old_total_matched", bb_entry);
709  CallInst* pos_start = CallInst::Create(func_pos_start, "", bb_entry);
710  pos_start->setCallingConv(CallingConv::C);
711  pos_start->setTailCall(true);
712  Attributes pos_start_pal;
713  pos_start->setAttributes(pos_start_pal);
714 
715  CallInst* pos_step = CallInst::Create(func_pos_step, "", bb_entry);
716  pos_step->setCallingConv(CallingConv::C);
717  pos_step->setTailCall(true);
718  Attributes pos_step_pal;
719  pos_step->setAttributes(pos_step_pal);
720 
721  CallInst* group_buff_idx_call = CallInst::Create(func_group_buff_idx, "", bb_entry);
722  group_buff_idx_call->setCallingConv(CallingConv::C);
723  group_buff_idx_call->setTailCall(true);
724  Attributes group_buff_idx_pal;
725  group_buff_idx_call->setAttributes(group_buff_idx_pal);
726  Value* group_buff_idx = group_buff_idx_call;
727 
728  const PointerType* Ty = dyn_cast<PointerType>(group_by_buffers->getType());
729  CHECK(Ty);
730 
731  Value* varlen_output_buffer{nullptr};
732  if (query_mem_desc.hasVarlenOutput()) {
733  // make the varlen buffer the _first_ 8 byte value in the group by buffers double ptr,
734  // and offset the group by buffers index by 8 bytes
735  auto varlen_output_buffer_gep = GetElementPtrInst::Create(
736  Ty->getElementType(),
737  group_by_buffers,
738  llvm::ConstantInt::get(llvm::Type::getInt32Ty(mod->getContext()), 0),
739  "",
740  bb_entry);
741  varlen_output_buffer =
742  new LoadInst(get_pointer_element_type(varlen_output_buffer_gep),
743  varlen_output_buffer_gep,
744  "varlen_output_buffer",
745  false,
746  bb_entry);
747 
748  group_buff_idx = BinaryOperator::Create(
749  Instruction::Add,
751  llvm::ConstantInt::get(llvm::Type::getInt32Ty(mod->getContext()), 1),
752  "group_buff_idx_varlen_offset",
753  bb_entry);
754  } else {
755  varlen_output_buffer =
756  ConstantPointerNull::get(Type::getInt64PtrTy(mod->getContext()));
757  }
758  CHECK(varlen_output_buffer);
759 
760  CastInst* pos_start_i64 = new SExtInst(pos_start, i64_type, "", bb_entry);
761  GetElementPtrInst* group_by_buffers_gep = GetElementPtrInst::Create(
762  Ty->getElementType(), group_by_buffers, group_buff_idx, "", bb_entry);
763  LoadInst* col_buffer = new LoadInst(get_pointer_element_type(group_by_buffers_gep),
764  group_by_buffers_gep,
765  "",
766  false,
767  bb_entry);
768  col_buffer->setName("col_buffer");
769  col_buffer->setAlignment(LLVM_ALIGN(8));
770 
771  llvm::ConstantInt* shared_mem_bytes_lv =
772  ConstantInt::get(i32_type, gpu_smem_context.getSharedMemorySize());
773  // TODO(Saman): change this further, normal path should not go through this
774  llvm::CallInst* result_buffer =
775  CallInst::Create(func_init_shared_mem,
776  std::vector<llvm::Value*>{col_buffer, shared_mem_bytes_lv},
777  "result_buffer",
778  bb_entry);
779 
780  ICmpInst* enter_or_not =
781  new ICmpInst(*bb_entry, ICmpInst::ICMP_SLT, pos_start_i64, row_count, "");
782  BranchInst::Create(bb_preheader, bb_exit, enter_or_not, bb_entry);
783 
784  // Block .loop.preheader
785  CastInst* pos_step_i64 = new SExtInst(pos_step, i64_type, "", bb_preheader);
786  BranchInst::Create(bb_forbody, bb_preheader);
787 
788  // Block .forbody
789  Argument* pos_pre = new Argument(i64_type);
790  PHINode* pos = PHINode::Create(i64_type, check_scan_limit ? 3 : 2, "pos", bb_forbody);
791 
792  std::vector<Value*> row_process_params;
793  row_process_params.push_back(result_buffer);
794  row_process_params.push_back(varlen_output_buffer);
795  row_process_params.push_back(crt_matched_ptr);
796  row_process_params.push_back(total_matched);
797  row_process_params.push_back(old_total_matched_ptr);
798  row_process_params.push_back(max_matched_ptr);
799  row_process_params.push_back(agg_init_val);
800  row_process_params.push_back(pos);
801  row_process_params.push_back(frag_row_off_ptr);
802  row_process_params.push_back(row_count_ptr);
803  if (hoist_literals) {
804  CHECK(literals);
805  row_process_params.push_back(literals);
806  }
807  if (check_scan_limit) {
808  new StoreInst(ConstantInt::get(IntegerType::get(mod->getContext(), 32), 0),
809  crt_matched_ptr,
810  bb_forbody);
811  }
812  CallInst* row_process =
813  CallInst::Create(func_row_process, row_process_params, "", bb_forbody);
814  row_process->setCallingConv(CallingConv::C);
815  row_process->setTailCall(true);
816  Attributes row_process_pal;
817  row_process->setAttributes(row_process_pal);
818 
819  // Forcing all threads within a warp to be synchronized (Compute >= 7.x)
820  if (query_mem_desc.isWarpSyncRequired(device_type)) {
821  auto func_sync_warp_protected = mod->getFunction("sync_warp_protected");
822  CHECK(func_sync_warp_protected);
823  CallInst::Create(func_sync_warp_protected,
824  std::vector<llvm::Value*>{pos, row_count},
825  "",
826  bb_forbody);
827  }
828 
829  BinaryOperator* pos_inc =
830  BinaryOperator::Create(Instruction::Add, pos, pos_step_i64, "", bb_forbody);
831  ICmpInst* loop_or_exit =
832  new ICmpInst(*bb_forbody, ICmpInst::ICMP_SLT, pos_inc, row_count, "");
833  if (check_scan_limit) {
834  auto crt_matched = new LoadInst(get_pointer_element_type(crt_matched_ptr),
835  crt_matched_ptr,
836  "crt_matched",
837  false,
838  bb_forbody);
839  auto filter_match = BasicBlock::Create(
840  mod->getContext(), "filter_match", query_func_ptr, bb_crit_edge);
841  llvm::Value* new_total_matched =
842  new LoadInst(get_pointer_element_type(old_total_matched_ptr),
843  old_total_matched_ptr,
844  "",
845  false,
846  filter_match);
847  new_total_matched =
848  BinaryOperator::CreateAdd(new_total_matched, crt_matched, "", filter_match);
849  CHECK(new_total_matched);
850  ICmpInst* limit_not_reached = new ICmpInst(*filter_match,
851  ICmpInst::ICMP_SLT,
852  new_total_matched,
853  max_matched,
854  "limit_not_reached");
855  BranchInst::Create(
856  bb_forbody,
857  bb_crit_edge,
858  BinaryOperator::Create(
859  BinaryOperator::And, loop_or_exit, limit_not_reached, "", filter_match),
860  filter_match);
861  auto filter_nomatch = BasicBlock::Create(
862  mod->getContext(), "filter_nomatch", query_func_ptr, bb_crit_edge);
863  BranchInst::Create(bb_forbody, bb_crit_edge, loop_or_exit, filter_nomatch);
864  ICmpInst* crt_matched_nz = new ICmpInst(
865  *bb_forbody, ICmpInst::ICMP_NE, crt_matched, ConstantInt::get(i32_type, 0), "");
866  BranchInst::Create(filter_match, filter_nomatch, crt_matched_nz, bb_forbody);
867  pos->addIncoming(pos_start_i64, bb_preheader);
868  pos->addIncoming(pos_pre, filter_match);
869  pos->addIncoming(pos_pre, filter_nomatch);
870  } else {
871  pos->addIncoming(pos_start_i64, bb_preheader);
872  pos->addIncoming(pos_pre, bb_forbody);
873  BranchInst::Create(bb_forbody, bb_crit_edge, loop_or_exit, bb_forbody);
874  }
875 
876  // Block ._crit_edge
877  BranchInst::Create(bb_exit, bb_crit_edge);
878 
879  // Block .exit
880  CallInst::Create(func_write_back,
881  std::vector<Value*>{col_buffer, result_buffer, shared_mem_bytes_lv},
882  "",
883  bb_exit);
884 
885  ReturnInst::Create(mod->getContext(), bb_exit);
886 
887  // Resolve Forward References
888  pos_pre->replaceAllUsesWith(pos_inc);
889  delete pos_pre;
890 
891  if (verifyFunction(*query_func_ptr, &llvm::errs())) {
892  LOG(FATAL) << "Generated invalid code. ";
893  }
894 
895  return {query_func_ptr, row_process};
896 }
#define LOG(tag)
Definition: Logger.h:205
size_t getSharedMemorySize() const
#define LLVM_ALIGN(alignment)
llvm::Function * group_buff_idx(llvm::Module *mod)
bool isWarpSyncRequired(const ExecutorDeviceType) const
#define CHECK(condition)
Definition: Logger.h:211
llvm::Type * get_pointer_element_type(llvm::Value *value)
llvm::Function * row_process(llvm::Module *mod, const size_t aggr_col_count, const bool hoist_literals)

+ Here is the call graph for this function:

std::tuple<llvm::Function*, llvm::CallInst*> query_template ( llvm::Module *  module,
const size_t  aggr_col_count,
const bool  hoist_literals,
const bool  is_estimate_query,
const GpuSharedMemoryContext gpu_smem_context 
)

Definition at line 898 of file QueryTemplateGenerator.cpp.

903  {
904  return query_template_impl<llvm::AttributeList>(
905  module, aggr_col_count, hoist_literals, is_estimate_query, gpu_smem_context);
906 }
template<class Attributes >
std::tuple<llvm::Function*, llvm::CallInst*> query_template_impl ( llvm::Module *  mod,
const size_t  aggr_col_count,
const bool  hoist_literals,
const bool  is_estimate_query,
const GpuSharedMemoryContext gpu_smem_context 
)

If GPU shared memory optimization is disabled, for each aggregate target, threads copy back their aggregate results (stored in registers) back into memory. This process is performed per processed fragment. In the host the final results are reduced (per target, for all threads and all fragments).

If GPU Shared memory optimization is enabled, we properly (atomically) aggregate all thread's results into memory, which makes the final reduction on host much cheaper. Here, we call a noop dummy write back function which will be properly replaced at runtime depending on the target expressions.

Definition at line 196 of file QueryTemplateGenerator.cpp.

References CHECK, logger::FATAL, anonymous_namespace{QueryTemplateGenerator.cpp}::get_pointer_element_type(), anonymous_namespace{QueryTemplateGenerator.cpp}::group_buff_idx(), i, GpuSharedMemoryContext::isSharedMemoryUsed(), LLVM_ALIGN, LOG, anonymous_namespace{QueryTemplateGenerator.cpp}::pos_start(), anonymous_namespace{QueryTemplateGenerator.cpp}::pos_step(), run_benchmark_import::result, anonymous_namespace{QueryTemplateGenerator.cpp}::row_process(), and to_string().

201  {
202  using namespace llvm;
203 
204  auto func_pos_start = pos_start<Attributes>(mod);
205  CHECK(func_pos_start);
206  auto func_pos_step = pos_step<Attributes>(mod);
207  CHECK(func_pos_step);
208  auto func_group_buff_idx = group_buff_idx<Attributes>(mod);
209  CHECK(func_group_buff_idx);
210  auto func_row_process = row_process<Attributes>(
211  mod, is_estimate_query ? 1 : aggr_col_count, hoist_literals);
212  CHECK(func_row_process);
213 
214  auto i8_type = IntegerType::get(mod->getContext(), 8);
215  auto i32_type = IntegerType::get(mod->getContext(), 32);
216  auto i64_type = IntegerType::get(mod->getContext(), 64);
217  auto pi8_type = PointerType::get(i8_type, 0);
218  auto ppi8_type = PointerType::get(pi8_type, 0);
219  auto pi32_type = PointerType::get(i32_type, 0);
220  auto pi64_type = PointerType::get(i64_type, 0);
221  auto ppi64_type = PointerType::get(pi64_type, 0);
222 
223  std::vector<Type*> query_args;
224  query_args.push_back(ppi8_type);
225  if (hoist_literals) {
226  query_args.push_back(pi8_type);
227  }
228  query_args.push_back(pi64_type);
229  query_args.push_back(pi64_type);
230  query_args.push_back(pi32_type);
231 
232  query_args.push_back(pi64_type);
233  query_args.push_back(ppi64_type);
234  query_args.push_back(i32_type);
235  query_args.push_back(pi64_type);
236  query_args.push_back(pi32_type);
237  query_args.push_back(pi32_type);
238 
239  FunctionType* query_func_type = FunctionType::get(
240  /*Result=*/Type::getVoidTy(mod->getContext()),
241  /*Params=*/query_args,
242  /*isVarArg=*/false);
243 
244  std::string query_template_name{"query_template"};
245  auto query_func_ptr = mod->getFunction(query_template_name);
246  CHECK(!query_func_ptr);
247 
248  query_func_ptr = Function::Create(
249  /*Type=*/query_func_type,
250  /*Linkage=*/GlobalValue::ExternalLinkage,
251  /*Name=*/query_template_name,
252  mod);
253  query_func_ptr->setCallingConv(CallingConv::C);
254 
255  Attributes query_func_pal;
256  {
257  SmallVector<Attributes, 4> Attrs;
258  Attributes PAS;
259  {
260  AttrBuilder B;
261  B.addAttribute(Attribute::NoCapture);
262  PAS = Attributes::get(mod->getContext(), 1U, B);
263  }
264 
265  Attrs.push_back(PAS);
266  {
267  AttrBuilder B;
268  B.addAttribute(Attribute::NoCapture);
269  PAS = Attributes::get(mod->getContext(), 2U, B);
270  }
271 
272  Attrs.push_back(PAS);
273 
274  {
275  AttrBuilder B;
276  B.addAttribute(Attribute::NoCapture);
277  Attrs.push_back(Attributes::get(mod->getContext(), 3U, B));
278  }
279 
280  {
281  AttrBuilder B;
282  B.addAttribute(Attribute::NoCapture);
283  Attrs.push_back(Attributes::get(mod->getContext(), 4U, B));
284  }
285 
286  Attrs.push_back(PAS);
287 
288  query_func_pal = Attributes::get(mod->getContext(), Attrs);
289  }
290  query_func_ptr->setAttributes(query_func_pal);
291 
292  Function::arg_iterator query_arg_it = query_func_ptr->arg_begin();
293  Value* byte_stream = &*query_arg_it;
294  byte_stream->setName("byte_stream");
295  Value* literals{nullptr};
296  if (hoist_literals) {
297  literals = &*(++query_arg_it);
298  literals->setName("literals");
299  }
300  Value* row_count_ptr = &*(++query_arg_it);
301  row_count_ptr->setName("row_count_ptr");
302  Value* frag_row_off_ptr = &*(++query_arg_it);
303  frag_row_off_ptr->setName("frag_row_off_ptr");
304  Value* max_matched_ptr = &*(++query_arg_it);
305  max_matched_ptr->setName("max_matched_ptr");
306  Value* agg_init_val = &*(++query_arg_it);
307  agg_init_val->setName("agg_init_val");
308  Value* out = &*(++query_arg_it);
309  out->setName("out");
310  Value* frag_idx = &*(++query_arg_it);
311  frag_idx->setName("frag_idx");
312  Value* join_hash_tables = &*(++query_arg_it);
313  join_hash_tables->setName("join_hash_tables");
314  Value* total_matched = &*(++query_arg_it);
315  total_matched->setName("total_matched");
316  Value* error_code = &*(++query_arg_it);
317  error_code->setName("error_code");
318 
319  auto bb_entry = BasicBlock::Create(mod->getContext(), ".entry", query_func_ptr, 0);
320  auto bb_preheader =
321  BasicBlock::Create(mod->getContext(), ".loop.preheader", query_func_ptr, 0);
322  auto bb_forbody = BasicBlock::Create(mod->getContext(), ".for.body", query_func_ptr, 0);
323  auto bb_crit_edge =
324  BasicBlock::Create(mod->getContext(), "._crit_edge", query_func_ptr, 0);
325  auto bb_exit = BasicBlock::Create(mod->getContext(), ".exit", query_func_ptr, 0);
326 
327  // Block (.entry)
328  std::vector<Value*> result_ptr_vec;
329  llvm::CallInst* smem_output_buffer{nullptr};
330  if (!is_estimate_query) {
331  for (size_t i = 0; i < aggr_col_count; ++i) {
332  auto result_ptr = new AllocaInst(i64_type, 0, "result", bb_entry);
333  result_ptr->setAlignment(LLVM_ALIGN(8));
334  result_ptr_vec.push_back(result_ptr);
335  }
336  if (gpu_smem_context.isSharedMemoryUsed()) {
337  auto init_smem_func = mod->getFunction("init_shared_mem");
338  CHECK(init_smem_func);
339  // only one slot per aggregate column is needed, and so we can initialize shared
340  // memory buffer for intermediate results to be exactly like the agg_init_val array
341  smem_output_buffer = CallInst::Create(
342  init_smem_func,
343  std::vector<llvm::Value*>{
344  agg_init_val,
345  llvm::ConstantInt::get(i32_type, aggr_col_count * sizeof(int64_t))},
346  "smem_buffer",
347  bb_entry);
348  }
349  }
350 
351  LoadInst* row_count = new LoadInst(get_pointer_element_type(row_count_ptr),
352  row_count_ptr,
353  "row_count",
354  false,
355  bb_entry);
356  row_count->setAlignment(LLVM_ALIGN(8));
357  row_count->setName("row_count");
358  std::vector<Value*> agg_init_val_vec;
359  if (!is_estimate_query) {
360  for (size_t i = 0; i < aggr_col_count; ++i) {
361  auto idx_lv = ConstantInt::get(i32_type, i);
362  auto agg_init_gep =
363  GetElementPtrInst::CreateInBounds(agg_init_val, idx_lv, "", bb_entry);
364  auto agg_init_val = new LoadInst(
365  get_pointer_element_type(agg_init_gep), agg_init_gep, "", false, bb_entry);
366  agg_init_val->setAlignment(LLVM_ALIGN(8));
367  agg_init_val_vec.push_back(agg_init_val);
368  auto init_val_st = new StoreInst(agg_init_val, result_ptr_vec[i], false, bb_entry);
369  init_val_st->setAlignment(LLVM_ALIGN(8));
370  }
371  }
372 
373  CallInst* pos_start = CallInst::Create(func_pos_start, "pos_start", bb_entry);
374  pos_start->setCallingConv(CallingConv::C);
375  pos_start->setTailCall(true);
376  Attributes pos_start_pal;
377  pos_start->setAttributes(pos_start_pal);
378 
379  CallInst* pos_step = CallInst::Create(func_pos_step, "pos_step", bb_entry);
380  pos_step->setCallingConv(CallingConv::C);
381  pos_step->setTailCall(true);
382  Attributes pos_step_pal;
383  pos_step->setAttributes(pos_step_pal);
384 
385  CallInst* group_buff_idx = nullptr;
386  if (!is_estimate_query) {
387  group_buff_idx = CallInst::Create(func_group_buff_idx, "group_buff_idx", bb_entry);
388  group_buff_idx->setCallingConv(CallingConv::C);
389  group_buff_idx->setTailCall(true);
390  Attributes group_buff_idx_pal;
391  group_buff_idx->setAttributes(group_buff_idx_pal);
392  }
393 
394  CastInst* pos_start_i64 = new SExtInst(pos_start, i64_type, "", bb_entry);
395  ICmpInst* enter_or_not =
396  new ICmpInst(*bb_entry, ICmpInst::ICMP_SLT, pos_start_i64, row_count, "");
397  BranchInst::Create(bb_preheader, bb_exit, enter_or_not, bb_entry);
398 
399  // Block .loop.preheader
400  CastInst* pos_step_i64 = new SExtInst(pos_step, i64_type, "", bb_preheader);
401  BranchInst::Create(bb_forbody, bb_preheader);
402 
403  // Block .forbody
404  Argument* pos_inc_pre = new Argument(i64_type);
405  PHINode* pos = PHINode::Create(i64_type, 2, "pos", bb_forbody);
406  pos->addIncoming(pos_start_i64, bb_preheader);
407  pos->addIncoming(pos_inc_pre, bb_forbody);
408 
409  std::vector<Value*> row_process_params;
410  row_process_params.insert(
411  row_process_params.end(), result_ptr_vec.begin(), result_ptr_vec.end());
412  if (is_estimate_query) {
413  row_process_params.push_back(
414  new LoadInst(get_pointer_element_type(out), out, "", false, bb_forbody));
415  }
416  row_process_params.push_back(agg_init_val);
417  row_process_params.push_back(pos);
418  row_process_params.push_back(frag_row_off_ptr);
419  row_process_params.push_back(row_count_ptr);
420  if (hoist_literals) {
421  CHECK(literals);
422  row_process_params.push_back(literals);
423  }
424  CallInst* row_process =
425  CallInst::Create(func_row_process, row_process_params, "", bb_forbody);
426  row_process->setCallingConv(CallingConv::C);
427  row_process->setTailCall(false);
428  Attributes row_process_pal;
429  row_process->setAttributes(row_process_pal);
430 
431  BinaryOperator* pos_inc =
432  BinaryOperator::CreateNSW(Instruction::Add, pos, pos_step_i64, "", bb_forbody);
433  ICmpInst* loop_or_exit =
434  new ICmpInst(*bb_forbody, ICmpInst::ICMP_SLT, pos_inc, row_count, "");
435  BranchInst::Create(bb_forbody, bb_crit_edge, loop_or_exit, bb_forbody);
436 
437  // Block ._crit_edge
438  std::vector<Instruction*> result_vec_pre;
439  if (!is_estimate_query) {
440  for (size_t i = 0; i < aggr_col_count; ++i) {
441  auto result = new LoadInst(get_pointer_element_type(result_ptr_vec[i]),
442  result_ptr_vec[i],
443  ".pre.result",
444  false,
445  bb_crit_edge);
446  result->setAlignment(LLVM_ALIGN(8));
447  result_vec_pre.push_back(result);
448  }
449  }
450 
451  BranchInst::Create(bb_exit, bb_crit_edge);
452 
453  // Block .exit
465  if (!is_estimate_query) {
466  std::vector<PHINode*> result_vec;
467  for (int64_t i = aggr_col_count - 1; i >= 0; --i) {
468  auto result =
469  PHINode::Create(IntegerType::get(mod->getContext(), 64), 2, "", bb_exit);
470  result->addIncoming(result_vec_pre[i], bb_crit_edge);
471  result->addIncoming(agg_init_val_vec[i], bb_entry);
472  result_vec.insert(result_vec.begin(), result);
473  }
474 
475  for (size_t i = 0; i < aggr_col_count; ++i) {
476  auto col_idx = ConstantInt::get(i32_type, i);
477  if (gpu_smem_context.isSharedMemoryUsed()) {
478  auto target_addr =
479  GetElementPtrInst::CreateInBounds(smem_output_buffer, col_idx, "", bb_exit);
480  // TODO: generalize this once we want to support other types of aggregate
481  // functions besides COUNT.
482  auto agg_func = mod->getFunction("agg_sum_shared");
483  CHECK(agg_func);
484  CallInst::Create(
485  agg_func, std::vector<llvm::Value*>{target_addr, result_vec[i]}, "", bb_exit);
486  } else {
487  auto out_gep = GetElementPtrInst::CreateInBounds(out, col_idx, "", bb_exit);
488  auto col_buffer =
489  new LoadInst(get_pointer_element_type(out_gep), out_gep, "", false, bb_exit);
490  col_buffer->setAlignment(LLVM_ALIGN(8));
491  auto slot_idx = BinaryOperator::CreateAdd(
493  BinaryOperator::CreateMul(frag_idx, pos_step, "", bb_exit),
494  "",
495  bb_exit);
496  auto target_addr =
497  GetElementPtrInst::CreateInBounds(col_buffer, slot_idx, "", bb_exit);
498  StoreInst* result_st = new StoreInst(result_vec[i], target_addr, false, bb_exit);
499  result_st->setAlignment(LLVM_ALIGN(8));
500  }
501  }
502  if (gpu_smem_context.isSharedMemoryUsed()) {
503  // final reduction of results from shared memory buffer back into global memory.
504  auto sync_thread_func = mod->getFunction("sync_threadblock");
505  CHECK(sync_thread_func);
506  CallInst::Create(sync_thread_func, std::vector<llvm::Value*>{}, "", bb_exit);
507  auto reduce_smem_to_gmem_func = mod->getFunction("write_back_non_grouped_agg");
508  CHECK(reduce_smem_to_gmem_func);
509  // each thread reduce the aggregate target corresponding to its own thread ID.
510  // If there are more targets than threads we do not currently use shared memory
511  // optimization. This can be relaxed if necessary
512  for (size_t i = 0; i < aggr_col_count; i++) {
513  auto out_gep = GetElementPtrInst::CreateInBounds(
514  out, ConstantInt::get(i32_type, i), "", bb_exit);
515  auto gmem_output_buffer = new LoadInst(get_pointer_element_type(out_gep),
516  out_gep,
517  "gmem_output_buffer_" + std::to_string(i),
518  false,
519  bb_exit);
520  CallInst::Create(
521  reduce_smem_to_gmem_func,
522  std::vector<llvm::Value*>{
523  smem_output_buffer, gmem_output_buffer, ConstantInt::get(i32_type, i)},
524  "",
525  bb_exit);
526  }
527  }
528  }
529 
530  ReturnInst::Create(mod->getContext(), bb_exit);
531 
532  // Resolve Forward References
533  pos_inc_pre->replaceAllUsesWith(pos_inc);
534  delete pos_inc_pre;
535 
536  if (verifyFunction(*query_func_ptr)) {
537  LOG(FATAL) << "Generated invalid code. ";
538  }
539 
540  return {query_func_ptr, row_process};
541 }
#define LOG(tag)
Definition: Logger.h:205
#define LLVM_ALIGN(alignment)
std::string to_string(char const *&&v)
llvm::Function * group_buff_idx(llvm::Module *mod)
#define CHECK(condition)
Definition: Logger.h:211
llvm::Type * get_pointer_element_type(llvm::Value *value)
llvm::Function * row_process(llvm::Module *mod, const size_t aggr_col_count, const bool hoist_literals)

+ Here is the call graph for this function: