#if LLVM_VERSION_MAJOR < 9
static_assert(false, "LLVM Version >= 9 is required.");
#endif
#include <llvm/Analysis/ScopedNoAliasAA.h>
#include <llvm/Analysis/TypeBasedAliasAnalysis.h>
#include <llvm/Bitcode/BitcodeReader.h>
#include <llvm/Bitcode/BitcodeWriter.h>
#include <llvm/ExecutionEngine/MCJIT.h>
#include <llvm/IR/Attributes.h>
#include <llvm/IR/GlobalValue.h>
#include <llvm/IR/InstIterator.h>
#include <llvm/IR/IntrinsicInst.h>
#include <llvm/IR/Intrinsics.h>
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/IR/Verifier.h>
#include <llvm/IRReader/IRReader.h>
#if 14 <= LLVM_VERSION_MAJOR
#include <llvm/MC/TargetRegistry.h>
#else
#include <llvm/Support/TargetRegistry.h>
#endif
#include <llvm/Support/Casting.h>
#include <llvm/Support/FileSystem.h>
#include <llvm/Support/FormattedStream.h>
#include <llvm/Support/MemoryBuffer.h>
#include <llvm/Support/SourceMgr.h>
#include <llvm/Support/TargetSelect.h>
#include <llvm/Support/raw_os_ostream.h>
#include <llvm/Support/raw_ostream.h>
#include <llvm/Transforms/IPO.h>
#include <llvm/Transforms/IPO/AlwaysInliner.h>
#include <llvm/Transforms/IPO/InferFunctionAttrs.h>
#include <llvm/Transforms/IPO/PassManagerBuilder.h>
#include <llvm/Transforms/InstCombine/InstCombine.h>
#include <llvm/Transforms/Instrumentation.h>
#include <llvm/Transforms/Scalar.h>
#include <llvm/Transforms/Scalar/GVN.h>
#include <llvm/Transforms/Scalar/InstSimplifyPass.h>
#include <llvm/Transforms/Utils.h>
#include <llvm/Transforms/Utils/BasicBlockUtils.h>
#include <llvm/Transforms/Utils/Cloning.h>
#if LLVM_VERSION_MAJOR >= 11
#include <llvm/Support/Host.h>
#endif
#include <llvm/Support/DynamicLibrary.h>

extern std::unique_ptr<std::string> g_libgeos_so_filename;

static llvm::sys::DynamicLibrary geos_dynamic_library;
static std::mutex geos_init_mutex;
void load_geos_dynamic_library() {
  std::lock_guard<std::mutex> guard(geos_init_mutex);

  if (!geos_dynamic_library.isValid()) {
    if (!g_libgeos_so_filename || g_libgeos_so_filename->empty()) {
      LOG(WARNING) << "Misconfigured GEOS library file name, trying 'libgeos_c.so'";
      g_libgeos_so_filename.reset(new std::string("libgeos_c.so"));
    }
    auto filename = *g_libgeos_so_filename;
    std::string error_message;
    geos_dynamic_library =
        llvm::sys::DynamicLibrary::getPermanentLibrary(filename.c_str(), &error_message);
    if (!geos_dynamic_library.isValid()) {
      std::string exception_message = "Failed to load GEOS library: " + error_message;
      throw std::runtime_error(exception_message);
    }
  }
}
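// Minimal usage sketch (hypothetical caller): the loader is idempotent and
// thread-safe, so it can simply be invoked right before linking the GEOS
// runtime module into the generated code.
//   if (cgen_state_->needs_geos_) {
//     load_geos_dynamic_library();  // throws std::runtime_error on failure
//   }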
void throw_parseIR_error(const llvm::SMDiagnostic& parse_error,
                         std::string src = "",
                         const bool is_gpu = false) {
  std::string excname = (is_gpu ? "NVVM IR ParseError: " : "LLVM IR ParseError: ");
  llvm::raw_string_ostream ss(excname);
  parse_error.print(src.c_str(), ss, false, false);
  throw ParseIRError(ss.str());
}
#define SHOW_DEFINED(MODULE)                                         \
  {                                                                  \
    std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
    ::show_defined(MODULE);                                          \
  }

#define SHOW_FUNCTIONS(MODULE)                                       \
  {                                                                  \
    std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
    ::show_functions(MODULE);                                        \
  }
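// Debug-only helpers: dropping e.g. SHOW_DEFINED(llvm_module); into a codegen
// step prints the calling function, source line, and every function the module
// currently defines, which is handy when chasing lost symbols between passes.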
template <typename T = void>
void show_defined(llvm::Module& llvm_module) {
  std::cout << "defines: ";
  for (auto& f : llvm_module.getFunctionList()) {
    if (!f.isDeclaration()) {
      std::cout << f.getName().str() << ", ";
    }
  }
  std::cout << std::endl;
}
template <typename T = void>
void show_defined(llvm::Module* llvm_module) {
  if (llvm_module == nullptr) {
    std::cout << "is null" << std::endl;
  } else {
    show_defined(*llvm_module);
  }
}
template <typename T = void>
void show_defined(std::unique_ptr<llvm::Module>& llvm_module) {
  show_defined(llvm_module.get());
}
template <typename T = void>
void scan_function_calls(llvm::Function& F,
                         std::unordered_set<std::string>& defined,
                         std::unordered_set<std::string>& undefined,
                         const std::unordered_set<std::string>& ignored) {
  for (llvm::inst_iterator I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
    if (auto* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
      auto* F2 = CI->getCalledFunction();
      if (F2 != nullptr) {
        auto F2name = F2->getName().str();
        if (F2->isDeclaration()) {
          if (F2name.rfind("__", 0) != 0        // assume a double-underscore prefix means defined elsewhere
              && F2name.rfind("llvm.", 0) != 0  // LLVM intrinsics are defined
              && ignored.find(F2name) == ignored.end()) {
            undefined.emplace(F2name);
          }
        } else {
          if (defined.find(F2name) == defined.end()) {
            defined.emplace(F2name);
            scan_function_calls<T>(*F2, defined, undefined, ignored);
          }
        }
      }
    }
  }
}
template <typename T = void>
void scan_function_calls(llvm::Module& llvm_module,
                         std::unordered_set<std::string>& defined,
                         std::unordered_set<std::string>& undefined,
                         const std::unordered_set<std::string>& ignored) {
  for (auto& F : llvm_module) {
    if (!F.isDeclaration()) {
      scan_function_calls(F, defined, undefined, ignored);
    }
  }
}
template <typename T = void>
std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>>
scan_function_calls(llvm::Module& llvm_module,
                    const std::unordered_set<std::string>& ignored = {}) {
  std::unordered_set<std::string> defined, undefined;
  scan_function_calls(llvm_module, defined, undefined, ignored);
  return std::make_tuple(defined, undefined);
}
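// Usage sketch (hypothetical caller): partition a module's call graph into the
// functions it defines and the external symbols it still needs.
//   auto [defined, undefined] = scan_function_calls(*llvm_module);
//   for (const auto& name : undefined) {
//     VLOG(1) << "unresolved call target: " << name;
//   }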
#if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
void eliminate_dead_self_recursive_funcs(
    llvm::Module& M,
    const std::unordered_set<llvm::Function*>& live_funcs) {
  std::vector<llvm::Function*> dead_funcs;
  for (auto& F : M) {
    bool bAlive = false;
    if (live_funcs.count(&F)) {
      continue;
    }
    for (auto U : F.users()) {
      auto* C = llvm::dyn_cast<const llvm::CallInst>(U);
      if (!C || C->getParent()->getParent() != &F) {
        bAlive = true;
        break;
      }
    }
    if (!bAlive) {
      dead_funcs.push_back(&F);
    }
  }
  for (auto pFn : dead_funcs) {
    pFn->eraseFromParent();
  }
}
#endif
bool check_module_requires_libdevice(llvm::Module* llvm_module) {
  for (llvm::Function& F : *llvm_module) {
    if (F.hasName() && F.getName().startswith("__nv_")) {
      LOG(INFO) << "Module requires linking with libdevice: "
                << std::string(F.getName());
      return true;
    }
  }
  LOG(DEBUG1) << "Module does not require linking against libdevice.";
  return false;
}
void add_intrinsics_to_module(llvm::Module* llvm_module) {
  for (llvm::Function& F : *llvm_module) {
    for (llvm::Instruction& I : instructions(F)) {
      if (llvm::IntrinsicInst* ii = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
        if (llvm::Intrinsic::isOverloaded(ii->getIntrinsicID())) {
          llvm::Type* Tys[] = {ii->getFunctionType()->getReturnType()};
          llvm::Function& decl_fn =
              *llvm::Intrinsic::getDeclaration(llvm_module, ii->getIntrinsicID(), Tys);
          ii->setCalledFunction(&decl_fn);
        } else {
          // Non-overloaded intrinsic: just make sure a declaration exists.
          llvm::Intrinsic::getDeclaration(llvm_module, ii->getIntrinsicID());
        }
      }
    }
  }
}
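// Overloaded intrinsics carry a type suffix in their mangled name (e.g.
// llvm.ceil becomes llvm.ceil.f64 for double operands). Re-resolving the
// declaration against the call's return type keeps the module self-contained
// after libdevice code has been linked in.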
void optimize_ir(llvm::Function* query_func,
                 llvm::Module* llvm_module,
                 llvm::legacy::PassManager& pass_manager,
                 const std::unordered_set<llvm::Function*>& live_funcs,
                 const bool is_gpu_smem_used,
                 const CompilationOptions& co) {
  pass_manager.add(llvm::createVerifierPass());
  pass_manager.add(llvm::createAlwaysInlinerLegacyPass());
  pass_manager.add(llvm::createSROAPass());
  pass_manager.add(llvm::createEarlyCSEPass(/*UseMemorySSA=*/true));

  if (!is_gpu_smem_used) {
    // Jump threading can reorder instructions in a way the GPU shared-memory
    // code path does not tolerate, so only add it otherwise.
    pass_manager.add(llvm::createJumpThreadingPass());
  }
  pass_manager.add(llvm::createCFGSimplificationPass());
  pass_manager.add(llvm::createNewGVNPass());
  pass_manager.add(llvm::createDeadStoreEliminationPass());
  pass_manager.add(llvm::createLICMPass());
  pass_manager.add(llvm::createInstructionCombiningPass());
  pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
  pass_manager.add(llvm::createGlobalOptimizerPass());
  pass_manager.add(llvm::createCFGSimplificationPass());

  pass_manager.run(*llvm_module);
}
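// This is the legacy (pre-new-pass-manager) pipeline; the ordering roughly
// mirrors a trimmed -O2: verify, inline, scalarize allocas (SROA), CSE, then
// the cleanup passes, finishing with a second CFG simplification.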
ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine)
    : execution_engine_(execution_engine) {}

ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine,
                                               const CompilationOptions& co)
    : execution_engine_(execution_engine) {
  if (execution_engine_) {
    if (co.register_intel_jit_listener) {
#ifdef ENABLE_INTEL_JIT_LISTENER
      intel_jit_listener_.reset(llvm::JITEventListener::createIntelJITEventListener());
      CHECK(intel_jit_listener_);
      execution_engine_->RegisterJITEventListener(intel_jit_listener_.get());
      LOG(INFO) << "Registered IntelJITEventListener";
#else
      LOG(WARNING) << "This build is not Intel JIT Listener enabled. Ignoring Intel JIT "
                      "listener configuration parameter.";
#endif  // ENABLE_INTEL_JIT_LISTENER
    }
  }
}
ExecutionEngineWrapper& ExecutionEngineWrapper::operator=(
    llvm::ExecutionEngine* execution_engine) {
  execution_engine_.reset(execution_engine);
  intel_jit_listener_ = nullptr;
  return *this;
}

void verify_function_ir(const llvm::Function* func) {
  std::stringstream err_ss;
  llvm::raw_os_ostream err_os(err_ss);
  err_os << "\n-----\n";
  if (llvm::verifyFunction(*func, &err_os)) {
    err_os << "\n-----\n";
    func->print(err_os, nullptr);
    err_os << "\n-----\n";
    LOG(FATAL) << err_ss.str();
  }
}
std::string assembly_for_cpu(ExecutionEngineWrapper& execution_engine,
                             llvm::Module* llvm_module) {
  llvm::legacy::PassManager pass_manager;
  auto cpu_target_machine = execution_engine->getTargetMachine();
  CHECK(cpu_target_machine);
  llvm::SmallString<256> code_str;
  llvm::raw_svector_ostream os(code_str);
#if LLVM_VERSION_MAJOR >= 10
  cpu_target_machine->addPassesToEmitFile(
      pass_manager, os, nullptr, llvm::CGFT_AssemblyFile);
#else
  cpu_target_machine->addPassesToEmitFile(
      pass_manager, os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
#endif
  pass_manager.run(*llvm_module);
  return "Assembly for the CPU:\n" + std::string(code_str.str()) + "\nEnd of assembly";
}
ExecutionEngineWrapper create_execution_engine(llvm::Module* llvm_module,
                                               llvm::EngineBuilder& eb,
                                               const CompilationOptions& co) {
  ExecutionEngineWrapper execution_engine(eb.create(), co);
  CHECK(execution_engine.get());
  llvm_module->setDataLayout(execution_engine->getDataLayout());
  execution_engine->finalizeObject();
  return execution_engine;
}
ExecutionEngineWrapper CodeGenerator::generateNativeCPUCode(
    llvm::Function* func,
    const std::unordered_set<llvm::Function*>& live_funcs,
    const CompilationOptions& co) {
  llvm::Module* llvm_module = func->getParent();
  // Run the optimizations.
#ifndef WITH_JIT_DEBUG
  llvm::legacy::PassManager pass_manager;
  optimize_ir(
      func, llvm_module, pass_manager, live_funcs, /*is_gpu_smem_used=*/false, co);
#endif  // WITH_JIT_DEBUG

  auto init_err = llvm::InitializeNativeTarget();
  CHECK(!init_err);

  llvm::InitializeAllTargetMCs();
  llvm::InitializeNativeTargetAsmPrinter();
  llvm::InitializeNativeTargetAsmParser();

  std::string err_str;
  std::unique_ptr<llvm::Module> owner(llvm_module);

  llvm::EngineBuilder eb(std::move(owner));
  eb.setErrorStr(&err_str);
  eb.setEngineKind(llvm::EngineKind::JIT);
  llvm::TargetOptions to;
  to.EnableFastISel = true;
  eb.setTargetOptions(to);

  return create_execution_engine(llvm_module, eb, co);
}
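// Note the ownership handoff above: the EngineBuilder takes the module via
// unique_ptr, so `llvm_module` must not be destroyed or mutated independently
// once the execution engine has been created from it.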
std::shared_ptr<CpuCompilationContext> Executor::optimizeAndCodegenCPU(
    llvm::Function* query_func,
    llvm::Function* multifrag_query_func,
    const std::unordered_set<llvm::Function*>& live_funcs,
    const CompilationOptions& co) {
  CodeCacheKey key{serialize_llvm_object(query_func),
                   serialize_llvm_object(cgen_state_->row_func_)};

  llvm::Module* M = query_func->getParent();
  auto* flag = llvm::mdconst::extract_or_null<llvm::ConstantInt>(
      M->getModuleFlag("manage_memory_buffer"));
  if (flag and flag->getZExtValue() == 1 and M->getFunction("allocate_varlen_buffer") and
      M->getFunction("register_buffer_with_executor_rsm")) {
    LOG(INFO) << "including executor addr to cache key\n";
    key.push_back(std::to_string(reinterpret_cast<int64_t>(this)));
  }
  if (cgen_state_->filter_func_) {
    key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
  }
  for (const auto helper : cgen_state_->helper_functions_) {
    key.push_back(serialize_llvm_object(helper));
  }

  if (cgen_state_->needs_geos_) {
#ifdef ENABLE_GEOS
    auto llvm_module = multifrag_query_func->getParent();
    load_geos_dynamic_library();

    // Clone the GEOS runtime module, keeping only functions whose linkage is
    // usable from the generated code.
    auto rt_geos_module_copy = llvm::CloneModule(
        *get_geos_module(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
          auto func = llvm::dyn_cast<llvm::Function>(gv);
          if (!func) {
            return true;
          }
          switch (func->getLinkage()) {
            case llvm::GlobalValue::LinkageTypes::InternalLinkage:
            case llvm::GlobalValue::LinkageTypes::PrivateLinkage:
            case llvm::GlobalValue::LinkageTypes::ExternalLinkage:
            case llvm::GlobalValue::LinkageTypes::LinkOnceODRLinkage:
              return true;
            default:
              return false;
          }
        });
    CodeGenerator::link_udf_module(rt_geos_module_copy,
                                   *llvm_module,
                                   cgen_state_.get(),
                                   llvm::Linker::Flags::LinkOnlyNeeded);
#else
    throw std::runtime_error("GEOS is disabled in this build");
#endif
  }

  auto execution_engine =
      CodeGenerator::generateNativeCPUCode(query_func, live_funcs, co);
  auto cpu_compilation_context =
      std::make_shared<CpuCompilationContext>(std::move(execution_engine));
  cpu_compilation_context->setFunctionPointer(multifrag_query_func);
  return cpu_compilation_context;
}
void CodeGenerator::link_udf_module(const std::unique_ptr<llvm::Module>& udf_module,
                                    llvm::Module& llvm_module,
                                    CgenState* cgen_state,
                                    llvm::Linker::Flags flags) {
  // Refuse to silently overwrite a runtime function with a UDF of the same name.
  for (auto& f : *udf_module) {
    auto func = llvm_module.getFunction(f.getName());
    if (func != nullptr && !f.isDeclaration() && flags == llvm::Linker::Flags::None) {
      LOG(ERROR) << " Attempt to overwrite " << f.getName().str() << " in "
                 << llvm_module.getModuleIdentifier() << " from `"
                 << udf_module->getModuleIdentifier() << "`" << std::endl;
      throw std::runtime_error(
          "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
          "function ***");
    } else {
      VLOG(1) << " Adding " << f.getName().str() << " to "
              << llvm_module.getModuleIdentifier() << " from `"
              << udf_module->getModuleIdentifier() << "`" << std::endl;
    }
  }

  auto udf_module_copy = llvm::CloneModule(*udf_module, cgen_state->vmap_);

  udf_module_copy->setDataLayout(llvm_module.getDataLayout());
  udf_module_copy->setTargetTriple(llvm_module.getTargetTriple());

  // Initialize the linker with the target module.
  llvm::Linker ld(llvm_module);
  bool link_error = false;

  link_error = ld.linkInModule(std::move(udf_module_copy), flags);

  if (link_error) {
    throw std::runtime_error("link_udf_module: *** error linking module ***");
  }
}
std::string cpp_to_llvm_name(const std::string& s) {
  if (s == "int8_t") {
    return "i8";
  }
  if (s == "int16_t") {
    return "i16";
  }
  if (s == "int32_t") {
    return "i32";
  }
  if (s == "int64_t") {
    return "i64";
  }
  CHECK(s == "float" || s == "double");
  return s;
}
std::string gen_array_any_all_sigs() {
  std::string result;
  for (const std::string any_or_all : {"any", "all"}) {
    for (const std::string elem_type :
         {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
      for (const std::string needle_type :
           {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
        for (const std::string op_name : {"eq", "ne", "lt", "le", "gt", "ge"}) {
          result += ("declare i1 @array_" + any_or_all + "_" + op_name + "_" +
                     elem_type + "_" + needle_type + "(i8*, i64, " +
                     cpp_to_llvm_name(needle_type) + ", " +
                     cpp_to_llvm_name(elem_type) + ");\n");
        }
      }
    }
  }
  return result;
}
std::string gen_translate_null_key_sigs() {
  std::string result;
  for (const std::string key_type : {"int8_t", "int16_t", "int32_t", "int64_t"}) {
    const auto key_llvm_type = cpp_to_llvm_name(key_type);
    result += "declare i64 @translate_null_key_" + key_type + "(" + key_llvm_type +
              ", " + key_llvm_type + ", i64);\n";
  }
  return result;
}

const std::string cuda_rt_decls = R"(
declare void @llvm.dbg.declare(metadata, metadata, metadata)
declare void @llvm.dbg.value(metadata, metadata, metadata)
declare double @llvm.fmuladd.f64(double, double, double)
declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
declare i64 @get_thread_index();
declare i64 @get_block_index();
declare i32 @pos_start_impl(i32*);
declare i32 @group_buff_idx_impl();
declare i32 @pos_step_impl();
declare i8 @thread_warp_idx(i8);
declare i64* @init_shared_mem(i64*, i32);
declare i64* @init_shared_mem_nop(i64*, i32);
declare i64* @declare_dynamic_shared_memory();
declare void @write_back_nop(i64*, i64*, i32);
declare void @write_back_non_grouped_agg(i64*, i64*, i32);
declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8);
declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32);
declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32);
declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32);
declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32);
declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32);
declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32);
declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64);
declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64);
declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64);
declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64);
declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64);
declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double);
declare i64 @get_bucket_key_for_range_double(i8*, i64, double);
declare i32 @get_num_buckets_for_bounds(i8*, i32, double, double);
declare i64 @get_candidate_rows(i32*, i32, i8*, i32, double, double, i32, i64, i64*, i64, i64, i64);
declare i64 @agg_count_shared(i64*, i64);
declare i64 @agg_count_skip_val_shared(i64*, i64, i64);
declare i32 @agg_count_int32_shared(i32*, i32);
declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32);
declare i64 @agg_count_double_shared(i64*, double);
declare i64 @agg_count_double_skip_val_shared(i64*, double, double);
declare i32 @agg_count_float_shared(i32*, float);
declare i32 @agg_count_float_skip_val_shared(i32*, float, float);
declare i64 @agg_count_if_shared(i64*, i64);
declare i64 @agg_count_if_skip_val_shared(i64*, i64, i64);
declare i32 @agg_count_if_int32_shared(i32*, i32);
declare i32 @agg_count_if_int32_skip_val_shared(i32*, i32, i32);
declare i64 @agg_sum_shared(i64*, i64);
declare i64 @agg_sum_skip_val_shared(i64*, i64, i64);
declare i32 @agg_sum_int32_shared(i32*, i32);
declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_sum_double_shared(i64*, double);
declare void @agg_sum_double_skip_val_shared(i64*, double, double);
declare void @agg_sum_float_shared(i32*, float);
declare void @agg_sum_float_skip_val_shared(i32*, float, float);
declare i64 @agg_sum_if_shared(i64*, i64, i8);
declare i64 @agg_sum_if_skip_val_shared(i64*, i64, i64, i8);
declare i32 @agg_sum_if_int32_shared(i32*, i32, i8);
declare i32 @agg_sum_if_int32_skip_val_shared(i32*, i32, i32, i8);
declare void @agg_sum_if_double_shared(i64*, double, i8);
declare void @agg_sum_if_double_skip_val_shared(i64*, double, double, i8);
declare void @agg_sum_if_float_shared(i32*, float, i8);
declare void @agg_sum_if_float_skip_val_shared(i32*, float, float, i8);
declare void @agg_max_shared(i64*, i64);
declare void @agg_max_skip_val_shared(i64*, i64, i64);
declare void @agg_max_int32_shared(i32*, i32);
declare void @agg_max_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_max_int16_shared(i16*, i16);
declare void @agg_max_int16_skip_val_shared(i16*, i16, i16);
declare void @agg_max_int8_shared(i8*, i8);
declare void @agg_max_int8_skip_val_shared(i8*, i8, i8);
declare void @agg_max_double_shared(i64*, double);
declare void @agg_max_double_skip_val_shared(i64*, double, double);
declare void @agg_max_float_shared(i32*, float);
declare void @agg_max_float_skip_val_shared(i32*, float, float);
declare void @agg_min_shared(i64*, i64);
declare void @agg_min_skip_val_shared(i64*, i64, i64);
declare void @agg_min_int32_shared(i32*, i32);
declare void @agg_min_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_min_int16_shared(i16*, i16);
declare void @agg_min_int16_skip_val_shared(i16*, i16, i16);
declare void @agg_min_int8_shared(i8*, i8);
declare void @agg_min_int8_skip_val_shared(i8*, i8, i8);
declare void @agg_min_double_shared(i64*, double);
declare void @agg_min_double_skip_val_shared(i64*, double, double);
declare void @agg_min_float_shared(i32*, float);
declare void @agg_min_float_skip_val_shared(i32*, float, float);
declare void @agg_id_shared(i64*, i64);
declare i8* @agg_id_varlen_shared(i8*, i64, i8*, i64);
declare void @agg_id_int32_shared(i32*, i32);
declare void @agg_id_int16_shared(i16*, i16);
declare void @agg_id_int8_shared(i8*, i8);
declare void @agg_id_double_shared(i64*, double);
declare void @agg_id_double_shared_slow(i64*, double*);
declare void @agg_id_float_shared(i32*, float);
declare i32 @checked_single_agg_id_shared(i64*, i64, i64);
declare i32 @checked_single_agg_id_double_shared(i64*, double, double);
declare i32 @checked_single_agg_id_double_shared_slow(i64*, double*, double);
declare i32 @checked_single_agg_id_float_shared(i32*, float, float);
declare i1 @slotEmptyKeyCAS(i64*, i64, i64);
declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32);
declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16);
declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8);
declare i64 @datetrunc_century(i64);
declare i64 @datetrunc_day(i64);
declare i64 @datetrunc_decade(i64);
declare i64 @datetrunc_hour(i64);
declare i64 @datetrunc_millennium(i64);
declare i64 @datetrunc_minute(i64);
declare i64 @datetrunc_month(i64);
declare i64 @datetrunc_quarter(i64);
declare i64 @datetrunc_quarterday(i64);
declare i64 @datetrunc_week_monday(i64);
declare i64 @datetrunc_week_sunday(i64);
declare i64 @datetrunc_week_saturday(i64);
declare i64 @datetrunc_year(i64);
declare i64 @extract_epoch(i64);
declare i64 @extract_dateepoch(i64);
declare i64 @extract_quarterday(i64);
declare i64 @extract_hour(i64);
declare i64 @extract_minute(i64);
declare i64 @extract_second(i64);
declare i64 @extract_millisecond(i64);
declare i64 @extract_microsecond(i64);
declare i64 @extract_nanosecond(i64);
declare i64 @extract_dow(i64);
declare i64 @extract_isodow(i64);
declare i64 @extract_day(i64);
declare i64 @extract_week_monday(i64);
declare i64 @extract_week_sunday(i64);
declare i64 @extract_week_saturday(i64);
declare i64 @extract_day_of_year(i64);
declare i64 @extract_month(i64);
declare i64 @extract_quarter(i64);
declare i64 @extract_year(i64);
declare i64 @ExtractTimeFromHPTimestamp(i64,i64);
declare i64 @ExtractTimeFromHPTimestampNullable(i64,i64,i64);
declare i64 @ExtractTimeFromLPTimestamp(i64);
declare i64 @ExtractTimeFromLPTimestampNullable(i64,i64);
declare i64 @DateTruncateHighPrecisionToDate(i64, i64);
declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64);
declare i64 @DateDiff(i32, i64, i64);
declare i64 @DateDiffNullable(i32, i64, i64, i64);
declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i32);
declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i32, i64);
declare i64 @DateAdd(i32, i64, i64);
declare i64 @DateAddNullable(i32, i64, i64, i64);
declare i64 @DateAddHighPrecision(i32, i64, i64, i32);
declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i32, i64);
declare {i8*,i64} @string_decode(i8*, i64);
declare i32 @array_size(i8*, i64, i32);
declare i32 @array_size_nullable(i8*, i64, i32, i32);
declare i32 @array_size_1_nullable(i8*, i64, i32);
declare i32 @fast_fixlen_array_size(i8*, i32);
declare i1 @array_is_null(i8*, i64);
declare i1 @point_coord_array_is_null(i8*, i64);
declare i8* @array_buff(i8*, i64);
declare i8* @fast_fixlen_array_buff(i8*, i64);
declare i64 @determine_fixed_array_len(i8*, i64);
declare i8 @array_at_int8_t(i8*, i64, i32);
declare i16 @array_at_int16_t(i8*, i64, i32);
declare i32 @array_at_int32_t(i8*, i64, i32);
declare i64 @array_at_int64_t(i8*, i64, i32);
declare float @array_at_float(i8*, i64, i32);
declare double @array_at_double(i8*, i64, i32);
declare i8 @varlen_array_at_int8_t(i8*, i64, i32);
declare i16 @varlen_array_at_int16_t(i8*, i64, i32);
declare i32 @varlen_array_at_int32_t(i8*, i64, i32);
declare i64 @varlen_array_at_int64_t(i8*, i64, i32);
declare float @varlen_array_at_float(i8*, i64, i32);
declare double @varlen_array_at_double(i8*, i64, i32);
declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32);
declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32);
declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32);
declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32);
declare float @varlen_notnull_array_at_float(i8*, i64, i32);
declare double @varlen_notnull_array_at_double(i8*, i64, i32);
declare i8 @array_at_int8_t_checked(i8*, i64, i64, i8);
declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16);
declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32);
declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64);
declare float @array_at_float_checked(i8*, i64, i64, float);
declare double @array_at_double_checked(i8*, i64, i64, double);
declare i32 @char_length(i8*, i32);
declare i32 @char_length_nullable(i8*, i32, i32);
declare i32 @char_length_encoded(i8*, i32);
declare i32 @char_length_encoded_nullable(i8*, i32, i32);
declare i32 @key_for_string_encoded(i32);
declare i1 @sample_ratio(double, i64);
declare double @width_bucket(double, double, double, double, i32);
declare double @width_bucket_reverse(double, double, double, double, i32);
declare double @width_bucket_nullable(double, double, double, double, i32, double);
declare double @width_bucket_reversed_nullable(double, double, double, double, i32, double);
declare double @width_bucket_no_oob_check(double, double, double);
declare double @width_bucket_reverse_no_oob_check(double, double, double);
declare double @width_bucket_expr(double, i1, double, double, i32);
declare double @width_bucket_expr_nullable(double, i1, double, double, i32, double);
declare double @width_bucket_expr_no_oob_check(double, i1, double, double, i32);
declare i1 @string_like(i8*, i32, i8*, i32, i8);
declare i1 @string_ilike(i8*, i32, i8*, i32, i8);
declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8);
declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8);
declare i1 @string_like_simple(i8*, i32, i8*, i32);
declare i1 @string_ilike_simple(i8*, i32, i8*, i32);
declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8);
declare i1 @string_lt(i8*, i32, i8*, i32);
declare i1 @string_le(i8*, i32, i8*, i32);
declare i1 @string_gt(i8*, i32, i8*, i32);
declare i1 @string_ge(i8*, i32, i8*, i32);
declare i1 @string_eq(i8*, i32, i8*, i32);
declare i1 @string_ne(i8*, i32, i8*, i32);
declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8);
declare i1 @regexp_like(i8*, i32, i8*, i32, i8);
declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8);
declare void @linear_probabilistic_count(i8*, i32, i8*, i32);
declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, i64);
declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64);
declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64);
declare void @record_error_code(i32, i32*);
declare i32 @get_error_code(i32*);
declare i1 @dynamic_watchdog();
declare i1 @check_interrupt();
declare void @force_sync();
declare void @sync_warp();
declare void @sync_warp_protected(i64, i64);
declare void @sync_threadblock();
declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32);
declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64);
declare i64* @get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, float, float);
declare i64* @get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double);
declare double @decompress_x_coord_geoint(i32);
declare double @decompress_y_coord_geoint(i32);
declare i32 @compress_x_coord_geoint(double);
declare i32 @compress_y_coord_geoint(double);
declare i64 @fixed_width_date_encode(i64, i32, i64);
declare i64 @fixed_width_date_decode(i64, i32, i64);
)" + gen_array_any_all_sigs() + gen_translate_null_key_sigs();
bool check_any_operand_is_stacksave_intrinsic(llvm::Instruction& inst) {
  for (auto op_it = inst.op_begin(); op_it != inst.op_end(); op_it++) {
    if (const llvm::IntrinsicInst* inst2 =
            llvm::dyn_cast<llvm::IntrinsicInst>(*op_it)) {
      if (inst2->getIntrinsicID() == llvm::Intrinsic::stacksave) {
        return true;
      }
    }
  }
  return false;
}
std::string extension_function_decls(const std::unordered_set<std::string>& udf_decls) {
  const auto decls = ExtensionFunctionsWhitelist::getLLVMDeclarations(udf_decls);
  return boost::algorithm::join(decls, "\n");
}
void legalize_nvvm_ir(llvm::Function* query_func) {
  // NVVM IR rejects some of the intrinsics the CPU path uses
  // (stacksave/stackrestore and lifetime markers), so collect and erase them,
  // along with any PHI node that consumes a stacksave result.
  std::vector<llvm::Instruction*> stackrestore_intrinsics;
  std::vector<llvm::Instruction*> stacksave_intrinsics;
  std::vector<llvm::Instruction*> lifetime;
  for (auto& BB : *query_func) {
    for (llvm::Instruction& I : BB) {
      if (llvm::dyn_cast<llvm::PHINode>(&I)) {
        if (check_any_operand_is_stacksave_intrinsic(I)) {
          stacksave_intrinsics.push_back(&I);
          VLOG(2) << "Remove PHI node having llvm::stacksave intrinsic as its operand";
        }
      } else if (const llvm::IntrinsicInst* II =
                     llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
        if (II->getIntrinsicID() == llvm::Intrinsic::stacksave) {
          stacksave_intrinsics.push_back(&I);
        } else if (II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
          stackrestore_intrinsics.push_back(&I);
        } else if (II->getIntrinsicID() == llvm::Intrinsic::lifetime_start ||
                   II->getIntrinsicID() == llvm::Intrinsic::lifetime_end) {
          lifetime.push_back(&I);
        }
      }
    }
  }

  // stackrestore uses the result of stacksave, so it must be erased first.
  for (auto& II : stackrestore_intrinsics) {
    II->eraseFromParent();
  }
  for (auto& II : stacksave_intrinsics) {
    II->eraseFromParent();
  }
  for (auto& II : lifetime) {
    II->eraseFromParent();
  }
}
llvm::StringRef get_gpu_target_triple_string() {
  return llvm::StringRef("nvptx64-nvidia-cuda");
}

llvm::StringRef get_gpu_data_layout() {
  return llvm::StringRef(
      "e-p:64:64:64-i1:8:8-i8:8:8-"
      "i16:16:16-i32:32:32-i64:64:64-"
      "f32:32:32-f64:64:64-v16:16:16-"
      "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
}
std::map<std::string, std::string> get_device_parameters(bool cpu_only) {
  std::map<std::string, std::string> result;

  result.insert(std::make_pair("cpu_name", llvm::sys::getHostCPUName()));
  result.insert(std::make_pair("cpu_triple", llvm::sys::getProcessTriple()));
  result.insert(
      std::make_pair("cpu_cores", std::to_string(llvm::sys::getHostNumPhysicalCores())));
  std::string sizeof_types;
  sizeof_types += "ssize_t:" + std::to_string(sizeof(ssize_t)) + ";";
  sizeof_types += "uchar:" + std::to_string(sizeof(unsigned char)) + ";";
  sizeof_types += "ushort:" + std::to_string(sizeof(unsigned short int)) + ";";
  sizeof_types += "uint:" + std::to_string(sizeof(unsigned int)) + ";";
  sizeof_types += "ulong:" + std::to_string(sizeof(unsigned long int)) + ";";
  sizeof_types += "longlong:" + std::to_string(sizeof(long long int)) + ";";
  sizeof_types += "ulonglong:" + std::to_string(sizeof(unsigned long long int)) + ";";
  sizeof_types += "longdouble:" + std::to_string(sizeof(long double)) + ";";

  result.insert(std::make_pair("type_sizeof", sizeof_types));
  std::string null_values;
  null_values += "boolean1:" + std::to_string(serialized_null_value<bool>()) + ";";
  null_values += "boolean8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
  null_values += "int8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
  null_values += "int16:" + std::to_string(serialized_null_value<int16_t>()) + ";";
  null_values += "int32:" + std::to_string(serialized_null_value<int32_t>()) + ";";
  null_values += "int64:" + std::to_string(serialized_null_value<int64_t>()) + ";";
  null_values += "uint8:" + std::to_string(serialized_null_value<uint8_t>()) + ";";
  null_values += "uint16:" + std::to_string(serialized_null_value<uint16_t>()) + ";";
  null_values += "uint32:" + std::to_string(serialized_null_value<uint32_t>()) + ";";
  null_values += "uint64:" + std::to_string(serialized_null_value<uint64_t>()) + ";";
  null_values += "float32:" + std::to_string(serialized_null_value<float>()) + ";";
  null_values += "float64:" + std::to_string(serialized_null_value<double>()) + ";";
  null_values +=
      "Array<boolean8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
  null_values +=
      "Array<int8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
  null_values +=
      "Array<int16>:" + std::to_string(serialized_null_value<int16_t, true>()) + ";";
  null_values +=
      "Array<int32>:" + std::to_string(serialized_null_value<int32_t, true>()) + ";";
  null_values +=
      "Array<int64>:" + std::to_string(serialized_null_value<int64_t, true>()) + ";";
  null_values +=
      "Array<float32>:" + std::to_string(serialized_null_value<float, true>()) + ";";
  null_values +=
      "Array<float64>:" + std::to_string(serialized_null_value<double, true>()) + ";";

  result.insert(std::make_pair("null_values", null_values));
  llvm::StringMap<bool> cpu_features;
  if (llvm::sys::getHostCPUFeatures(cpu_features)) {
    std::string features_str = "";
    for (auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
      features_str += (it->getValue() ? " +" : " -");
      features_str += it->getKey().str();
    }
    result.insert(std::make_pair("cpu_features", features_str));
  }

  result.insert(std::make_pair("llvm_version",
                               std::to_string(LLVM_VERSION_MAJOR) + "." +
                                   std::to_string(LLVM_VERSION_MINOR) + "." +
                                   std::to_string(LLVM_VERSION_PATCH)));
#ifdef HAVE_CUDA
  if (!cpu_only) {
    int device_count = 0;
    checkCudaErrors(cuDeviceGetCount(&device_count));
    if (device_count) {
      CUdevice device{};
      char device_name[256];
      int major = 0, minor = 0;
      int driver_version = 0;
      checkCudaErrors(cuDeviceGet(&device, 0));  // assumes a homogeneous multi-GPU system
      checkCudaErrors(cuDeviceGetName(device_name, 256, device));
      checkCudaErrors(cuDeviceGetAttribute(
          &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
      checkCudaErrors(cuDeviceGetAttribute(
          &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
      checkCudaErrors(cuDriverGetVersion(&driver_version));

      result.insert(std::make_pair("gpu_name", device_name));
      result.insert(std::make_pair("gpu_count", std::to_string(device_count)));
      result.insert(std::make_pair("gpu_compute_capability",
                                   std::to_string(major) + "." + std::to_string(minor)));
      result.insert(std::make_pair("gpu_driver",
                                   "CUDA " + std::to_string(driver_version / 1000) + "." +
                                       std::to_string((driver_version % 1000) / 10)));
      result.insert(
          std::make_pair("gpu_has_libdevice",
                         std::to_string(!get_cuda_libdevice_dir().empty())));
    }
  }
#endif

  return result;
}
std::unordered_set<llvm::Function*> findAliveRuntimeFuncs(
    llvm::Module& llvm_module,
    const std::vector<llvm::Function*>& roots) {
  std::queue<llvm::Function*> queue;
  std::unordered_set<llvm::Function*> visited;
  for (llvm::Function* F : roots) {
    queue.push(F);
  }

  while (!queue.empty()) {
    llvm::Function* F = queue.front();
    queue.pop();
    if (visited.find(F) != visited.end()) {
      continue;
    }
    visited.insert(F);

    for (llvm::inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
      if (llvm::CallInst* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
        if (CI->isInlineAsm()) {
          continue;
        }
        llvm::Function* called = CI->getCalledFunction();
        if (!called || visited.find(called) != visited.end()) {
          continue;
        }
        queue.push(called);
      }
    }
  }
  return visited;
}
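// This is a plain breadth-first walk of the static call graph: inline asm and
// indirect calls (where getCalledFunction() is null) are skipped, so only
// directly reachable functions are reported as alive.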
void CodeGenerator::linkModuleWithLibdevice(
    Executor* executor,
    llvm::Module& llvm_module,
    llvm::PassManagerBuilder& pass_manager_builder,
    const GPUTarget& gpu_target) {
#ifdef HAVE_CUDA
  if (!executor->has_libdevice_module()) {
    throw std::runtime_error(
        "libdevice library is not available but required by the UDF module");
  }

  // Save the functions the module defines before linking.
  std::vector<llvm::Function*> roots;
  for (llvm::Function& fn : llvm_module) {
    if (!fn.isDeclaration()) {
      roots.emplace_back(&fn);
    }
  }

  // Bind libdevice to the current module.
  CodeGenerator::link_udf_module(executor->get_libdevice_module(),
                                 llvm_module,
                                 gpu_target.cgen_state,
                                 llvm::Linker::Flags::OverrideFromSrc);

  std::unordered_set<llvm::Function*> live_funcs =
      findAliveRuntimeFuncs(llvm_module, roots);

  std::vector<llvm::Function*> funcs_to_delete;
  for (llvm::Function& fn : llvm_module) {
    if (!live_funcs.count(&fn)) {
      funcs_to_delete.emplace_back(&fn);
    }
  }

  for (llvm::Function* f : funcs_to_delete) {
    f->eraseFromParent();
  }

  // Set the nvvm-reflect-ftz flag on the module.
#if LLVM_VERSION_MAJOR >= 11
  llvm::LLVMContext& ctx = llvm_module.getContext();
  llvm_module.setModuleFlag(llvm::Module::Override,
                            "nvvm-reflect-ftz",
                            llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
                                llvm::Type::getInt32Ty(ctx), uint32_t(1))));
#else
  llvm_module.addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", uint32_t(1));
#endif
  for (llvm::Function& fn : llvm_module) {
    fn.addFnAttr("nvptx-f32ftz", "true");
  }

  gpu_target.nvptx_target_machine->adjustPassManager(pass_manager_builder);
  llvm::legacy::FunctionPassManager FPM(&llvm_module);
  pass_manager_builder.populateFunctionPassManager(FPM);

  // Run the function passes (including NVVMReflect) over each function.
  FPM.doInitialization();
  for (auto& F : llvm_module) {
    FPM.run(F);
  }
  FPM.doFinalization();
#endif
}
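// nvvm-reflect-ftz=1 makes the NVVMReflect pass resolve __nvvm_reflect("__CUDA_FTZ")
// queries to 1, i.e. it selects the flush-to-zero variants of the libdevice math
// functions; the per-function "nvptx-f32ftz" attribute keeps the NVPTX backend's
// own code generation consistent with that choice.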
std::shared_ptr<GpuCompilationContext> CodeGenerator::generateNativeGPUCode(
    Executor* executor,
    llvm::Function* func,
    llvm::Function* wrapper_func,
    const std::unordered_set<llvm::Function*>& live_funcs,
    const bool is_gpu_smem_used,
    const CompilationOptions& co,
    const GPUTarget& gpu_target) {
  auto llvm_module = func->getParent();

  CHECK(gpu_target.cgen_state->module_ == llvm_module);
  CHECK(func->getParent() == wrapper_func->getParent());
  llvm_module->setDataLayout(
      "e-p:64:64:64-i1:8:8-i8:8:8-"
      "i16:16:16-i32:32:32-i64:64:64-"
      "f32:32:32-f64:64:64-v16:16:16-"
      "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
  llvm_module->setTargetTriple("nvptx64-nvidia-cuda");
  CHECK(gpu_target.nvptx_target_machine);
  llvm::PassManagerBuilder pass_manager_builder = llvm::PassManagerBuilder();

  pass_manager_builder.OptLevel = 0;
  llvm::legacy::PassManager module_pass_manager;
  pass_manager_builder.populateModulePassManager(module_pass_manager);

  bool requires_libdevice = check_module_requires_libdevice(llvm_module);

  if (requires_libdevice) {
    linkModuleWithLibdevice(executor, *llvm_module, pass_manager_builder, gpu_target);
  }

  optimize_ir(func, llvm_module, module_pass_manager, live_funcs, is_gpu_smem_used, co);
  legalize_nvvm_ir(func);
  std::stringstream ss;
  llvm::raw_os_ostream os(ss);

  llvm::LLVMContext& ctx = llvm_module->getContext();
  // Get the "nvvm.annotations" metadata node.
  llvm::NamedMDNode* md = llvm_module->getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
                               llvm::MDString::get(ctx, "kernel"),
                               llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
                                   llvm::Type::getInt32Ty(ctx), 1))};

  // Append the metadata marking the wrapper function as a CUDA kernel.
  md->addOperand(llvm::MDNode::get(ctx, md_vals));
  std::unordered_set<llvm::Function*> roots{wrapper_func, func};
  if (gpu_target.row_func_not_inlined) {
    clear_function_attributes(gpu_target.cgen_state->row_func_);
    roots.insert(gpu_target.cgen_state->row_func_);
    if (gpu_target.cgen_state->filter_func_) {
      roots.insert(gpu_target.cgen_state->filter_func_);
    }
  }

  // Prevent the helper functions from being removed the way the runtime
  // functions are.
  for (auto f : gpu_target.cgen_state->helper_functions_) {
    roots.insert(f);
  }

  if (requires_libdevice) {
    for (llvm::Function& F : *llvm_module) {
      // Some libdevice functions call __internal* helpers; keep those as
      // roots and legalize them as well.
      if (F.hasName() && F.getName().startswith("__internal") && !F.isDeclaration()) {
        roots.insert(&F);
      }
      legalize_nvvm_ir(&F);
    }
  }
  std::unordered_set<std::string> udf_declarations;

  if (executor->has_udf_module(/*is_gpu=*/true)) {
    for (auto& f : executor->get_udf_module(/*is_gpu=*/true)->getFunctionList()) {
      llvm::Function* udf_function = llvm_module->getFunction(f.getName());
      if (udf_function) {
        legalize_nvvm_ir(udf_function);
        roots.insert(udf_function);

        // If the UDF module only declares this function, remember the name so a
        // matching declaration can be emitted alongside the PTX.
        if (f.isDeclaration()) {
          udf_declarations.insert(f.getName().str());
        }
      }
    }
  }

  if (executor->has_rt_udf_module(/*is_gpu=*/true)) {
    for (auto& f : executor->get_rt_udf_module(/*is_gpu=*/true)->getFunctionList()) {
      llvm::Function* udf_function = llvm_module->getFunction(f.getName());
      if (udf_function) {
        legalize_nvvm_ir(udf_function);
        roots.insert(udf_function);

        if (f.isDeclaration()) {
          udf_declarations.insert(f.getName().str());
        }
      }
    }
  }
  std::vector<llvm::Function*> rt_funcs;
  for (auto& Fn : *llvm_module) {
    if (roots.count(&Fn)) {
      continue;
    }
    rt_funcs.push_back(&Fn);
  }
  for (auto& pFn : rt_funcs) {
    pFn->removeFromParent();
  }

  if (requires_libdevice) {
    add_intrinsics_to_module(llvm_module);
  }

  if (!llvm_module->getModuleFlag("Debug Info Version")) {
    llvm_module->addModuleFlag(
        llvm::Module::Error, "Debug Info Version", llvm::DEBUG_METADATA_VERSION);
  }

  llvm_module->print(os, nullptr);

  for (auto& pFn : rt_funcs) {
    llvm_module->getFunctionList().push_back(pFn);
  }
  llvm_module->eraseNamedMetadata(md);
  auto cuda_llir = ss.str() + cuda_rt_decls + extension_function_decls(udf_declarations);
  std::string ptx;
  try {
    ptx = generatePTX(
        cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state->context_);
  } catch (ParseIRError& e) {
    LOG(WARNING) << "Failed to generate PTX: " << e.what()
                 << ". Switching to CPU execution target.";
    throw QueryMustRunOnCpu();
  }
  LOG(PTX) << "PTX for the GPU:\n" << ptx << "\nEnd of PTX";

  auto cubin_result = ptx_to_cubin(ptx, gpu_target.cuda_mgr);
  auto& option_keys = cubin_result.option_keys;
  auto& option_values = cubin_result.option_values;
  auto cubin = cubin_result.cubin;
  auto link_state = cubin_result.link_state;
  const auto num_options = option_keys.size();
  auto func_name = wrapper_func->getName().str();
  auto gpu_compilation_context = std::make_shared<GpuCompilationContext>();
  for (int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
       ++device_id) {
    gpu_compilation_context->addDeviceCode(
        std::make_unique<GpuDeviceCompilationContext>(cubin,
                                                      cubin_result.cubin_size,
                                                      func_name,
                                                      device_id,
                                                      gpu_target.cuda_mgr,
                                                      num_options,
                                                      &option_keys[0],
                                                      &option_values[0]));
  }

  checkCudaErrors(cuLinkDestroy(link_state));

  try {
    // ... (QueryEngine bookkeeping elided)
  } catch (std::runtime_error const& e) {
    if (strcmp(e.what(), "QueryEngine instance hasn't been created")) {
      LOG(WARNING) << "QueryEngine::getInstance() failed: " << e.what();
    } else {
      LOG(WARNING) << "Failed to get QueryEngine instance";
    }
  }
  return gpu_compilation_context;
}
std::shared_ptr<GpuCompilationContext> Executor::optimizeAndCodegenGPU(
    llvm::Function* query_func,
    llvm::Function* multifrag_query_func,
    std::unordered_set<llvm::Function*>& live_funcs,
    const bool no_inline,
    const CudaMgr_Namespace::CudaMgr* cuda_mgr,
    const bool is_gpu_smem_used,
    const CompilationOptions& co) {
  CHECK(cuda_mgr);

  CodeCacheKey key{serialize_llvm_object(query_func),
                   serialize_llvm_object(cgen_state_->row_func_)};
  if (cgen_state_->filter_func_) {
    key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
  }
  for (const auto helper : cgen_state_->helper_functions_) {
    key.push_back(serialize_llvm_object(helper));
  }

  bool row_func_not_inlined = false;
  if (no_inline) {
    for (auto it = llvm::inst_begin(cgen_state_->row_func_),
              e = llvm::inst_end(cgen_state_->row_func_);
         it != e;
         ++it) {
      if (llvm::isa<llvm::CallInst>(*it)) {
        auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
        auto const func_name = CodegenUtil::getCalledFunctionName(get_gv_call);
        if (func_name &&
            (*func_name == "array_size" || *func_name == "linear_probabilistic_count")) {
          mark_function_never_inline(cgen_state_->row_func_);
          row_func_not_inlined = true;
          break;
        }
      }
    }
  }

  initializeNVPTXBackend();
  CodeGenerator::GPUTarget gpu_target{
      nvptx_target_machine_.get(), cuda_mgr, cgen_state_.get(), row_func_not_inlined};
  std::shared_ptr<GpuCompilationContext> compilation_context;
  try {
    compilation_context = CodeGenerator::generateNativeGPUCode(this,
                                                               query_func,
                                                               multifrag_query_func,
                                                               live_funcs,
                                                               is_gpu_smem_used,
                                                               co,
                                                               gpu_target);
  } catch (CudaMgr_Namespace::CudaErrorException& cuda_error) {
    if (cuda_error.getStatus() == CUDA_ERROR_OUT_OF_MEMORY) {
      // Evict a portion of the cached GPU code to free device memory, then
      // retry the compilation once.
      auto const num_entries_to_evict =
          code_cache_accessor->computeNumEntriesToEvict(g_fraction_code_cache_to_evict);
      auto evicted_kernels_size =
          code_cache_accessor->getSumSizeEvicted(num_entries_to_evict);
      LOG(WARNING) << "Failed to allocate GPU memory for generated code. Evicting "
                   << num_entries_to_evict << " (" << evicted_kernels_size
                   << " bytes) cached GPU code and re-trying.";
      try {
        // ... (QueryEngine bookkeeping elided)
      } catch (std::runtime_error const& e) {
        if (strcmp(e.what(), "QueryEngine instance hasn't been created")) {
          LOG(WARNING) << "QueryEngine::getInstance() failed: " << e.what();
        } else {
          LOG(WARNING) << "Failed to get QueryEngine instance";
        }
      }
      code_cache_accessor->evictEntries(num_entries_to_evict);
      compilation_context = CodeGenerator::generateNativeGPUCode(this,
                                                                 query_func,
                                                                 multifrag_query_func,
                                                                 live_funcs,
                                                                 is_gpu_smem_used,
                                                                 co,
                                                                 gpu_target);
    } else {
      throw;
    }
  }
  return compilation_context;
}
std::string CodeGenerator::generatePTX(const std::string& cuda_llir,
                                       llvm::TargetMachine* nvptx_target_machine,
                                       llvm::LLVMContext& context) {
  auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir, "", false);

  llvm::SMDiagnostic parse_error;

  auto llvm_module = llvm::parseIR(mem_buff->getMemBufferRef(), parse_error, context);
  if (!llvm_module) {
    LOG(IR) << "CodeGenerator::generatePTX:NVVM IR:\n"
            << cuda_llir << "\nEnd of NVVM IR";
    throw_parseIR_error(parse_error, "cuda_llir", /*is_gpu=*/true);
  }

  llvm::SmallString<256> code_str;
  llvm::raw_svector_ostream formatted_os(code_str);
  CHECK(nvptx_target_machine);

  llvm::legacy::PassManager ptxgen_pm;
  llvm_module->setDataLayout(nvptx_target_machine->createDataLayout());

#if LLVM_VERSION_MAJOR >= 10
  nvptx_target_machine->addPassesToEmitFile(
      ptxgen_pm, formatted_os, nullptr, llvm::CGFT_AssemblyFile);
#else
  nvptx_target_machine->addPassesToEmitFile(
      ptxgen_pm, formatted_os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
#endif
  ptxgen_pm.run(*llvm_module);

#if LLVM_VERSION_MAJOR >= 11
  return std::string(code_str);
#else
  return code_str.str();
#endif
}
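// The NVPTX backend emits PTX as its "assembly" output, so requesting
// CGFT_AssemblyFile here yields PTX text that the CUDA driver's JIT can then
// assemble into a cubin for the actual device.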
std::unique_ptr<llvm::TargetMachine> CodeGenerator::initializeNVPTXBackend(
    const CudaMgr_Namespace::NvidiaDeviceArch arch) {
  llvm::InitializeAllTargets();
  llvm::InitializeAllTargetMCs();
  llvm::InitializeAllAsmPrinters();

  std::string err;
  auto target = llvm::TargetRegistry::lookupTarget("nvptx64", err);
  if (nullptr == target) {
    LOG(FATAL) << err;
  }
  return std::unique_ptr<llvm::TargetMachine>(
      target->createTargetMachine("nvptx64-nvidia-cuda",
                                  CudaMgr_Namespace::CudaMgr::deviceArchToSM(arch),
                                  "",
                                  llvm::TargetOptions(),
                                  llvm::Reloc::Static));
}
std::string Executor::generatePTX(const std::string& cuda_llir) const {
  return CodeGenerator::generatePTX(
      cuda_llir, nvptx_target_machine_.get(), cgen_state_->context_);
}

void Executor::initializeNVPTXBackend() const {
  if (nvptx_target_machine_) {
    return;
  }
  const auto arch = cudaMgr()->getDeviceArch();
  nvptx_target_machine_ = CodeGenerator::initializeNVPTXBackend(arch);
}
1340 {
"query_stub_hoisted_literals",
1341 "multifrag_query_hoisted_literals",
1344 "fixed_width_int_decode",
1345 "fixed_width_unsigned_decode",
1346 "diff_fixed_width_int_decode",
1347 "fixed_width_double_decode",
1348 "fixed_width_float_decode",
1349 "fixed_width_small_date_decode",
1350 "record_error_code",
1354 "group_buff_idx_impl",
1356 "init_shared_mem_nop",
1359 auto const candidate_func_name = func->getName().str();
1362 [candidate_func_name](std::string_view func_name) {
1363 return candidate_func_name == func_name;
std::unique_ptr<llvm::Module> read_llvm_module_from_bc_file(
    const std::string& bc_filename,
    llvm::LLVMContext& context) {
  llvm::SMDiagnostic err;

  auto buffer_or_error = llvm::MemoryBuffer::getFile(bc_filename);
  CHECK(!buffer_or_error.getError()) << "bc_filename=" << bc_filename;
  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();

  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
  CHECK(!owner.takeError());
  CHECK(owner->get());
  return std::move(owner.get());
}
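// Note on error handling: parseBitcodeFile returns llvm::Expected, so the
// error state must be consumed (here via CHECK on takeError()) before the
// module is used; a malformed bitcode file aborts rather than returning null.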
std::unique_ptr<llvm::Module> read_llvm_module_from_ir_file(
    const std::string& udf_ir_filename,
    llvm::LLVMContext& ctx,
    bool is_gpu = false) {
  llvm::SMDiagnostic parse_error;

  llvm::StringRef file_name_arg(udf_ir_filename);

  auto owner = llvm::parseIRFile(file_name_arg, parse_error, ctx);
  if (!owner) {
    throw_parseIR_error(parse_error, udf_ir_filename, is_gpu);
  }

  if (is_gpu) {
    llvm::Triple gpu_triple(owner->getTargetTriple());
    if (!gpu_triple.isNVPTX()) {
      LOG(WARNING)
          << "Expected triple nvptx64-nvidia-cuda for NVVM IR of loadtime UDFs but got "
          << gpu_triple.str() << ". Disabling the NVVM IR module.";
      return std::unique_ptr<llvm::Module>();
    }
  }
  return owner;
}
std::unique_ptr<llvm::Module> read_llvm_module_from_ir_string(
    const std::string& udf_ir_string,
    llvm::LLVMContext& ctx,
    bool is_gpu = false) {
  llvm::SMDiagnostic parse_error;

  auto buf = std::make_unique<llvm::MemoryBufferRef>(udf_ir_string,
                                                     "Runtime UDF/UDTF LLVM/NVVM IR");

  auto owner = llvm::parseIR(*buf, parse_error, ctx);
  if (!owner) {
    LOG(IR) << "read_llvm_module_from_ir_string:\n"
            << udf_ir_string << "\nEnd of LLVM/NVVM IR";
    throw_parseIR_error(parse_error, "", is_gpu);
  }

  if (is_gpu) {
    llvm::Triple gpu_triple(owner->getTargetTriple());
    if (!gpu_triple.isNVPTX()) {
      LOG(IR) << "read_llvm_module_from_ir_string:\n"
              << udf_ir_string << "\nEnd of NVVM IR";
      LOG(WARNING) << "Expected triple nvptx64-nvidia-cuda for NVVM IR but got "
                   << gpu_triple.str()
                   << ". Executing runtime UDF/UDTFs on GPU will be disabled.";
      return std::unique_ptr<llvm::Module>();
    }
  }
  return owner;
}
void bind_pos_placeholders(const std::string& pos_fn_name,
                           const bool use_resume_param,
                           llvm::Function* query_func,
                           llvm::Module* llvm_module) {
  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
       ++it) {
    if (!llvm::isa<llvm::CallInst>(*it)) {
      continue;
    }
    auto& pos_call = llvm::cast<llvm::CallInst>(*it);
    auto const func_name = CodegenUtil::getCalledFunctionName(pos_call);
    if (func_name && *func_name == pos_fn_name) {
      if (use_resume_param) {
        auto* const row_index_resume = get_arg_by_name(query_func, "row_index_resume");
        llvm::ReplaceInstWithInst(
            &pos_call,
            llvm::CallInst::Create(llvm_module->getFunction(pos_fn_name + "_impl"),
                                   row_index_resume));
      } else {
        llvm::ReplaceInstWithInst(
            &pos_call,
            llvm::CallInst::Create(llvm_module->getFunction(pos_fn_name + "_impl")));
      }
      break;
    }
  }
}
void set_row_func_argnames(llvm::Function* row_func,
                           const size_t in_col_count,
                           const size_t agg_col_count,
                           const bool hoist_literals) {
  auto arg_it = row_func->arg_begin();

  if (agg_col_count) {
    for (size_t i = 0; i < agg_col_count; ++i) {
      arg_it->setName("out");
      ++arg_it;
    }
  } else {
    arg_it->setName("group_by_buff");
    ++arg_it;
    arg_it->setName("varlen_output_buff");
    ++arg_it;
    arg_it->setName("crt_matched");
    ++arg_it;
    arg_it->setName("total_matched");
    ++arg_it;
    arg_it->setName("old_total_matched");
    ++arg_it;
    arg_it->setName("max_matched");
    ++arg_it;
  }

  arg_it->setName("agg_init_val");
  ++arg_it;

  arg_it->setName("pos");
  ++arg_it;

  arg_it->setName("frag_row_off");
  ++arg_it;

  arg_it->setName("num_rows_per_scan");
  ++arg_it;

  if (hoist_literals) {
    arg_it->setName("literals");
    ++arg_it;
  }

  for (size_t i = 0; i < in_col_count; ++i) {
    arg_it->setName("col_buf" + std::to_string(i));
    ++arg_it;
  }

  arg_it->setName("join_hash_tables");
  ++arg_it;
  arg_it->setName("row_func_mgr");
}
llvm::Function* create_row_function(const size_t in_col_count,
                                    const size_t agg_col_count,
                                    const bool hoist_literals,
                                    llvm::Module* llvm_module,
                                    llvm::LLVMContext& context) {
  std::vector<llvm::Type*> row_process_arg_types;

  if (agg_col_count) {
    // Output (aggregate) arguments.
    for (size_t i = 0; i < agg_col_count; ++i) {
      row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
    }
  } else {
    // Group-by buffer.
    row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
    // Varlen output buffer.
    row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
    // Current match count.
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
    // Total match count.
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
    // Old total match count.
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
    // Maximum number of matched entries.
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
  }
  // Aggregate init values.
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
  // Position argument.
  row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));
  // Fragment row offset.
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
  // Number of rows for each scan.
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));

  // Literals buffer argument.
  if (hoist_literals) {
    row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
  }
  // Column buffer arguments.
  for (size_t i = 0; i < in_col_count; ++i) {
    row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
  }
  // Join hash tables argument.
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
  // Row function manager argument.
  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));

  // Generate the function.
  auto ft =
      llvm::FunctionType::get(get_int_type(32, context), row_process_arg_types, false);

  auto row_func = llvm::Function::Create(
      ft, llvm::Function::ExternalLinkage, "row_func", llvm_module);

  // Set the names of the arguments.
  set_row_func_argnames(row_func, in_col_count, agg_col_count, hoist_literals);

  return row_func;
}
void bind_query(llvm::Function* query_func,
                const std::string& query_fname,
                llvm::Function* multifrag_query_func,
                llvm::Module* llvm_module) {
  std::vector<llvm::CallInst*> query_stubs;
  for (auto it = llvm::inst_begin(multifrag_query_func),
            e = llvm::inst_end(multifrag_query_func);
       it != e;
       ++it) {
    if (!llvm::isa<llvm::CallInst>(*it)) {
      continue;
    }
    auto& query_call = llvm::cast<llvm::CallInst>(*it);
    auto const call_func_name = CodegenUtil::getCalledFunctionName(query_call);
    if (call_func_name && *call_func_name == query_fname) {
      query_stubs.push_back(&query_call);
    }
  }
  for (auto& S : query_stubs) {
    std::vector<llvm::Value*> args;
    for (size_t i = 0; i < S->getNumOperands() - 1; ++i) {
      args.push_back(S->getArgOperand(i));
    }
    llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args, ""));
  }
}
std::vector<std::string> get_agg_fnames(const std::vector<Analyzer::Expr*>& target_exprs,
                                        const bool is_group_by) {
  std::vector<std::string> result;
  for (size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
       ++target_idx, ++agg_col_idx) {
    const auto target_expr = target_exprs[target_idx];
    CHECK(target_expr);
    const auto target_type_info = target_expr->get_type_info();
    const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
    const bool is_varlen =
        (target_type_info.is_string() &&
         target_type_info.get_compression() == kENCODING_NONE) ||
        target_type_info.is_array();
    if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
      result.emplace_back(target_type_info.is_fp() ? "agg_id_double" : "agg_id");
      if (is_varlen) {
        result.emplace_back("agg_id");
      }
      if (target_type_info.is_geometry()) {
        result.emplace_back("agg_id");
        for (auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
          result.emplace_back("agg_id");
        }
      }
      continue;
    }
    const auto agg_type = agg_expr->get_aggtype();
    SQLTypeInfo agg_type_info;
    if (agg_type == kCOUNT || agg_type == kCOUNT_IF) {
      agg_type_info = target_type_info;
    } else {
      agg_type_info = agg_expr->get_arg()->get_type_info();
    }
    switch (agg_type) {
      case kAVG: {
        if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
            !agg_type_info.is_fp()) {
          throw std::runtime_error("AVG is only valid on integer and floating point");
        }
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_sum"
                                : "agg_sum_double");
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_count"
                                : "agg_count_double");
        break;
      }
      case kMIN: {
        if (agg_type_info.is_string() || agg_type_info.is_array() ||
            agg_type_info.is_geometry()) {
          throw std::runtime_error(
              "MIN on strings, arrays or geospatial types not supported yet");
        }
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_min"
                                : "agg_min_double");
        break;
      }
      case kMAX: {
        if (agg_type_info.is_string() || agg_type_info.is_array() ||
            agg_type_info.is_geometry()) {
          throw std::runtime_error(
              "MAX on strings, arrays or geospatial types not supported yet");
        }
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_max"
                                : "agg_max_double");
        break;
      }
      case kSUM:
      case kSUM_IF: {
        if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
            !agg_type_info.is_fp()) {
          throw std::runtime_error(
              "SUM and SUM_IF are only valid on integer and floating point");
        }
        std::string func_name = (agg_type_info.is_integer() || agg_type_info.is_time())
                                    ? "agg_sum"
                                    : "agg_sum_double";
        if (agg_type == kSUM_IF) {
          func_name += "_if";
        }
        result.emplace_back(func_name);
        break;
      }
      case kCOUNT:
        result.emplace_back(agg_expr->get_is_distinct() ? "agg_count_distinct"
                                                        : "agg_count");
        break;
      case kCOUNT_IF:
        result.emplace_back("agg_count_if");
        break;
      case kSINGLE_VALUE:
        result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
        break;
      case kSAMPLE:
        result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
        break;
      case kAPPROX_COUNT_DISTINCT:
        result.emplace_back("agg_approximate_count_distinct");
        break;
      case kAPPROX_QUANTILE:
        result.emplace_back("agg_approx_quantile");
        break;
      case kMODE:
        result.emplace_back("agg_mode_func");
        break;
      default:
        UNREACHABLE() << "Unsupported agg_type: " << agg_type;
    }
  }
  return result;
}
                                        const bool is_cuda_ir) {
std::unordered_set<llvm::Function*> CodeGenerator::markDeadRuntimeFuncs(
    llvm::Module& llvm_module,
    const std::vector<llvm::Function*>& roots,
    const std::vector<llvm::Function*>& leaves) {
  std::unordered_set<llvm::Function*> live_funcs;
  live_funcs.insert(roots.begin(), roots.end());
  live_funcs.insert(leaves.begin(), leaves.end());

  if (auto F = llvm_module.getFunction("init_shared_mem_nop")) {
    live_funcs.insert(F);
  }
  if (auto F = llvm_module.getFunction("write_back_nop")) {
    live_funcs.insert(F);
  }

  for (const llvm::Function* F : roots) {
    for (const llvm::BasicBlock& BB : *F) {
      for (const llvm::Instruction& I : BB) {
        if (const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
          live_funcs.insert(CI->getCalledFunction());
        }
      }
    }
  }

  for (llvm::Function& F : llvm_module) {
    if (!live_funcs.count(&F) && !F.isDeclaration()) {
      F.setLinkage(llvm::GlobalValue::InternalLinkage);
    }
  }

  return live_funcs;
}
template <typename InstType>
llvm::Value* find_variable_in_basic_block(llvm::Function* func,
                                          std::string bb_name,
                                          std::string variable_name) {
  llvm::Value* result = nullptr;
  if (func == nullptr || variable_name.empty()) {
    return result;
  }
  bool is_found = false;
  for (auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
    if (!bb_name.empty() && bb_it->getName() != bb_name) {
      continue;
    }
    for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
      if (llvm::isa<InstType>(*inst_it)) {
        if (inst_it->getName() == variable_name) {
          result = &*inst_it;
          is_found = true;
          break;
        }
      }
    }
  }
  return result;
}
void Executor::createErrorCheckControlFlow(
    llvm::Function* query_func,
    bool run_with_dynamic_watchdog,
    bool run_with_allowing_runtime_interrupt,
    const std::vector<JoinLoop>& join_loops,
    ExecutorDeviceType device_type,
    const std::vector<InputTableInfo>& input_table_infos) {
  // Check whether row processing was successful; currently it can fail by
  // throwing a runtime error (division by zero, etc.).
  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
    // When both the dynamic watchdog and the runtime interrupt are enabled,
    // the dynamic watchdog takes precedence.
    run_with_allowing_runtime_interrupt = false;
  }

  {
    // Disable the interrupt checker if there is no valid query session.
    heavyai::shared_lock<heavyai::shared_mutex> session_read_lock(
        executor_session_mutex_);
    if (current_query_session_.empty()) {
      run_with_allowing_runtime_interrupt = false;
    }
  }

  llvm::Value* row_count = nullptr;
  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
      device_type == ExecutorDeviceType::GPU) {
    row_count =
        find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
  }
  bool done_splitting = false;
  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
       ++bb_it) {
    llvm::Value* pos = nullptr;
    for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
      if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
          llvm::isa<llvm::PHINode>(*inst_it)) {
        if (inst_it->getName() == "pos") {
          pos = &*inst_it;
        }
        continue;
      }
      if (!llvm::isa<llvm::CallInst>(*inst_it)) {
        continue;
      }
      auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
      auto const row_func_name = CodegenUtil::getCalledFunctionName(row_func_call);
      if (row_func_name && *row_func_name == "row_process") {
        auto next_inst_it = inst_it;
        ++next_inst_it;
        auto new_bb = bb_it->splitBasicBlock(next_inst_it);
        auto& br_instr = bb_it->back();
        llvm::IRBuilder<> ir_builder(&br_instr);
        llvm::Value* err_lv = &*inst_it;
        llvm::Value* err_lv_returned_from_row_func = nullptr;
        if (run_with_dynamic_watchdog) {
          CHECK(pos);
          llvm::Value* call_watchdog_lv = nullptr;
          if (device_type == ExecutorDeviceType::GPU) {
            // To make sure an entire thread block sees the same barrier, only
            // threads with pos below the critical-edge threshold take the
            // watchdog path.
            CHECK(row_count);
            auto crit_edge_rem =
                (blockSize() & (blockSize() - 1))
                    ? ir_builder.CreateSRem(
                          row_count,
                          cgen_state_->llInt(static_cast<int64_t>(blockSize())))
                    : ir_builder.CreateAnd(
                          row_count,
                          cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
            auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
            crit_edge_threshold->setName("crit_edge_threshold");

            call_watchdog_lv = ir_builder.CreateICmp(
                llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
          } else {
            // On the CPU, run the watchdog for every 64th row.
            auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
            call_watchdog_lv = ir_builder.CreateICmp(
                llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
          }
          CHECK(call_watchdog_lv);
          auto error_check_bb = bb_it->splitBasicBlock(
              llvm::BasicBlock::iterator(br_instr), ".error_check");
          auto& watchdog_br_instr = bb_it->back();

          auto watchdog_check_bb = llvm::BasicBlock::Create(
              cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
          llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
          auto detected_timeout = watchdog_ir_builder.CreateCall(
              cgen_state_->module_->getFunction("dynamic_watchdog"), {});
          auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
              detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
          watchdog_ir_builder.CreateBr(error_check_bb);

          llvm::ReplaceInstWithInst(
              &watchdog_br_instr,
              llvm::BranchInst::Create(
                  watchdog_check_bb, error_check_bb, call_watchdog_lv));
          ir_builder.SetInsertPoint(&br_instr);
          auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);

          unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
          unified_err_lv->addIncoming(err_lv, &*bb_it);
          err_lv = unified_err_lv;
        } else if (run_with_allowing_runtime_interrupt) {
          CHECK(pos);
          llvm::Value* call_check_interrupt_lv{nullptr};
          llvm::Value* interrupt_err_lv{nullptr};
          llvm::BasicBlock* error_check_bb{nullptr};
          llvm::BasicBlock* interrupt_check_bb{nullptr};
          llvm::Instruction* check_interrupt_br_instr{nullptr};

          auto has_loop_join = std::any_of(
              join_loops.begin(), join_loops.end(), [](const JoinLoop& join_loop) {
                return join_loop.isNestedLoopJoin();
              });
          auto codegen_interrupt_checker = [&]() {
            error_check_bb = bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr),
                                                    ".error_check");
            check_interrupt_br_instr = &bb_it->back();

            interrupt_check_bb = llvm::BasicBlock::Create(
                cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
            llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
            auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
                cgen_state_->module_->getFunction("check_interrupt"), {});
            interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
                detected_interrupt,
                cgen_state_->llInt(Executor::ERR_INTERRUPTED),
                err_lv);
            interrupt_checker_ir_builder.CreateBr(error_check_bb);
          };
          if (has_loop_join) {
            codegen_interrupt_checker();
            CHECK(interrupt_check_bb);
            CHECK(check_interrupt_br_instr);
            llvm::ReplaceInstWithInst(check_interrupt_br_instr,
                                      llvm::BranchInst::Create(interrupt_check_bb));
            ir_builder.SetInsertPoint(&br_instr);
            err_lv = interrupt_err_lv;
        } else {
          if (device_type == ExecutorDeviceType::GPU) {
            // %pos advances by gridSize() * blockSize() per iteration, so approximate
            // the iteration count by shifting %pos right by the log2 of that stride.
            // (The power-of-two rounding of grid and block sizes is elided here.)
            int64_t num_shift_by_gridDim = getExpOfTwo(gridSize());
            int64_t num_shift_by_blockDim = getExpOfTwo(blockSize());
            int64_t total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
            uint64_t interrupt_checking_freq = 32;
            auto freq_control_knob = g_running_query_interrupt_freq;
            if (!input_table_infos.empty()) {
              const auto& outer_table_info = *input_table_infos.begin();
              auto num_outer_table_tuples =
                  outer_table_info.info.getFragmentNumTuplesUpperBound();
              if (num_outer_table_tuples > 0) {
                // The larger max_inc is, the less frequently the flag is checked.
                auto max_inc = uint64_t(
                    floor(num_outer_table_tuples / (gridSize() * blockSize() * 2)));
                auto calibrated_inc =
                    uint64_t(floor(max_inc * (1 - freq_control_knob)));
                interrupt_checking_freq =
                    uint64_t(pow(2, getExpOfTwo(calibrated_inc)));
                // Clamp the frequency to a sane range.
                if (interrupt_checking_freq > max_inc) {
                  interrupt_checking_freq = max_inc / 2;
                }
                if (interrupt_checking_freq < 8) {
                  // Checking more often than every 8th iteration is too costly.
                  interrupt_checking_freq = 8;
                }
              }
            }
            VLOG(1) << "Set the running query interrupt checking frequency: "
                    << interrupt_checking_freq;
            // Check the interrupt flag on every interrupt_checking_freq-th iteration.
            llvm::Value* pos_shifted_per_iteration =
                ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
            auto interrupt_predicate = ir_builder.CreateAnd(pos_shifted_per_iteration,
                                                            interrupt_checking_freq);
            call_check_interrupt_lv =
                ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
                                      interrupt_predicate,
                                      cgen_state_->llInt(int64_t(0LL)));
          } else {
            // CPU: check the interrupt flag on every 64th row.
            auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
            call_check_interrupt_lv =
                ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
                                      interrupt_predicate,
                                      cgen_state_->llInt(int64_t(0LL)));
          }
          codegen_interrupt_checker();
          CHECK(call_check_interrupt_lv);
          CHECK(interrupt_err_lv);
          CHECK(interrupt_check_bb);
          CHECK(error_check_bb);
          CHECK(check_interrupt_br_instr);
          llvm::ReplaceInstWithInst(
              check_interrupt_br_instr,
              llvm::BranchInst::Create(
                  interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
          ir_builder.SetInsertPoint(&br_instr);
          auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);

          unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
          unified_err_lv->addIncoming(err_lv, &*bb_it);
          err_lv = unified_err_lv;
        }
      }
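      // At this point err_lv holds the error code for the current row batch:
      // the row function's return value, possibly overridden by the watchdog
      // or interrupt checks above.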
      if (!err_lv_returned_from_row_func) {
        err_lv_returned_from_row_func = err_lv;
      }
      if (run_with_dynamic_watchdog && device_type == ExecutorDeviceType::GPU) {
        // On GPU with the dynamic watchdog, only a watchdog timeout aborts the
        // kernel; all threads within a block return together.
        err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
                                       err_lv,
                                       cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
      } else {
        err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
                                       err_lv,
                                       cgen_state_->llInt(static_cast<int32_t>(0)));
      }
      auto error_bb = llvm::BasicBlock::Create(
          cgen_state_->context_, ".error_exit", query_func, new_bb);
      const auto error_code_arg = get_arg_by_name(query_func, "error_code");
      llvm::CallInst::Create(
          cgen_state_->module_->getFunction("record_error_code"),
          std::vector<llvm::Value*>{err_lv_returned_from_row_func, error_code_arg},
          "",
          error_bb);
      llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
      llvm::ReplaceInstWithInst(&br_instr,
                                llvm::BranchInst::Create(error_bb, new_bb, err_lv));
      done_splitting = true;
      break;
    }
  }
  CHECK(done_splitting);
}
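// AutoTrackBuffersInRuntimeIR: if the runtime module sets the
// 'manage_memory_buffer' module flag, every call to allocate_varlen_buffer in
// the generated IR gets a follow-up call to register_buffer_with_executor_rsm,
// so the allocation is tracked by this Executor's row set memory manager.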
void Executor::AutoTrackBuffersInRuntimeIR() {
  llvm::Module* M = cgen_state_->module_;
  if (M->getFunction("allocate_varlen_buffer") == nullptr) {
    return;
  }

  // Read the module metadata to decide whether buffer tracking was requested.
  bool should_track = false;
  auto* flag = M->getModuleFlag("manage_memory_buffer");
  if (auto* cnt = llvm::mdconst::extract_or_null<llvm::ConstantInt>(flag)) {
    if (cnt->getZExtValue() == 1) {
      should_track = true;
    }
  }
  if (!should_track) {
    // The metadata is not present; nothing to do.
    return;
  }

  LOG(INFO) << "Found 'manage_memory_buffer' metadata.";
  llvm::SmallVector<llvm::CallInst*, 4> calls_to_analyze;

  for (llvm::Function& F : *M) {
    for (llvm::BasicBlock& BB : F) {
      for (llvm::Instruction& I : BB) {
        if (llvm::CallInst* CI = llvm::dyn_cast<llvm::CallInst>(&I)) {
          // Collect every call to allocate_varlen_buffer for analysis.
          auto const called_func_name = getCalledFunctionName(*CI);
          if (called_func_name && *called_func_name == "allocate_varlen_buffer") {
            calls_to_analyze.push_back(CI);
          }
        }
      }
    }
  }

  // For each allocation, check whether its result is already passed to
  // register_buffer_with_executor_rsm; if not, insert the registration call.
  llvm::IRBuilder<> Builder(cgen_state_->context_);
  auto i64 = get_int_type(64, cgen_state_->context_);
  auto i8p = get_int_ptr_type(8, cgen_state_->context_);
  auto void_ = llvm::Type::getVoidTy(cgen_state_->context_);
  llvm::FunctionType* fnty = llvm::FunctionType::get(void_, {i64, i8p}, false);
  llvm::FunctionCallee register_buffer_fn =
      M->getOrInsertFunction("register_buffer_with_executor_rsm", fnty, {});

  int64_t executor_addr = reinterpret_cast<int64_t>(this);
  for (llvm::CallInst* CI : calls_to_analyze) {
    bool is_registered = false;
    for (llvm::User* U : CI->users()) {
      if (llvm::CallInst* call = llvm::dyn_cast<llvm::CallInst>(U)) {
        auto const func_name = getCalledFunctionName(*call);
        if (func_name && *func_name == "register_buffer_with_executor_rsm") {
          is_registered = true;
          break;
        }
      }
    }
    if (!is_registered) {
      Builder.SetInsertPoint(CI->getNextNode());
      Builder.CreateCall(register_buffer_fn,
                         {ll_int(executor_addr, cgen_state_->context_), CI});
    }
  }
}
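// inlineHoistedLiterals: literal loads are generated once in the query
// function. This pass recreates row_func_ (and filter_func_, if present) with
// extra trailing arguments carrying those literal values, and replaces the
// __placeholder__literal_* instructions with the matching arguments.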
std::vector<llvm::Value*> Executor::inlineHoistedLiterals() {
  std::vector<llvm::Value*> hoisted_literals;

  // row_func_ uses literals whose loads have been hoisted up to the query
  // function; extend row_func_'s signature with extra arguments to pass those
  // literal values down.
  std::vector<llvm::Type*> row_process_arg_types;

  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
                                    E = cgen_state_->row_func_->arg_end();
       I != E;
       ++I) {
    row_process_arg_types.push_back(I->getType());
  }

  for (auto& element : cgen_state_->query_func_literal_loads_) {
    for (auto value : element.second) {
      row_process_arg_types.push_back(value->getType());
    }
  }

  auto ft = llvm::FunctionType::get(
      get_int_type(32, cgen_state_->context_), row_process_arg_types, false);
  auto row_func_with_hoisted_literals =
      llvm::Function::Create(ft,
                             llvm::Function::ExternalLinkage,
                             "row_func_hoisted_literals",
                             cgen_state_->row_func_->getParent());

  auto row_func_arg_it = row_func_with_hoisted_literals->arg_begin();
  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
                                    E = cgen_state_->row_func_->arg_end();
       I != E;
       ++I) {
    if (I->hasName()) {
      row_func_arg_it->setName(I->getName());
    }
    ++row_func_arg_it;
  }

  decltype(row_func_with_hoisted_literals) filter_func_with_hoisted_literals{nullptr};
  decltype(row_func_arg_it) filter_func_arg_it{nullptr};
  if (cgen_state_->filter_func_) {
    // filter_func_ uses the same hoisted literals; extend its signature the
    // same way.
    std::vector<llvm::Type*> filter_func_arg_types;

    for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
                                      E = cgen_state_->filter_func_->arg_end();
         I != E;
         ++I) {
      filter_func_arg_types.push_back(I->getType());
    }

    for (auto& element : cgen_state_->query_func_literal_loads_) {
      for (auto value : element.second) {
        filter_func_arg_types.push_back(value->getType());
      }
    }

    auto ft2 = llvm::FunctionType::get(
        get_int_type(32, cgen_state_->context_), filter_func_arg_types, false);
    filter_func_with_hoisted_literals =
        llvm::Function::Create(ft2,
                               llvm::Function::ExternalLinkage,
                               "filter_func_hoisted_literals",
                               cgen_state_->filter_func_->getParent());

    filter_func_arg_it = filter_func_with_hoisted_literals->arg_begin();
    for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
                                      E = cgen_state_->filter_func_->arg_end();
         I != E;
         ++I) {
      if (I->hasName()) {
        filter_func_arg_it->setName(I->getName());
      }
      ++filter_func_arg_it;
    }
  }

  std::unordered_map<int, std::vector<llvm::Value*>>
      query_func_literal_loads_function_arguments,
      query_func_literal_loads_function_arguments2;

  for (auto& element : cgen_state_->query_func_literal_loads_) {
    std::vector<llvm::Value*> argument_values, argument_values2;

    for (auto value : element.second) {
      hoisted_literals.push_back(value);
      argument_values.push_back(&*row_func_arg_it);
      if (cgen_state_->filter_func_) {
        argument_values2.push_back(&*filter_func_arg_it);
        cgen_state_->filter_func_args_[&*row_func_arg_it] = &*filter_func_arg_it;
      }
      if (value->hasName()) {
        row_func_arg_it->setName("arg_" + value->getName());
        if (cgen_state_->filter_func_) {
          filter_func_arg_it->getContext();
          filter_func_arg_it->setName("arg_" + value->getName());
        }
      }
      ++row_func_arg_it;
      ++filter_func_arg_it;
    }

    query_func_literal_loads_function_arguments[element.first] = argument_values;
    query_func_literal_loads_function_arguments2[element.first] = argument_values2;
  }

  // Move the row_func body over without cloning it.
  row_func_with_hoisted_literals->getBasicBlockList().splice(
      row_func_with_hoisted_literals->begin(),
      cgen_state_->row_func_->getBasicBlockList());

  // Replace the old row_func arguments with the new function's arguments.
  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
                                    E = cgen_state_->row_func_->arg_end(),
                                    I2 = row_func_with_hoisted_literals->arg_begin();
       I != E;
       ++I) {
    I->replaceAllUsesWith(&*I2);
    I2->takeName(&*I);
    cgen_state_->filter_func_args_.replace(&*I, &*I2);
    ++I2;
  }

  cgen_state_->row_func_ = row_func_with_hoisted_literals;

  // Finally, replace the literal placeholders.
  std::vector<llvm::Instruction*> placeholders;
  std::string prefix("__placeholder__literal_");
  for (auto it = llvm::inst_begin(row_func_with_hoisted_literals),
            e = llvm::inst_end(row_func_with_hoisted_literals);
       it != e;
       ++it) {
    if (it->hasName() && it->getName().startswith(prefix)) {
      auto offset_and_index_entry =
          cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
      CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());

      int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
      int lit_idx = offset_and_index_entry->second.index_of_literal_load;

      it->replaceAllUsesWith(
          query_func_literal_loads_function_arguments[lit_off][lit_idx]);
      placeholders.push_back(&*it);
    }
  }
  for (auto placeholder : placeholders) {
    placeholder->removeFromParent();
  }

  if (cgen_state_->filter_func_) {
    // Move the filter_func body over the same way.
    filter_func_with_hoisted_literals->getBasicBlockList().splice(
        filter_func_with_hoisted_literals->begin(),
        cgen_state_->filter_func_->getBasicBlockList());

    // Replace the old filter_func arguments with the new function's arguments.
    for (llvm::Function::arg_iterator
             I = cgen_state_->filter_func_->arg_begin(),
             E = cgen_state_->filter_func_->arg_end(),
             I2 = filter_func_with_hoisted_literals->arg_begin();
         I != E;
         ++I) {
      I->replaceAllUsesWith(&*I2);
      I2->takeName(&*I);
      ++I2;
    }

    cgen_state_->filter_func_ = filter_func_with_hoisted_literals;

    // And replace the literal placeholders here as well.
    std::vector<llvm::Instruction*> placeholders;
    std::string prefix("__placeholder__literal_");
    for (auto it = llvm::inst_begin(filter_func_with_hoisted_literals),
              e = llvm::inst_end(filter_func_with_hoisted_literals);
         it != e;
         ++it) {
      if (it->hasName() && it->getName().startswith(prefix)) {
        auto offset_and_index_entry = cgen_state_->row_func_hoisted_literals_.find(
            llvm::dyn_cast<llvm::Value>(&*it));
        CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());

        int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
        int lit_idx = offset_and_index_entry->second.index_of_literal_load;

        it->replaceAllUsesWith(
            query_func_literal_loads_function_arguments2[lit_off][lit_idx]);
        placeholders.push_back(&*it);
      }
    }
    for (auto placeholder : placeholders) {
      placeholder->removeFromParent();
    }
  }

  return hoisted_literals;
}
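// Helpers for the GPU shared memory heuristics below: the shared memory
// footprint of the output buffer, detection of COUNT-style aggregates, and a
// visitor that flags CASE expressions in the group-by list.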
size_t get_shared_memory_size(const bool shared_mem_used,
                              const QueryMemoryDescriptor* query_mem_desc_ptr) {
  return shared_mem_used
             ? (query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount())
             : 0;
}

bool has_count_expr(RelAlgExecutionUnit const& ra_exe_unit) {
  for (auto const expr : ra_exe_unit.target_exprs) {
    if (auto const agg_expr = dynamic_cast<Analyzer::AggExpr*>(expr)) {
      if (shared::is_any<SQLAgg::kCOUNT, SQLAgg::kCOUNT_IF>(agg_expr->get_aggtype())) {
        return true;
      }
    }
  }
  return false;
}

class CaseExprDetector : public ScalarExprVisitor<bool> {
 public:
  CaseExprDetector() : detect_case_expr_(false) {}

  bool detectCaseExpr(const Analyzer::Expr* expr) const {
    visit(expr);
    return detect_case_expr_;
  }

 protected:
  bool visitCaseExpr(const Analyzer::CaseExpr*) const override {
    detect_case_expr_ = true;
    return true;
  }

 private:
  mutable bool detect_case_expr_;
};

bool has_case_expr_within_groupby_expr(RelAlgExecutionUnit const& ra_exe_unit) {
  if (ra_exe_unit.groupby_exprs.empty() || !ra_exe_unit.groupby_exprs.front()) {
    return false;
  }
  CaseExprDetector detector;
  for (auto expr : ra_exe_unit.groupby_exprs) {
    if (detector.detectCaseExpr(expr.get())) {
      return true;
    }
  }
  return false;
}
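// is_gpu_shared_mem_supported: decide whether the query's output buffer can
// live in GPU shared memory. Broadly, non-grouped COUNT-style aggregates and
// perfect-hash group-by queries with a small enough output buffer qualify;
// varlen targets and unsupported aggregate kinds disqualify a query.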
bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor* query_mem_desc_ptr,
                                 const RelAlgExecutionUnit& ra_exe_unit,
                                 const CudaMgr_Namespace::CudaMgr* cuda_mgr,
                                 const ExecutorDeviceType device_type,
                                 const unsigned cuda_blocksize,
                                 const unsigned num_blocks_per_mp) {
  if (device_type == ExecutorDeviceType::CPU) {
    return false;
  }
  CHECK(query_mem_desc_ptr);
  CHECK(cuda_mgr);
  // (Several preconditions from the original source are elided here, e.g.
  // requiring native shared memory atomics support on the target device.)
  if (query_mem_desc_ptr->getQueryDescriptionType() ==
          QueryDescriptionType::NonGroupedAggregate &&
      g_enable_smem_non_grouped_agg) {
    // TODO: relax this, if necessary
    if (cuda_blocksize < query_mem_desc_ptr->getEntryCount()) {
      return false;
    }
    // Skip shared memory for variable-length targets and for aggregates other
    // than COUNT / COUNT_IF.
    const auto target_infos =
        target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
    std::unordered_set<SQLAgg> supported_aggs{SQLAgg::kCOUNT, SQLAgg::kCOUNT_IF};
    if (std::find_if(target_infos.begin(),
                     target_infos.end(),
                     [&supported_aggs](const TargetInfo& ti) {
                       if (ti.sql_type.is_varlen() ||
                           !supported_aggs.count(ti.agg_kind)) {
                         return true;
                       }
                       return false;
                     }) == target_infos.end()) {
      return true;
    }
  }
  if (query_mem_desc_ptr->getQueryDescriptionType() ==
          QueryDescriptionType::GroupByPerfectHash &&
      g_enable_smem_group_by) {
    // (additional gating conditions are elided in this listing)
    if (cuda_blocksize < query_mem_desc_ptr->getEntryCount()) {
      return false;
    }
    // Use shared memory only when the output buffer fits the per-block budget.
    const size_t shared_memory_threshold_bytes = std::min(
        g_gpu_smem_threshold,
        cuda_mgr->getMinSharedMemoryPerBlockForAllDevices() / num_blocks_per_mp);
    const auto output_buffer_size =
        query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount();
    if (output_buffer_size > shared_memory_threshold_bytes) {
      return false;
    }
    // Again, skip variable-length targets and unsupported aggregates.
    const auto target_infos =
        target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
    std::unordered_set<SQLAgg> supported_aggs{SQLAgg::kCOUNT, SQLAgg::kCOUNT_IF};
    if (std::find_if(target_infos.begin(),
                     target_infos.end(),
                     [&supported_aggs](const TargetInfo& ti) {
                       if (ti.sql_type.is_varlen() ||
                           !supported_aggs.count(ti.agg_kind)) {
                         return true;
                       }
                       return false;
                     }) == target_infos.end()) {
      return true;
    }
  }
  return false;
}
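// serialize_llvm_metadata_footnotes: gather the metadata nodes referenced by
// the generated functions and render them as the numbered footnote section of
// a textual LLVM IR dump.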
std::string serialize_llvm_metadata_footnotes(llvm::Function* query_func,
                                              CgenState* cgen_state) {
  std::string llvm_ir;
  std::unordered_set<llvm::MDNode*> md;

  // Collect the metadata attached to every instruction in the query function.
  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
    for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
      llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
      instr_it->getAllMetadata(imd);
      for (auto [kind, node] : imd) {
        md.insert(node);
      }
    }
  }

  // Do the same for the row function.
  for (auto bb_it = cgen_state->row_func_->begin();
       bb_it != cgen_state->row_func_->end();
       ++bb_it) {
    for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
      llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
      instr_it->getAllMetadata(imd);
      for (auto [kind, node] : imd) {
        md.insert(node);
      }
    }
  }

  // And for the filter function, if there is one.
  if (cgen_state->filter_func_) {
    for (auto bb_it = cgen_state->filter_func_->begin();
         bb_it != cgen_state->filter_func_->end();
         ++bb_it) {
      for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
        llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
        instr_it->getAllMetadata(imd);
        for (auto [kind, node] : imd) {
          md.insert(node);
        }
      }
    }
  }

  // Sort the metadata nodes by their canonical number and serialize them.
  std::map<size_t, std::string> sorted_strings;
  for (auto p : md) {
    std::string str;
    llvm::raw_string_ostream os(str);
    p->print(os, cgen_state->module_, true);
    os.flush();
    auto fields = split(str, {}, 1);
    if (fields.empty() || fields[0].empty()) {
      continue;
    }
    sorted_strings.emplace(std::stoul(fields[0].substr(1)), str);
  }
  llvm_ir += "\n";
  for (auto [id, text] : sorted_strings) {
    llvm_ir += text;
    llvm_ir += "\n";
  }
  return llvm_ir;
}
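// compileWorkUnit: top-level entry point of the JIT pipeline. It builds the
// query memory descriptor, instantiates the query template, generates the row
// function (and optional filter function), wires in error and interrupt
// handling, and hands the module to the CPU or GPU backend.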
std::tuple<CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
Executor::compileWorkUnit(const std::vector<InputTableInfo>& query_infos,
                          const PlanState::DeletedColumnsMap& deleted_cols_map,
                          const RelAlgExecutionUnit& ra_exe_unit,
                          const CompilationOptions& co,
                          const ExecutionOptions& eo,
                          const CudaMgr_Namespace::CudaMgr* cuda_mgr,
                          const bool allow_lazy_fetch,
                          std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
                          const size_t max_groups_buffer_entry_guess,
                          const int8_t crt_min_byte_width,
                          const bool has_cardinality_estimation,
                          ColumnCacheMap& column_cache,
                          RenderInfo* render_info) {
  // Number each codegen invocation so the IR / PTX / ASM log streams can be
  // correlated.
  static std::uint64_t counter = 0;
  counter++;
  VLOG(1) << "CODEGEN #" << counter << ":";
  LOG(IR) << "CODEGEN #" << counter << ":";
  LOG(PTX) << "CODEGEN #" << counter << ":";
  LOG(ASM) << "CODEGEN #" << counter << ":";

  nukeOldState(allow_lazy_fetch, query_infos, deleted_cols_map, &ra_exe_unit);
  addTransientStringLiterals(ra_exe_unit, row_set_mem_owner);

  GroupByAndAggregate group_by_and_aggregate(
      this,
      co.device_type,
      ra_exe_unit,
      query_infos,
      row_set_mem_owner,
      has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
                                 : std::nullopt);
  auto query_mem_desc =
      group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
                                                       max_groups_buffer_entry_guess,
                                                       crt_min_byte_width,
                                                       render_info,
                                                       eo.output_columnar_hint);

  if (query_mem_desc->getQueryDescriptionType() ==
          QueryDescriptionType::GroupByBaselineHash &&
      !has_cardinality_estimation && (!render_info || !render_info->isInSitu()) &&
      !eo.just_explain) {
    const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
    throw CardinalityEstimationRequired(col_range_info.max - col_range_info.min);
  }

  const bool output_columnar = query_mem_desc->didOutputColumnar();
  const bool gpu_shared_mem_optimization =
      is_gpu_shared_mem_supported(query_mem_desc.get(),
                                  ra_exe_unit,
                                  cuda_mgr,
                                  co.device_type,
                                  cuda_mgr ? this->blockSize() : 1,
                                  cuda_mgr ? this->numBlocksPerMP() : 1);
  if (gpu_shared_mem_optimization) {
    // disable interleaved bins optimization on the GPU
    query_mem_desc->setHasInterleavedBinsOnGpu(false);
    LOG(DEBUG1) << "GPU shared memory is used for the " +
                       query_mem_desc->queryDescTypeToString() + " query";
  }
  const GpuSharedMemoryContext gpu_smem_context(
      get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));

  if (co.device_type == ExecutorDeviceType::GPU) {
    const size_t num_count_distinct_descs =
        query_mem_desc->getCountDistinctDescriptorsSize();
    for (size_t i = 0; i < num_count_distinct_descs; i++) {
      const auto& count_distinct_descriptor =
          query_mem_desc->getCountDistinctDescriptor(i);
      // Punt to CPU if the count distinct implementation cannot run on GPU.
      if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::UnorderedSet ||
          (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
           !co.hoist_literals)) {
        throw QueryMustRunOnCpu();
      }
    }

    // Varlen SAMPLE aggregates cannot be distributed across multiple GPUs when
    // the target table is multi-fragmented; punt such queries to CPU.
    for (const auto expr : ra_exe_unit.target_exprs) {
      if (auto gby_expr = dynamic_cast<Analyzer::AggExpr*>(expr)) {
        bool has_multiple_gpus = cuda_mgr ? cuda_mgr->getDeviceCount() > 1 : false;
        if (gby_expr->get_aggtype() == SQLAgg::kSAMPLE && has_multiple_gpus) {
          std::set<const Analyzer::ColumnVar*,
                   bool (*)(const Analyzer::ColumnVar*, const Analyzer::ColumnVar*)>
              colvar_set(Analyzer::ColumnVar::colvar_comp);
          gby_expr->collect_column_var(colvar_set, true);
          for (const auto cv : colvar_set) {
            if (cv->get_type_info().is_varlen()) {
              const auto tbl_key = cv->getTableKey();
              std::for_each(query_infos.begin(),
                            query_infos.end(),
                            [&tbl_key](const InputTableInfo& input_table_info) {
                              if (input_table_info.table_key == tbl_key &&
                                  input_table_info.info.fragments.size() > 1) {
                                throw QueryMustRunOnCpu();
                              }
                            });
            }
          }
        }
      }
    }
  }

  // Read the runtime module template; a shallow copy suffices since the
  // template itself is never mutated.
  CHECK(cgen_state_->module_ == nullptr);
  cgen_state_->set_module_shallow_copy(get_rt_module(), /*always_clone=*/true);

  auto is_gpu = co.device_type == ExecutorDeviceType::GPU;
  if (has_udf_module(is_gpu)) {
    CodeGenerator::link_udf_module(
        get_udf_module(is_gpu), *cgen_state_->module_, cgen_state_.get());
  }
  if (has_rt_udf_module(is_gpu)) {
    CodeGenerator::link_udf_module(
        get_rt_udf_module(is_gpu), *cgen_state_->module_, cgen_state_.get());
  }

  auto agg_fnames =
      get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();

  const bool is_group_by{query_mem_desc->isGroupBy()};
  auto [query_func, row_func_call] =
      is_group_by ? query_group_by_template(cgen_state_->module_,
                                            co.hoist_literals,
                                            *query_mem_desc,
                                            co.device_type,
                                            ra_exe_unit.scan_limit,
                                            gpu_smem_context)
                  : query_template(cgen_state_->module_,
                                   agg_slot_count,
                                   co.hoist_literals,
                                   !!ra_exe_unit.estimator,
                                   gpu_smem_context);
  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);

  cgen_state_->query_func_ = query_func;
  cgen_state_->row_func_call_ = row_func_call;
  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
      &query_func->getEntryBlock().front());

  // Generate the column head fetches up front, so double indirection isn't
  // needed in the inner loop.
  auto& fetch_bb = query_func->front();
  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
                                              &*query_func->arg_begin(),
                                              fetch_ir_builder,
                                              cgen_state_->context_);

  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
                                               is_group_by ? 0 : agg_slot_count,
                                               co.hoist_literals,
                                               cgen_state_->module_,
                                               cgen_state_->context_);
  CHECK(cgen_state_->row_func_);
  cgen_state_->row_func_bb_ =
      llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);

  if (g_enable_filter_function) {
    // (the original gates the filter function on further conditions, elided here)
    auto filter_func_ft =
        llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
    cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
                                                       llvm::Function::ExternalLinkage,
                                                       "filter_func",
                                                       cgen_state_->module_);
    CHECK(cgen_state_->filter_func_);
    cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
        cgen_state_->context_, "entry", cgen_state_->filter_func_);
  }

  cgen_state_->current_func_ = cgen_state_->row_func_;
  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);

  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
  const auto join_loops =
      buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);

  for (auto& simple_qual : ra_exe_unit.simple_quals) {
    plan_state_->addSimpleQual(simple_qual);
  }
  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
  if (is_not_deleted_bb) {
    cgen_state_->row_func_bb_ = is_not_deleted_bb;
  }
  if (!join_loops.empty()) {
    codegenJoinLoops(join_loops,
                     body_execution_unit,
                     group_by_and_aggregate,
                     query_func,
                     cgen_state_->row_func_bb_,
                     *query_mem_desc,
                     co,
                     eo);
  } else {
    const bool can_return_error = compileBody(
        ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
    if (can_return_error || cgen_state_->needs_error_check_ ||
        eo.with_dynamic_watchdog || eo.allow_runtime_query_interrupt) {
      createErrorCheckControlFlow(query_func,
                                  eo.with_dynamic_watchdog,
                                  eo.allow_runtime_query_interrupt,
                                  join_loops,
                                  co.device_type,
                                  group_by_and_aggregate.query_infos_);
    }
  }

  std::vector<llvm::Value*> hoisted_literals;

  if (co.hoist_literals) {
    VLOG(1) << "number of hoisted literals: "
            << cgen_state_->query_func_literal_loads_.size()
            << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
            << " bytes";
  }
  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
    // we have some hoisted literals...
    hoisted_literals = inlineHoistedLiterals();
  }

  // Replace the row func placeholder call with the call to the actual row func.
  std::vector<llvm::Value*> row_func_args;
  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumOperands() - 1; ++i) {
    row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
  }
  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
  // push the hoisted literals arguments, if any
  row_func_args.insert(
      row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
  llvm::ReplaceInstWithInst(
      cgen_state_->row_func_call_,
      llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));

  // Replace the filter func placeholder call the same way.
  if (cgen_state_->filter_func_) {
    std::vector<llvm::Value*> filter_func_args;
    for (auto arg_it = cgen_state_->filter_func_args_.begin();
         arg_it != cgen_state_->filter_func_args_.end();
         ++arg_it) {
      filter_func_args.push_back(arg_it->first);
    }
    llvm::ReplaceInstWithInst(
        cgen_state_->filter_func_call_,
        llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
  }

  // Aggregate
  plan_state_->init_agg_vals_ =
      init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);

  // If GPU shared memory is used, generate the extra components it needs
  // (buffer initialization and the reduction from shared to global memory)
  // and inject them into the already generated query_func.
  if (gpu_smem_context.isSharedMemoryUsed()) {
    GpuSharedMemCodeBuilder gpu_smem_code(
        cgen_state_->module_,
        cgen_state_->context_,
        *query_mem_desc,
        target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc),
        plan_state_->init_agg_vals_,
        executor_id_);  // (final ctor argument restored from context)
    gpu_smem_code.codegen();
    gpu_smem_code.injectFunctionsInto(query_func);

    // helper functions are used for caching purposes later
    cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
    cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
    LOG(IR) << gpu_smem_code.toString();
  }

  auto multifrag_query_func = cgen_state_->module_->getFunction(
      "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
  CHECK(multifrag_query_func);

  // (the original gates this call on the device type; the guard is elided here)
  insertErrorCodeChecker(multifrag_query_func,
                         get_index_by_name(query_func, "error_code"),
                         co.hoist_literals,
                         eo.allow_runtime_query_interrupt);

  bind_query(query_func,
             "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
             multifrag_query_func,
             cgen_state_->module_);

  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
  if (cgen_state_->filter_func_) {
    root_funcs.push_back(cgen_state_->filter_func_);
  }
  auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
      *cgen_state_->module_, root_funcs, {multifrag_query_func});

  // Always inline the row function and the filter function; register spills
  // in the inner loops are too costly otherwise.
  mark_function_always_inline(cgen_state_->row_func_);
  if (cgen_state_->filter_func_) {
    mark_function_always_inline(cgen_state_->filter_func_);
  }

  // Serialize the important LLVM IR functions to text for SQL EXPLAIN.
  const std::string device_str =
      co.device_type == ExecutorDeviceType::CPU ? "CPU" : "GPU";
  std::string llvm_ir =
      serialize_llvm_object(query_func) +
      serialize_llvm_object(cgen_state_->row_func_) +
      (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
                                 : "");
  VLOG(3) << "Unoptimized IR for the " << device_str << "\n"
          << llvm_ir << "\nEnd of IR";
  if (eo.just_explain && co.explain_type == ExecutorExplainType::Optimized) {
#ifdef WITH_JIT_DEBUG
    throw std::runtime_error(
        "Explain optimized not available when JIT runtime debug symbols are enabled");
#else
    // The NVVM reflect pass does not run here; use LOG(IR) to inspect the
    // optimized IR after NVVM reflect.
    llvm::legacy::PassManager pass_manager;
    optimize_ir(query_func,
                cgen_state_->module_,
                pass_manager,
                live_funcs,
                gpu_smem_context.isSharedMemoryUsed(),
                co);
#endif  // WITH_JIT_DEBUG
  }

  LOG(IR) << "IR for the " << device_str;
  // (the IR dump with its metadata footnotes is elided in this listing)

  // Run some basic validation checks on the LLVM IR before generating code.
  AutoTrackBuffersInRuntimeIR();
  verify_function_ir(cgen_state_->row_func_);
  if (cgen_state_->filter_func_) {
    verify_function_ir(cgen_state_->filter_func_);
  }

  // Generate final native code from the LLVM IR.
  return std::make_tuple(
      CompilationResult{
          co.device_type == ExecutorDeviceType::CPU
              ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
              : optimizeAndCodegenGPU(query_func,
                                      multifrag_query_func,
                                      live_funcs,
                                      is_group_by || ra_exe_unit.estimator,
                                      cuda_mgr,
                                      gpu_smem_context.isSharedMemoryUsed(),
                                      co),
          cgen_state_->getLiterals(),
          output_columnar,
          llvm_ir,
          std::move(gpu_smem_context)},
      std::move(query_mem_desc));
}
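// insertErrorCodeChecker: splits the block containing the query stub call so
// the stub's return value is inspected right after the call; the error code
// (optionally merged with the interrupt flag) is recorded and turned into an
// early return through a dedicated .error_exit block.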
void Executor::insertErrorCodeChecker(llvm::Function* query_func,
                                      unsigned const error_code_idx,
                                      bool hoist_literals,
                                      bool allow_runtime_query_interrupt) {
  auto query_stub_func_name =
      "query_stub" + std::string(hoist_literals ? "_hoisted_literals" : "");
  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
    for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
      if (!llvm::isa<llvm::CallInst>(*inst_it)) {
        continue;
      }
      auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
      auto const row_func_name = getCalledFunctionName(row_func_call);
      if (row_func_name && *row_func_name == query_stub_func_name) {
        auto next_inst_it = inst_it;
        ++next_inst_it;
        auto new_bb = bb_it->splitBasicBlock(next_inst_it);
        auto& br_instr = bb_it->back();
        llvm::IRBuilder<> ir_builder(&br_instr);
        llvm::Value* err_lv = &*inst_it;
        auto error_check_bb =
            bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr), ".error_check");
        // query_func does not have parameter names assigned, so access the
        // error code argument by index.
        llvm::Value* const error_code_arg =
            get_arg_by_index(query_func, error_code_idx);
        CHECK(error_code_arg) << error_code_idx << '/' << query_func->arg_size();
        llvm::Value* err_code = nullptr;
        if (allow_runtime_query_interrupt) {
          // Decide the final error code with consideration of the interrupt status.
          auto& check_interrupt_br_instr = bb_it->back();
          auto interrupt_check_bb = llvm::BasicBlock::Create(
              cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
          llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
          auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
              cgen_state_->module_->getFunction("check_interrupt"), {});
          auto detected_error = interrupt_checker_ir_builder.CreateCall(
              cgen_state_->module_->getFunction("get_error_code"),
              std::vector<llvm::Value*>{error_code_arg});
          err_code = interrupt_checker_ir_builder.CreateSelect(
              detected_interrupt,
              cgen_state_->llInt(Executor::ERR_INTERRUPTED),
              detected_error);
          interrupt_checker_ir_builder.CreateBr(error_check_bb);
          llvm::ReplaceInstWithInst(&check_interrupt_br_instr,
                                    llvm::BranchInst::Create(interrupt_check_bb));
          ir_builder.SetInsertPoint(&br_instr);
        } else {
          // Use the error code returned from the row func and store it to the
          // registered query status.
          ir_builder.SetInsertPoint(&br_instr);
          err_code =
              ir_builder.CreateCall(cgen_state_->module_->getFunction("get_error_code"),
                                    std::vector<llvm::Value*>{error_code_arg});
        }
        err_lv = ir_builder.CreateICmp(
            llvm::ICmpInst::ICMP_NE, err_code, cgen_state_->llInt(0));
        auto error_bb = llvm::BasicBlock::Create(
            cgen_state_->context_, ".error_exit", query_func, new_bb);
        llvm::CallInst::Create(cgen_state_->module_->getFunction("record_error_code"),
                               std::vector<llvm::Value*>{err_code, error_code_arg},
                               "",
                               error_bb);
        llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
        llvm::ReplaceInstWithInst(&br_instr,
                                  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
        break;
      }
    }
  }
}
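// codegenSkipDeletedOuterTableRow: when the outer table has a delete column,
// emit a branch at the top of the row function that returns early for rows
// marked as deleted.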
llvm::BasicBlock* Executor::codegenSkipDeletedOuterTableRow(
    const RelAlgExecutionUnit& ra_exe_unit,
    const CompilationOptions& co) {
  if (!co.filter_on_deleted_column) {
    return nullptr;
  }
  CHECK(!ra_exe_unit.input_descs.empty());
  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
    return nullptr;
  }
  const auto& table_key = outer_input_desc.getTableKey();
  const auto deleted_cd = plan_state_->getDeletedColForTable(table_key);
  if (!deleted_cd) {
    return nullptr;
  }
  CHECK(deleted_cd->columnType.is_boolean());
  const auto deleted_expr =
      makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
                                    shared::ColumnKey{table_key, deleted_cd->columnId},
                                    outer_input_desc.getNestLevel());
  CodeGenerator code_generator(this);
  const auto is_deleted =
      code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
  const auto is_deleted_bb = llvm::BasicBlock::Create(
      cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
      cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
  cgen_state_->ir_builder_.SetInsertPoint(bb);
  return bb;
}
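// compileBody: generates the per-row filter and aggregation code. Cheap
// ("primary") quals run first; expensive ("deferred") quals are evaluated
// behind a short-circuit branch so they only run for rows that pass.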
bool Executor::compileBody(const RelAlgExecutionUnit& ra_exe_unit,
                           GroupByAndAggregate& group_by_and_aggregate,
                           QueryMemoryDescriptor& query_mem_desc,
                           const CompilationOptions& co,
                           const GpuSharedMemoryContext& gpu_smem_context) {
  // Switch the code generation into a separate filter function if enabled.
  // Note that accesses to function arguments are still codegenned from the
  // row function's arguments, then later automatically forwarded and
  // remapped into filter function arguments by redeclareFilterFunction().
  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
  llvm::Value* loop_done{nullptr};
  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
  if (cgen_state_->filter_func_) {
    if (cgen_state_->row_func_bb_->getName() == "loop_body") {
      auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
      cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
                                              row_func_entry_bb->begin());
      loop_done = cgen_state_->ir_builder_.CreateAlloca(
          get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
      cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
      cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
    }
    cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
    cgen_state_->current_func_ = cgen_state_->filter_func_;
    fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
  }

  // Generate the code for the filter.
  std::vector<Analyzer::Expr*> primary_quals;
  std::vector<Analyzer::Expr*> deferred_quals;
  bool short_circuited = CodeGenerator::prioritizeQuals(
      ra_exe_unit, primary_quals, deferred_quals, plan_state_->hoisted_filters_);
  if (short_circuited) {
    VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
            << "short-circuited and deferred " << std::to_string(deferred_quals.size())
            << " quals";
  }
  llvm::Value* filter_lv = cgen_state_->llBool(true);
  CodeGenerator code_generator(this);
  for (auto expr : primary_quals) {
    // Generate the filter for primary quals.
    auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
    filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
  }
  CHECK(filter_lv->getType()->isIntegerTy(1));
  llvm::BasicBlock* sc_false{nullptr};
  if (!deferred_quals.empty()) {
    auto sc_true = llvm::BasicBlock::Create(
        cgen_state_->context_, "sc_true", cgen_state_->current_func_);
    sc_false = llvm::BasicBlock::Create(
        cgen_state_->context_, "sc_false", cgen_state_->current_func_);
    cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
    cgen_state_->ir_builder_.SetInsertPoint(sc_false);
    if (ra_exe_unit.join_quals.empty()) {
      cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
    }
    cgen_state_->ir_builder_.SetInsertPoint(sc_true);
    filter_lv = cgen_state_->llBool(true);
  }
  for (auto expr : deferred_quals) {
    filter_lv = cgen_state_->ir_builder_.CreateAnd(
        filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
  }

  CHECK(filter_lv->getType()->isIntegerTy(1));
  auto ret = group_by_and_aggregate.codegen(
      filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);

  // Switch the code generation back to the row function if a filter function
  // was enabled.
  if (cgen_state_->filter_func_) {
    if (cgen_state_->row_func_bb_->getName() == "loop_body") {
      cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
      cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
    }

    cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
    cgen_state_->current_func_ = cgen_state_->row_func_;
    cgen_state_->filter_func_call_ =
        cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});

    // Create the real filter function declaration after the placeholder call
    // is emitted.
    redeclareFilterFunction();

    if (cgen_state_->row_func_bb_->getName() == "loop_body") {
      auto loop_done_true = llvm::BasicBlock::Create(
          cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
      auto loop_done_false = llvm::BasicBlock::Create(
          cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
      auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(
          loop_done->getType()->getPointerElementType(), loop_done);
      cgen_state_->ir_builder_.CreateCondBr(
          loop_done_flag, loop_done_true, loop_done_false);
      cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
      cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
      cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
      cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
    } else {
      cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
    }
  }
  return ret;
}
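// generate_column_heads_load: hoist the loads of the per-column base pointers
// out of the row loop by loading each byte_stream entry once in the entry
// block of the query function.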
std::vector<llvm::Value*> generate_column_heads_load(const int num_columns,
                                                     llvm::Value* byte_stream_arg,
                                                     llvm::IRBuilder<>& ir_builder,
                                                     llvm::LLVMContext& ctx) {
  CHECK(byte_stream_arg);
  const auto max_col_local_id = num_columns - 1;

  std::vector<llvm::Value*> col_heads;
  for (int col_id = 0; col_id <= max_col_local_id; ++col_id) {
    auto* gep = ir_builder.CreateGEP(
        byte_stream_arg->getType()->getScalarType()->getPointerElementType(),
        byte_stream_arg,
        llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), col_id));
    col_heads.emplace_back(
        ir_builder.CreateLoad(gep->getType()->getPointerElementType(), gep));
  }
  return col_heads;
}