// Build-time guard: when LLVM_VERSION_MAJOR is below 9 the static_assert below
// is compiled and always fails, stopping the build with the message it carries.
19 #if LLVM_VERSION_MAJOR < 9
20 static_assert(
false,
"LLVM Version >= 9 is required.");
23 #include <llvm/Analysis/ScopedNoAliasAA.h>
24 #include <llvm/Analysis/TypeBasedAliasAnalysis.h>
25 #include <llvm/Bitcode/BitcodeReader.h>
26 #include <llvm/Bitcode/BitcodeWriter.h>
27 #include <llvm/ExecutionEngine/MCJIT.h>
28 #include <llvm/IR/Attributes.h>
29 #include <llvm/IR/GlobalValue.h>
30 #include <llvm/IR/InstIterator.h>
31 #include <llvm/IR/IntrinsicInst.h>
32 #include <llvm/IR/Intrinsics.h>
33 #include <llvm/IR/LegacyPassManager.h>
34 #include <llvm/IR/Verifier.h>
35 #include <llvm/IRReader/IRReader.h>
36 #if 14 <= LLVM_VERSION_MAJOR
37 #include <llvm/MC/TargetRegistry.h>
39 #include <llvm/Support/TargetRegistry.h>
41 #include <llvm/Support/Casting.h>
42 #include <llvm/Support/FileSystem.h>
43 #include <llvm/Support/FormattedStream.h>
44 #include <llvm/Support/MemoryBuffer.h>
45 #include <llvm/Support/SourceMgr.h>
46 #include <llvm/Support/TargetSelect.h>
47 #include <llvm/Support/raw_os_ostream.h>
48 #include <llvm/Support/raw_ostream.h>
49 #include <llvm/Transforms/IPO.h>
50 #include <llvm/Transforms/IPO/AlwaysInliner.h>
51 #include <llvm/Transforms/IPO/InferFunctionAttrs.h>
52 #include <llvm/Transforms/IPO/PassManagerBuilder.h>
53 #include <llvm/Transforms/InstCombine/InstCombine.h>
54 #include <llvm/Transforms/Instrumentation.h>
55 #include <llvm/Transforms/Scalar.h>
56 #include <llvm/Transforms/Scalar/GVN.h>
57 #include <llvm/Transforms/Scalar/InstSimplifyPass.h>
58 #include <llvm/Transforms/Utils.h>
59 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
60 #include <llvm/Transforms/Utils/Cloning.h>
62 #if LLVM_VERSION_MAJOR >= 11
63 #include <llvm/Support/Host.h>
85 #include <llvm/Support/DynamicLibrary.h>
// File name of the GEOS shared library to load. Declared extern here; the
// definition is not part of the visible code. load_geos_dynamic_library()
// reads it and may reset it to a default value.
88 extern std::unique_ptr<std::string> g_libgeos_so_filename;
// File-local handle to the GEOS library; assigned by
// load_geos_dynamic_library().
90 static llvm::sys::DynamicLibrary geos_dynamic_library;
// Mutex locked by load_geos_dynamic_library() for the duration of the call.
91 static std::mutex geos_init_mutex;
// Loads the GEOS shared library into `geos_dynamic_library` unless that handle
// is already valid.
//
// Side effects:
//   - Holds `geos_init_mutex` for the whole call.
//   - If `g_libgeos_so_filename` is null or empty, logs a warning and replaces
//     the global with "libgeos_c.so" before attempting the load.
//
// Throws:
//   std::runtime_error with "Failed to load GEOS library: " followed by the
//   error text reported by LLVM, when the handle is still invalid after the
//   load attempt.
95 void load_geos_dynamic_library() {
96 std::lock_guard<std::mutex> guard(geos_init_mutex);
// Only attempt a load when no valid handle exists yet.
98 if (!geos_dynamic_library.isValid()) {
// Fall back to a default library name when none was configured.
99 if (!g_libgeos_so_filename || g_libgeos_so_filename->empty()) {
100 LOG(
WARNING) <<
"Misconfigured GEOS library file name, trying 'libgeos_c.so'";
101 g_libgeos_so_filename.reset(
new std::string(
"libgeos_c.so"));
// Local copy of the (possibly just defaulted) file name.
103 auto filename = *g_libgeos_so_filename;
// Receives the loader's error description; used in the exception text below.
104 std::string error_message;
105 geos_dynamic_library =
106 llvm::sys::DynamicLibrary::getPermanentLibrary(
filename.c_str(), &error_message);
// An invalid handle after the call means the load failed.
107 if (!geos_dynamic_library.isValid()) {
109 std::string exception_message =
"Failed to load GEOS library: " + error_message;
110 throw std::runtime_error(exception_message);
123 std::string src =
"",
124 const bool is_gpu =
false) {
125 std::string excname = (is_gpu ?
"NVVM IR ParseError: " :
"LLVM IR ParseError: ");
126 llvm::raw_string_ostream ss(excname);
127 parse_error.print(src.c_str(), ss,
false,
false);
// Debug helper macros. Each one writes "<enclosing function>#<line>: <text of
// the MODULE argument> " to std::cout and then calls ::show_defined(MODULE)
// (SHOW_DEFINED) or ::show_functions(MODULE) (SHOW_FUNCTIONS).
141 #define SHOW_DEFINED(MODULE) \
143 std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
144 ::show_defined(MODULE); \
147 #define SHOW_FUNCTIONS(MODULE) \
149 std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
150 ::show_functions(MODULE); \
153 template <
typename T =
void>
155 std::cout <<
"defines: ";
156 for (
auto&
f : llvm_module.getFunctionList()) {
157 if (!
f.isDeclaration()) {
158 std::cout <<
f.getName().str() <<
", ";
161 std::cout << std::endl;
164 template <
typename T =
void>
166 if (llvm_module ==
nullptr) {
167 std::cout <<
"is null" << std::endl;
173 template <
typename T =
void>
192 template <
typename T =
void>
194 std::unordered_set<std::string>& defined,
195 std::unordered_set<std::string>& undefined,
196 const std::unordered_set<std::string>& ignored) {
197 for (llvm::inst_iterator I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
198 if (
auto* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
199 auto* F2 = CI->getCalledFunction();
201 auto F2name = F2->getName().str();
202 if (F2->isDeclaration()) {
203 if (F2name.rfind(
"__", 0) !=
205 && F2name.rfind(
"llvm.", 0) !=
207 && ignored.find(F2name) == ignored.end()
209 undefined.emplace(F2name);
212 if (defined.find(F2name) == defined.end()) {
213 defined.emplace(F2name);
214 scan_function_calls<T>(*F2, defined, undefined, ignored);
222 template <
typename T =
void>
224 std::unordered_set<std::string>& defined,
225 std::unordered_set<std::string>& undefined,
226 const std::unordered_set<std::string>& ignored) {
227 for (
auto& F : llvm_module) {
228 if (!F.isDeclaration()) {
234 template <
typename T =
void>
235 std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>>
237 const std::unordered_set<std::string>& ignored = {}) {
238 std::unordered_set<std::string> defined, undefined;
240 return std::make_tuple(defined, undefined);
243 #if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
246 const std::unordered_set<llvm::Function*>& live_funcs) {
247 std::vector<llvm::Function*> dead_funcs;
250 if (live_funcs.count(&F)) {
253 for (
auto U : F.users()) {
254 auto* C = llvm::dyn_cast<
const llvm::CallInst>(U);
255 if (!C || C->getParent()->getParent() != &F) {
261 dead_funcs.push_back(&F);
264 for (
auto pFn : dead_funcs) {
265 pFn->eraseFromParent();
// Checks whether `llvm_module` contains a function whose name starts with
// "__nv_".
//
// `llvm_module` is dereferenced without a null check, so it must be non-null.
// NOTE(review): the return statements are not visible here; presumably the
// function returns true when such a name is found and false otherwise - confirm.
273 bool check_module_requires_libdevice(llvm::Module* llvm_module) {
275 for (llvm::Function& F : *llvm_module) {
// Unnamed functions are skipped; only a "__nv_" name prefix counts as a match.
276 if (F.hasName() && F.getName().startswith(
"__nv_")) {
277 LOG(
INFO) <<
"Module requires linking with libdevice: " << std::string(F.getName());
// Message for the case where libdevice is not needed.
281 LOG(
DEBUG1) <<
"module does not require linking against libdevice";
// Visits every instruction of every function in `llvm_module` and, for each
// intrinsic call, requests the intrinsic's declaration in that module.
//
// `llvm_module` is dereferenced without a null check, so it must be non-null.
286 void add_intrinsics_to_module(llvm::Module* llvm_module) {
287 for (llvm::Function& F : *llvm_module) {
288 for (llvm::Instruction& I : instructions(F)) {
289 if (llvm::IntrinsicInst* ii = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
// Overloaded intrinsics need explicit type arguments to pick a declaration.
290 if (llvm::Intrinsic::isOverloaded(ii->getIntrinsicID())) {
// Only the call's return type is supplied as the overload type.
// NOTE(review): intrinsics overloaded on parameter types may need more
// entries here - confirm.
291 llvm::Type* Tys[] = {ii->getFunctionType()->getReturnType()};
292 llvm::Function& decl_fn =
293 *llvm::Intrinsic::getDeclaration(llvm_module, ii->getIntrinsicID(), Tys);
// Re-point the call at the declaration obtained for this module.
294 ii->setCalledFunction(&decl_fn);
// NOTE(review): presumably the non-overloaded branch; the result of this call
// is not used on this line.
297 llvm::Intrinsic::getDeclaration(llvm_module, ii->getIntrinsicID());
307 llvm::Module* llvm_module,
308 llvm::legacy::PassManager& pass_manager,
309 const std::unordered_set<llvm::Function*>& live_funcs,
310 const bool is_gpu_smem_used,
314 pass_manager.add(llvm::createVerifierPass());
315 pass_manager.add(llvm::createAlwaysInlinerLegacyPass());
319 pass_manager.add(llvm::createSROAPass());
323 llvm::createEarlyCSEPass(
true));
325 if (!is_gpu_smem_used) {
330 pass_manager.add(llvm::createJumpThreadingPass());
332 pass_manager.add(llvm::createCFGSimplificationPass());
335 pass_manager.add(llvm::createNewGVNPass());
337 pass_manager.add(llvm::createDeadStoreEliminationPass());
338 pass_manager.add(llvm::createLICMPass());
340 pass_manager.add(llvm::createInstructionCombiningPass());
343 pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
344 pass_manager.add(llvm::createGlobalOptimizerPass());
346 pass_manager.add(llvm::createCFGSimplificationPass());
348 pass_manager.run(*llvm_module);
359 : execution_engine_(execution_engine) {}
363 : execution_engine_(execution_engine) {
366 #ifdef ENABLE_INTEL_JIT_LISTENER
370 LOG(
INFO) <<
"Registered IntelJITEventListener";
372 LOG(
WARNING) <<
"This build is not Intel JIT Listener enabled. Ignoring Intel JIT "
373 "listener configuration parameter.";
374 #endif // ENABLE_INTEL_JIT_LISTENER
380 llvm::ExecutionEngine* execution_engine) {
387 std::stringstream err_ss;
388 llvm::raw_os_ostream err_os(err_ss);
389 err_os <<
"\n-----\n";
390 if (llvm::verifyFunction(*func, &err_os)) {
391 err_os <<
"\n-----\n";
392 func->print(err_os,
nullptr);
393 err_os <<
"\n-----\n";
401 llvm::Module* llvm_module) {
402 llvm::legacy::PassManager pass_manager;
403 auto cpu_target_machine = execution_engine->getTargetMachine();
404 CHECK(cpu_target_machine);
405 llvm::SmallString<256> code_str;
406 llvm::raw_svector_ostream os(code_str);
407 #if LLVM_VERSION_MAJOR >= 10
408 cpu_target_machine->addPassesToEmitFile(
409 pass_manager, os,
nullptr, llvm::CGFT_AssemblyFile);
411 cpu_target_machine->addPassesToEmitFile(
412 pass_manager, os,
nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
414 pass_manager.run(*llvm_module);
415 return "Assembly for the CPU:\n" + std::string(code_str.str()) +
"\nEnd of assembly";
419 llvm::EngineBuilder& eb,
423 CHECK(execution_engine.get());
425 llvm_module->setDataLayout(execution_engine->getDataLayout());
429 execution_engine->finalizeObject();
430 return execution_engine;
438 llvm::Function* func,
439 const std::unordered_set<llvm::Function*>& live_funcs,
442 llvm::Module* llvm_module = func->getParent();
445 #ifndef WITH_JIT_DEBUG
446 llvm::legacy::PassManager pass_manager;
448 func, llvm_module, pass_manager, live_funcs,
false, co);
449 #endif // WITH_JIT_DEBUG
462 auto init_err = llvm::InitializeNativeTarget();
465 llvm::InitializeAllTargetMCs();
466 llvm::InitializeNativeTargetAsmPrinter();
467 llvm::InitializeNativeTargetAsmParser();
470 std::unique_ptr<llvm::Module> owner(llvm_module);
472 llvm::EngineBuilder eb(std::move(owner));
473 eb.setErrorStr(&err_str);
474 eb.setEngineKind(llvm::EngineKind::JIT);
475 llvm::TargetOptions to;
476 to.EnableFastISel =
true;
477 eb.setTargetOptions(to);
486 llvm::Function* query_func,
487 llvm::Function* multifrag_query_func,
488 const std::unordered_set<llvm::Function*>& live_funcs,
493 llvm::Module* M = query_func->getParent();
494 auto* flag = llvm::mdconst::extract_or_null<llvm::ConstantInt>(
495 M->getModuleFlag(
"manage_memory_buffer"));
496 if (flag and flag->getZExtValue() == 1 and M->getFunction(
"allocate_varlen_buffer") and
497 M->getFunction(
"register_buffer_with_executor_rsm")) {
498 LOG(
INFO) <<
"including executor addr to cache key\n";
501 if (cgen_state_->filter_func_) {
504 for (
const auto helper : cgen_state_->helper_functions_) {
512 if (cgen_state_->needs_geos_) {
514 auto llvm_module = multifrag_query_func->getParent();
515 load_geos_dynamic_library();
518 auto rt_geos_module_copy = llvm::CloneModule(
519 *get_geos_module(), cgen_state_->vmap_, [](
const llvm::GlobalValue* gv) {
520 auto func = llvm::dyn_cast<llvm::Function>(gv);
524 switch (func->getLinkage()) {
525 case llvm::GlobalValue::LinkageTypes::InternalLinkage:
526 case llvm::GlobalValue::LinkageTypes::PrivateLinkage:
527 case llvm::GlobalValue::LinkageTypes::ExternalLinkage:
528 case llvm::GlobalValue::LinkageTypes::LinkOnceODRLinkage:
537 llvm::Linker::Flags::LinkOnlyNeeded);
539 throw std::runtime_error(
"GEOS is disabled in this build");
543 auto execution_engine =
545 auto cpu_compilation_context =
546 std::make_shared<CpuCompilationContext>(std::move(execution_engine));
547 cpu_compilation_context->setFunctionPointer(multifrag_query_func);
553 llvm::Module& llvm_module,
555 llvm::Linker::Flags flags) {
559 for (
auto&
f : *udf_module) {
560 auto func = llvm_module.getFunction(
f.getName());
562 LOG(
ERROR) <<
" Attempt to overwrite " <<
f.getName().str() <<
" in "
563 << llvm_module.getModuleIdentifier() <<
" from `"
564 << udf_module->getModuleIdentifier() <<
"`" << std::endl;
565 throw std::runtime_error(
566 "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
569 VLOG(1) <<
" Adding " <<
f.getName().str() <<
" to "
570 << llvm_module.getModuleIdentifier() <<
" from `"
571 << udf_module->getModuleIdentifier() <<
"`" << std::endl;
575 auto udf_module_copy = llvm::CloneModule(*udf_module, cgen_state->
vmap_);
577 udf_module_copy->setDataLayout(llvm_module.getDataLayout());
578 udf_module_copy->setTargetTriple(llvm_module.getTargetTriple());
581 llvm::Linker ld(llvm_module);
582 bool link_error =
false;
584 link_error = ld.linkInModule(std::move(udf_module_copy), flags);
587 throw std::runtime_error(
"link_udf_module: *** error linking module ***");
597 if (s ==
"int16_t") {
600 if (s ==
"int32_t") {
603 if (s ==
"int64_t") {
606 CHECK(s ==
"float" || s ==
"double");
612 for (
const std::string any_or_all : {
"any",
"all"}) {
613 for (
const std::string elem_type :
614 {
"int8_t",
"int16_t",
"int32_t",
"int64_t",
"float",
"double"}) {
615 for (
const std::string needle_type :
616 {
"int8_t",
"int16_t",
"int32_t",
"int64_t",
"float",
"double"}) {
617 for (
const std::string op_name : {
"eq",
"ne",
"lt",
"le",
"gt",
"ge"}) {
618 result += (
"declare i1 @array_" + any_or_all +
"_" + op_name +
"_" + elem_type +
630 for (
const std::string key_type : {
"int8_t",
"int16_t",
"int32_t",
"int64_t"}) {
632 result +=
"declare i64 @translate_null_key_" + key_type +
"(" + key_llvm_type +
", " +
633 key_llvm_type +
", i64);\n";
639 R
"(
declare void @llvm.dbg.declare(metadata, metadata, metadata)
declare void @llvm.dbg.value(metadata, metadata, metadata)
declare double @llvm.fmuladd.f64(double, double, double)
declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
declare i64 @get_thread_index();
declare i64 @get_block_index();
declare i32 @pos_start_impl(i32*);
declare i32 @group_buff_idx_impl();
declare i32 @pos_step_impl();
declare i8 @thread_warp_idx(i8);
declare i64* @init_shared_mem(i64*, i32);
declare i64* @init_shared_mem_nop(i64*, i32);
declare i64* @declare_dynamic_shared_memory();
declare void @write_back_nop(i64*, i64*, i32);
declare void @write_back_non_grouped_agg(i64*, i64*, i32);
declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8);
declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32);
declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32);
declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32);
declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32);
declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32);
declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32);
declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64);
declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64);
declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64);
declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64);
declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64);
declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double);
declare i64 @get_bucket_key_for_range_double(i8*, i64, double);
declare i32 @get_num_buckets_for_bounds(i8*, i32, double, double);
declare i64 @get_candidate_rows(i32*, i32, i8*, i32, double, double, i32, i64, i64*, i64, i64, i64);
declare i64 @agg_count_shared(i64*, i64);
declare i64 @agg_count_skip_val_shared(i64*, i64, i64);
declare i32 @agg_count_int32_shared(i32*, i32);
declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32);
declare i64 @agg_count_double_shared(i64*, double);
declare i64 @agg_count_double_skip_val_shared(i64*, double, double);
declare i32 @agg_count_float_shared(i32*, float);
declare i32 @agg_count_float_skip_val_shared(i32*, float, float);
declare i64 @agg_count_if_shared(i64*, i64);
declare i64 @agg_count_if_skip_val_shared(i64*, i64, i64);
declare i32 @agg_count_if_int32_shared(i32*, i32);
declare i32 @agg_count_if_int32_skip_val_shared(i32*, i32, i32);
declare i64 @agg_sum_shared(i64*, i64);
declare i64 @agg_sum_skip_val_shared(i64*, i64, i64);
declare i32 @agg_sum_int32_shared(i32*, i32);
declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_sum_double_shared(i64*, double);
declare void @agg_sum_double_skip_val_shared(i64*, double, double);
declare void @agg_sum_float_shared(i32*, float);
declare void @agg_sum_float_skip_val_shared(i32*, float, float);
declare i64 @agg_sum_if_shared(i64*, i64, i8);
declare i64 @agg_sum_if_skip_val_shared(i64*, i64, i64, i8);
declare i32 @agg_sum_if_int32_shared(i32*, i32, i8);
declare i32 @agg_sum_if_int32_skip_val_shared(i32*, i32, i32, i8);
declare void @agg_sum_if_double_shared(i64*, double, i8);
declare void @agg_sum_if_double_skip_val_shared(i64*, double, double, i8);
declare void @agg_sum_if_float_shared(i32*, float, i8);
declare void @agg_sum_if_float_skip_val_shared(i32*, float, float, i8);
declare void @agg_max_shared(i64*, i64);
declare void @agg_max_skip_val_shared(i64*, i64, i64);
declare void @agg_max_int32_shared(i32*, i32);
declare void @agg_max_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_max_int16_shared(i16*, i16);
declare void @agg_max_int16_skip_val_shared(i16*, i16, i16);
declare void @agg_max_int8_shared(i8*, i8);
declare void @agg_max_int8_skip_val_shared(i8*, i8, i8);
declare void @agg_max_double_shared(i64*, double);
declare void @agg_max_double_skip_val_shared(i64*, double, double);
declare void @agg_max_float_shared(i32*, float);
declare void @agg_max_float_skip_val_shared(i32*, float, float);
declare void @agg_min_shared(i64*, i64);
declare void @agg_min_skip_val_shared(i64*, i64, i64);
declare void @agg_min_int32_shared(i32*, i32);
declare void @agg_min_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_min_int16_shared(i16*, i16);
declare void @agg_min_int16_skip_val_shared(i16*, i16, i16);
declare void @agg_min_int8_shared(i8*, i8);
declare void @agg_min_int8_skip_val_shared(i8*, i8, i8);
declare void @agg_min_double_shared(i64*, double);
declare void @agg_min_double_skip_val_shared(i64*, double, double);
declare void @agg_min_float_shared(i32*, float);
declare void @agg_min_float_skip_val_shared(i32*, float, float);
declare void @agg_id_shared(i64*, i64);
declare i8* @agg_id_varlen_shared(i8*, i64, i8*, i64);
declare void @agg_id_int32_shared(i32*, i32);
declare void @agg_id_int16_shared(i16*, i16);
declare void @agg_id_int8_shared(i8*, i8);
declare void @agg_id_double_shared(i64*, double);
declare void @agg_id_double_shared_slow(i64*, double*);
declare void @agg_id_float_shared(i32*, float);
declare i32 @checked_single_agg_id_shared(i64*, i64, i64);
declare i32 @checked_single_agg_id_double_shared(i64*, double, double);
declare i32 @checked_single_agg_id_double_shared_slow(i64*, double*, double);
declare i32 @checked_single_agg_id_float_shared(i32*, float, float);
declare i1 @slotEmptyKeyCAS(i64*, i64, i64);
declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32);
declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16);
declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8);
declare i64 @datetrunc_century(i64);
declare i64 @datetrunc_day(i64);
declare i64 @datetrunc_decade(i64);
declare i64 @datetrunc_hour(i64);
declare i64 @datetrunc_millennium(i64);
declare i64 @datetrunc_minute(i64);
declare i64 @datetrunc_month(i64);
declare i64 @datetrunc_quarter(i64);
declare i64 @datetrunc_quarterday(i64);
declare i64 @datetrunc_week_monday(i64);
declare i64 @datetrunc_week_sunday(i64);
declare i64 @datetrunc_week_saturday(i64);
declare i64 @datetrunc_year(i64);
declare i64 @extract_epoch(i64);
declare i64 @extract_dateepoch(i64);
declare i64 @extract_quarterday(i64);
declare i64 @extract_hour(i64);
declare i64 @extract_minute(i64);
declare i64 @extract_second(i64);
declare i64 @extract_millisecond(i64);
declare i64 @extract_microsecond(i64);
declare i64 @extract_nanosecond(i64);
declare i64 @extract_dow(i64);
declare i64 @extract_isodow(i64);
declare i64 @extract_day(i64);
declare i64 @extract_week_monday(i64);
declare i64 @extract_week_sunday(i64);
declare i64 @extract_week_saturday(i64);
declare i64 @extract_day_of_year(i64);
declare i64 @extract_month(i64);
declare i64 @extract_quarter(i64);
declare i64 @extract_year(i64);
declare i64 @ExtractTimeFromHPTimestamp(i64,i64);
declare i64 @ExtractTimeFromHPTimestampNullable(i64,i64,i64);
declare i64 @ExtractTimeFromLPTimestamp(i64);
declare i64 @ExtractTimeFromLPTimestampNullable(i64,i64);
declare i64 @DateTruncateHighPrecisionToDate(i64, i64);
declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64);
declare i64 @DateDiff(i32, i64, i64);
declare i64 @DateDiffNullable(i32, i64, i64, i64);
declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i32);
declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i32, i64);
declare i64 @DateAdd(i32, i64, i64);
declare i64 @DateAddNullable(i32, i64, i64, i64);
declare i64 @DateAddHighPrecision(i32, i64, i64, i32);
declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i32, i64);
declare {i8*,i64} @string_decode(i8*, i64);
declare i32 @array_size(i8*, i64, i32);
declare i32 @array_size_nullable(i8*, i64, i32, i32);
declare i32 @array_size_1_nullable(i8*, i64, i32);
declare i32 @fast_fixlen_array_size(i8*, i32);
declare i1 @array_is_null(i8*, i64);
declare i1 @point_coord_array_is_null(i8*, i64);
declare i8* @array_buff(i8*, i64);
declare i8* @fast_fixlen_array_buff(i8*, i64);
declare i64 @determine_fixed_array_len(i8*, i64);
declare i8 @array_at_int8_t(i8*, i64, i32);
declare i16 @array_at_int16_t(i8*, i64, i32);
declare i32 @array_at_int32_t(i8*, i64, i32);
declare i64 @array_at_int64_t(i8*, i64, i32);
declare float @array_at_float(i8*, i64, i32);
declare double @array_at_double(i8*, i64, i32);
declare i8 @varlen_array_at_int8_t(i8*, i64, i32);
declare i16 @varlen_array_at_int16_t(i8*, i64, i32);
declare i32 @varlen_array_at_int32_t(i8*, i64, i32);
declare i64 @varlen_array_at_int64_t(i8*, i64, i32);
declare float @varlen_array_at_float(i8*, i64, i32);
declare double @varlen_array_at_double(i8*, i64, i32);
declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32);
declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32);
declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32);
declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32);
declare float @varlen_notnull_array_at_float(i8*, i64, i32);
declare double @varlen_notnull_array_at_double(i8*, i64, i32);
declare i8 @array_at_int8_t_checked(i8*, i64, i64, i8);
declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16);
declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32);
declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64);
declare float @array_at_float_checked(i8*, i64, i64, float);
declare double @array_at_double_checked(i8*, i64, i64, double);
declare i32 @char_length(i8*, i32);
declare i32 @char_length_nullable(i8*, i32, i32);
declare i32 @char_length_encoded(i8*, i32);
declare i32 @char_length_encoded_nullable(i8*, i32, i32);
declare i32 @key_for_string_encoded(i32);
declare i1 @sample_ratio(double, i64);
declare double @width_bucket(double, double, double, double, i32);
declare double @width_bucket_reverse(double, double, double, double, i32);
declare double @width_bucket_nullable(double, double, double, double, i32, double);
declare double @width_bucket_reversed_nullable(double, double, double, double, i32, double);
declare double @width_bucket_no_oob_check(double, double, double);
declare double @width_bucket_reverse_no_oob_check(double, double, double);
declare double @width_bucket_expr(double, i1, double, double, i32);
declare double @width_bucket_expr_nullable(double, i1, double, double, i32, double);
declare double @width_bucket_expr_no_oob_check(double, i1, double, double, i32);
declare i1 @string_like(i8*, i32, i8*, i32, i8);
declare i1 @string_ilike(i8*, i32, i8*, i32, i8);
declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8);
declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8);
declare i1 @string_like_simple(i8*, i32, i8*, i32);
declare i1 @string_ilike_simple(i8*, i32, i8*, i32);
declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8);
declare i1 @string_lt(i8*, i32, i8*, i32);
declare i1 @string_le(i8*, i32, i8*, i32);
declare i1 @string_gt(i8*, i32, i8*, i32);
declare i1 @string_ge(i8*, i32, i8*, i32);
declare i1 @string_eq(i8*, i32, i8*, i32);
declare i1 @string_ne(i8*, i32, i8*, i32);
declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8);
declare i1 @regexp_like(i8*, i32, i8*, i32, i8);
declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8);
declare void @linear_probabilistic_count(i8*, i32, i8*, i32);
declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, i64, i64);
declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64, i64);
declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64);
declare void @record_error_code(i32, i32*);
declare i32 @get_error_code(i32*);
declare i1 @dynamic_watchdog();
declare i1 @check_interrupt();
declare void @force_sync();
declare void @sync_warp();
declare void @sync_warp_protected(i64, i64);
declare void @sync_threadblock();
declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32);
declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64);
declare i64* @get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, float, float);
declare i64* @get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double);
declare double @decompress_x_coord_geoint(i32);
declare double @decompress_y_coord_geoint(i32);
declare i32 @compress_x_coord_geoint(double);
declare i32 @compress_y_coord_geoint(double);
declare i64 @fixed_width_date_encode(i64, i32, i64);
declare i64 @fixed_width_date_decode(i64, i32, i64);
)" + gen_array_any_all_sigs() +
// Examines the operands of `inst`, looking for one that is an intrinsic call
// with ID llvm::Intrinsic::stacksave.
// NOTE(review): the return statements are not visible here; presumably returns
// true when such an operand exists and false otherwise - confirm.
645 bool check_any_operand_is_stacksave_intrinsic(llvm::Instruction& inst) {
646 for (
auto op_it = inst.op_begin(); op_it != inst.op_end(); op_it++) {
// Operands that are not intrinsic calls yield a null pointer and are skipped.
647 if (
const llvm::IntrinsicInst* inst2 = llvm::dyn_cast<llvm::IntrinsicInst>(*op_it)) {
648 if (inst2->getIntrinsicID() == llvm::Intrinsic::stacksave) {
657 std::string extension_function_decls(
const std::unordered_set<std::string>& udf_decls) {
// Removes stacksave, stackrestore and lifetime.start/lifetime.end intrinsic
// calls from `query_func`, together with PHI nodes that have a stacksave
// intrinsic as an operand.
//
// Instructions are first collected into three vectors while walking the
// function and are erased only afterwards.
// `query_func` is dereferenced without a null check, so it must be non-null.
663 void legalize_nvvm_ir(llvm::Function* query_func) {
// Collected instructions, grouped by kind so they can be erased in a fixed
// order below.
670 std::vector<llvm::Instruction*> stackrestore_intrinsics;
671 std::vector<llvm::Instruction*> stacksave_intrinsics;
672 std::vector<llvm::Instruction*> lifetime;
673 for (
auto& BB : *query_func) {
674 for (llvm::Instruction& I : BB) {
675 if (llvm::dyn_cast<llvm::PHINode>(&I)) {
// A PHI node fed by a stacksave is queued in the same list as the stacksave
// calls themselves.
676 if (check_any_operand_is_stacksave_intrinsic(I)) {
679 stacksave_intrinsics.push_back(&I);
680 VLOG(2) <<
"Remove PHI node having llvm::stacksave intrinsic as its operand";
682 }
else if (
const llvm::IntrinsicInst* II =
683 llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
// Sort intrinsic calls into the matching list by intrinsic ID.
684 if (II->getIntrinsicID() == llvm::Intrinsic::stacksave) {
685 stacksave_intrinsics.push_back(&I);
686 }
else if (II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
687 stackrestore_intrinsics.push_back(&I);
688 }
else if (II->getIntrinsicID() == llvm::Intrinsic::lifetime_start ||
689 II->getIntrinsicID() == llvm::Intrinsic::lifetime_end) {
690 lifetime.push_back(&I);
// Erase stackrestore calls first.
// NOTE(review): presumably because they use the stacksave results erased
// next - confirm.
699 for (
auto& II : stackrestore_intrinsics) {
700 II->eraseFromParent();
// Then the stacksave calls and the PHI nodes collected with them.
702 for (
auto& II : stacksave_intrinsics) {
703 II->eraseFromParent();
// Finally the lifetime.start / lifetime.end markers.
706 for (
auto& II : lifetime) {
707 II->eraseFromParent();
715 return llvm::StringRef(
"nvptx64-nvidia-cuda");
719 return llvm::StringRef(
720 "e-p:64:64:64-i1:8:8-i8:8:8-"
721 "i16:16:16-i32:32:32-i64:64:64-"
722 "f32:32:32-f64:64:64-v16:16:16-"
723 "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
727 std::map<std::string, std::string>
result;
729 result.insert(std::make_pair(
"cpu_name", llvm::sys::getHostCPUName()));
730 result.insert(std::make_pair(
"cpu_triple", llvm::sys::getProcessTriple()));
732 std::make_pair(
"cpu_cores",
std::to_string(llvm::sys::getHostNumPhysicalCores())));
736 std::string sizeof_types;
739 sizeof_types +=
"ssize_t:" +
std::to_string(
sizeof(ssize_t)) +
";";
741 sizeof_types +=
"uchar:" +
std::to_string(
sizeof(
unsigned char)) +
";";
743 sizeof_types +=
"ushort:" +
std::to_string(
sizeof(
unsigned short int)) +
";";
745 sizeof_types +=
"uint:" +
std::to_string(
sizeof(
unsigned int)) +
";";
747 sizeof_types +=
"ulong:" +
std::to_string(
sizeof(
unsigned long int)) +
";";
748 sizeof_types +=
"longlong:" +
std::to_string(
sizeof(
long long int)) +
";";
749 sizeof_types +=
"ulonglong:" +
std::to_string(
sizeof(
unsigned long long int)) +
";";
752 sizeof_types +=
"longdouble:" +
std::to_string(
sizeof(
long double)) +
";";
755 result.insert(std::make_pair(
"type_sizeof", sizeof_types));
757 std::string null_values;
758 null_values +=
"boolean1:" +
std::to_string(serialized_null_value<bool>()) +
";";
759 null_values +=
"boolean8:" +
std::to_string(serialized_null_value<int8_t>()) +
";";
760 null_values +=
"int8:" +
std::to_string(serialized_null_value<int8_t>()) +
";";
761 null_values +=
"int16:" +
std::to_string(serialized_null_value<int16_t>()) +
";";
762 null_values +=
"int32:" +
std::to_string(serialized_null_value<int32_t>()) +
";";
763 null_values +=
"int64:" +
std::to_string(serialized_null_value<int64_t>()) +
";";
764 null_values +=
"uint8:" +
std::to_string(serialized_null_value<uint8_t>()) +
";";
765 null_values +=
"uint16:" +
std::to_string(serialized_null_value<uint16_t>()) +
";";
766 null_values +=
"uint32:" +
std::to_string(serialized_null_value<uint32_t>()) +
";";
767 null_values +=
"uint64:" +
std::to_string(serialized_null_value<uint64_t>()) +
";";
768 null_values +=
"float32:" +
std::to_string(serialized_null_value<float>()) +
";";
769 null_values +=
"float64:" +
std::to_string(serialized_null_value<double>()) +
";";
771 "Array<boolean8>:" +
std::to_string(serialized_null_value<int8_t, true>()) +
";";
773 "Array<int8>:" +
std::to_string(serialized_null_value<int8_t, true>()) +
";";
775 "Array<int16>:" +
std::to_string(serialized_null_value<int16_t, true>()) +
";";
777 "Array<int32>:" +
std::to_string(serialized_null_value<int32_t, true>()) +
";";
779 "Array<int64>:" +
std::to_string(serialized_null_value<int64_t, true>()) +
";";
781 "Array<float32>:" +
std::to_string(serialized_null_value<float, true>()) +
";";
783 "Array<float64>:" +
std::to_string(serialized_null_value<double, true>()) +
";";
785 result.insert(std::make_pair(
"null_values", null_values));
787 llvm::StringMap<bool> cpu_features;
788 if (llvm::sys::getHostCPUFeatures(cpu_features)) {
789 std::string features_str =
"";
790 for (
auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
791 features_str += (it->getValue() ?
" +" :
" -");
792 features_str += it->getKey().str();
794 result.insert(std::make_pair(
"cpu_features", features_str));
797 result.insert(std::make_pair(
"llvm_version",
804 int device_count = 0;
808 char device_name[256];
809 int major = 0, minor = 0;
814 &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
816 &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
819 result.insert(std::make_pair(
"gpu_name", device_name));
820 result.insert(std::make_pair(
"gpu_count",
std::to_string(device_count)));
821 result.insert(std::make_pair(
"gpu_compute_capability",
825 result.insert(std::make_pair(
"gpu_driver",
831 std::make_pair(
"gpu_has_libdevice",
// Walks call instructions starting from `roots`, using a work queue and a
// `visited` set, and returns an unordered_set of llvm::Function pointers.
// NOTE(review): presumably the result is the set of functions reachable from
// `roots`; the enqueue and return statements are not visible here - confirm.
843 std::unordered_set<llvm::Function*> findAliveRuntimeFuncs(
844 llvm::Module& llvm_module,
845 const std::vector<llvm::Function*>& roots) {
// Work list of functions still to be scanned, plus the functions seen so far.
846 std::queue<llvm::Function*> queue;
847 std::unordered_set<llvm::Function*> visited;
848 for (llvm::Function* F : roots) {
// Keep taking the function at the front of the work list until it is empty.
852 while (!queue.empty()) {
853 llvm::Function* F = queue.front();
// A function already present in `visited` is detected before its body is scanned.
855 if (visited.find(F) != visited.end()) {
// Inspect every instruction of F; only call instructions are of interest.
860 for (llvm::inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
861 if (llvm::CallInst* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
// Inline-asm calls are tested for separately, before the callee is looked up.
862 if (CI->isInlineAsm()) {
865 llvm::Function* called = CI->getCalledFunction();
// Calls with a null callee and callees already in `visited` are singled out here.
866 if (!called || visited.find(called) != visited.end()) {
// Visible parameters: the module to process, the pass-manager builder used to
// populate the function pass pipeline below, and the GPU target (supplies
// cgen_state and the NVPTX target machine).
// NOTE(review): `executor` is used below but declared outside the visible
// lines; presumably an earlier parameter - confirm.
881 llvm::Module& llvm_module,
882 llvm::PassManagerBuilder& pass_manager_builder,
883 const GPUTarget& gpu_target) {
// Fail with std::runtime_error when the executor has no libdevice module.
887 if (!executor->has_libdevice_module()) {
889 throw std::runtime_error(
890 "libdevice library is not available but required by the UDF module");
// Every function that has a body in the module is recorded as a root.
894 std::vector<llvm::Function*> roots;
895 for (llvm::Function& fn : llvm_module) {
896 if (!fn.isDeclaration()) {
897 roots.emplace_back(&fn);
// NOTE(review): the two lines below are trailing arguments of a call whose
// start is not visible; presumably the step that links libdevice into the
// module with the OverrideFromSrc linker flag - confirm.
904 gpu_target.cgen_state,
905 llvm::Linker::Flags::OverrideFromSrc);
// Ask findAliveRuntimeFuncs which functions to keep, given the roots above.
907 std::unordered_set<llvm::Function*> live_funcs =
908 findAliveRuntimeFuncs(llvm_module, roots);
// Functions not in live_funcs are gathered first and erased in a separate
// loop, rather than erased while the module is being iterated.
910 std::vector<llvm::Function*> funcs_to_delete;
911 for (llvm::Function& fn : llvm_module) {
912 if (!live_funcs.count(&fn)) {
914 funcs_to_delete.emplace_back(&fn);
918 for (llvm::Function*
f : funcs_to_delete) {
919 f->eraseFromParent();
// Module flag with Override behaviour and value 1. With LLVM >= 11 it is set
// through setModuleFlag with explicit i32 metadata; the addModuleFlag call
// names the flag nvvm-reflect-ftz.
// NOTE(review): presumably both calls target the same flag and sit in the two
// arms of this #if; the flag name of the first call is not visible - confirm.
923 #if LLVM_VERSION_MAJOR >= 11
924 llvm::LLVMContext& ctx = llvm_module.getContext();
925 llvm_module.setModuleFlag(llvm::Module::Override,
927 llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
928 llvm::Type::getInt32Ty(ctx), uint32_t(1))));
930 llvm_module.addModuleFlag(llvm::Module::Override,
"nvvm-reflect-ftz", uint32_t(1));
// Every function in the module gets the string attribute nvptx-f32ftz=true.
932 for (llvm::Function& fn : llvm_module) {
933 fn.addFnAttr(
"nvptx-f32ftz",
"true");
// The NVPTX target machine adjusts the builder, which then populates a legacy
// function pass manager bound to this module.
937 gpu_target.nvptx_target_machine->adjustPassManager(pass_manager_builder);
938 llvm::legacy::FunctionPassManager FPM(&llvm_module);
939 pass_manager_builder.populateFunctionPassManager(FPM);
// Initialization, a loop over all functions, then finalization.
// NOTE(review): the loop body is not visible; presumably it runs FPM on each
// function - confirm.
942 FPM.doInitialization();
943 for (
auto& F : llvm_module) {
946 FPM.doFinalization();
// Visible parameters:
//   func, wrapper_func  - functions living in the same module; wrapper_func is
//                         annotated as the kernel below.
//   live_funcs, is_gpu_smem_used - forwarded to optimize_ir.
//   gpu_target          - supplies cgen_state, nvptx_target_machine, cuda_mgr
//                         and row_func_not_inlined.
// Returns gpu_compilation_context, a shared_ptr to GpuCompilationContext.
// NOTE(review): `executor` and `co` are used below but declared outside the
// visible lines.
952 llvm::Function* func,
953 llvm::Function* wrapper_func,
954 const std::unordered_set<llvm::Function*>& live_funcs,
955 const bool is_gpu_smem_used,
957 const GPUTarget& gpu_target) {
960 auto llvm_module = func->getParent();
// Sanity checks: the module of func is the one held by the codegen state, and
// wrapper_func lives in that same module.
981 CHECK(gpu_target.cgen_state->module_ == llvm_module);
982 CHECK(func->getParent() == wrapper_func->getParent());
// Pin an explicit data layout string and the nvptx64-nvidia-cuda triple.
983 llvm_module->setDataLayout(
984 "e-p:64:64:64-i1:8:8-i8:8:8-"
985 "i16:16:16-i32:32:32-i64:64:64-"
986 "f32:32:32-f64:64:64-v16:16:16-"
987 "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
988 llvm_module->setTargetTriple(
"nvptx64-nvidia-cuda");
989 CHECK(gpu_target.nvptx_target_machine);
// Module pass manager populated from a builder with OptLevel 0.
990 llvm::PassManagerBuilder pass_manager_builder = llvm::PassManagerBuilder();
992 pass_manager_builder.OptLevel = 0;
993 llvm::legacy::PassManager module_pass_manager;
994 pass_manager_builder.populateModulePassManager(module_pass_manager);
// Several later steps are conditional on whether the module needs libdevice.
996 bool requires_libdevice = check_module_requires_libdevice(llvm_module);
998 if (requires_libdevice) {
// Optimize the module, then legalize the entry function for NVVM.
1003 optimize_ir(func, llvm_module, module_pass_manager, live_funcs, is_gpu_smem_used, co);
1004 legalize_nvvm_ir(func);
// The module is printed as text into `ss` through `os` further below.
1006 std::stringstream ss;
1007 llvm::raw_os_ostream os(ss);
1009 llvm::LLVMContext& ctx = llvm_module->getContext();
// Add the tuple {wrapper_func, kernel, i32 1} to the nvvm.annotations named
// metadata of the module.
1011 llvm::NamedMDNode* md = llvm_module->getOrInsertNamedMetadata(
"nvvm.annotations");
1013 llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
1014 llvm::MDString::get(ctx,
"kernel"),
1015 llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
1016 llvm::Type::getInt32Ty(ctx), 1))};
1019 md->addOperand(llvm::MDNode::get(ctx, md_vals));
// `roots` starts with the wrapper and the entry function; row_func_ and, when
// present, filter_func_ are added if the row function was not inlined.
1021 std::unordered_set<llvm::Function*> roots{wrapper_func, func};
1022 if (gpu_target.row_func_not_inlined) {
1024 roots.insert(gpu_target.cgen_state->row_func_);
1025 if (gpu_target.cgen_state->filter_func_) {
1026 roots.insert(gpu_target.cgen_state->filter_func_);
// NOTE(review): the body of this loop over helper_functions_ is not visible;
// presumably each helper is added to `roots` - confirm.
1031 for (
auto f : gpu_target.cgen_state->helper_functions_) {
// When libdevice is required, defined functions whose name starts with
// __internal are legalized as well.
1035 if (requires_libdevice) {
1036 for (llvm::Function& F : *llvm_module) {
1044 if (F.hasName() && F.getName().startswith(
"__internal") && !F.isDeclaration()) {
1047 legalize_nvvm_ir(&F);
// Names of UDF functions that are declarations only; passed to
// extension_function_decls when the final IR text is assembled.
1052 std::unordered_set<std::string> udf_declarations;
// For each function of the UDF module, the same-named function in llvm_module
// is looked up, legalized and added to `roots`.
// NOTE(review): any null check on udf_function is not visible - confirm.
1054 if (executor->has_udf_module(
true)) {
1055 for (
auto&
f : executor->get_udf_module(
true)->getFunctionList()) {
1056 llvm::Function* udf_function = llvm_module->getFunction(
f.getName());
1059 legalize_nvvm_ir(udf_function);
1060 roots.insert(udf_function);
1064 if (
f.isDeclaration()) {
1065 udf_declarations.insert(
f.getName().str());
// Same treatment for the runtime UDF module.
1071 if (executor->has_rt_udf_module(
true)) {
1072 for (
auto&
f : executor->get_rt_udf_module(
true)->getFunctionList()) {
1073 llvm::Function* udf_function = llvm_module->getFunction(
f.getName());
1075 legalize_nvvm_ir(udf_function);
1076 roots.insert(udf_function);
1080 if (
f.isDeclaration()) {
1081 udf_declarations.insert(
f.getName().str());
// NOTE(review): the body of the `if` below is not visible; presumably functions
// in `roots` are skipped and all other functions are collected in rt_funcs -
// confirm.
1087 std::vector<llvm::Function*> rt_funcs;
1088 for (
auto& Fn : *llvm_module) {
1089 if (roots.count(&Fn)) {
1092 rt_funcs.push_back(&Fn);
// The collected functions are detached from the module before it is printed
// and pushed back onto its function list afterwards.
1094 for (
auto& pFn : rt_funcs) {
1095 pFn->removeFromParent();
1098 if (requires_libdevice) {
1099 add_intrinsics_to_module(llvm_module);
// Add a Debug Info Version module flag (Error behaviour) if none exists yet.
1102 if (!llvm_module->getModuleFlag(
"Debug Info Version")) {
1104 llvm_module->addModuleFlag(
1105 llvm::Module::Error,
"Debug Info Version", llvm::DEBUG_METADATA_VERSION);
// Print the module as text into `os`.
1108 llvm_module->print(os,
nullptr);
// Re-attach the functions that were detached before printing.
1111 for (
auto& pFn : rt_funcs) {
1112 llvm_module->getFunctionList().push_back(pFn);
// The nvvm.annotations node added above is removed again after printing.
1114 llvm_module->eraseNamedMetadata(md);
// Final IR text: printed module + cuda_rt_decls + declarations generated for
// the declaration-only UDF names.
1116 auto cuda_llir = ss.str() + cuda_rt_decls + extension_function_decls(udf_declarations);
// NOTE(review): the line below is the argument list of a call whose start is
// not visible; presumably the PTX generation call producing `ptx` - confirm.
1120 cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state->context_);
// A failure is logged as a warning that mentions switching to CPU execution.
// NOTE(review): the enclosing try/catch and what happens after the log are not
// visible.
1122 LOG(
WARNING) <<
"Failed to generate PTX: " << e.what()
1123 <<
". Switching to CPU execution target.";
1126 LOG(
PTX) <<
"PTX for the GPU:\n" << ptx <<
"\nEnd of PTX";
// Convert the PTX into a cubin and unpack the fields of the result.
1128 auto cubin_result =
ptx_to_cubin(ptx, gpu_target.cuda_mgr);
1129 auto& option_keys = cubin_result.option_keys;
1130 auto& option_values = cubin_result.option_values;
1131 auto cubin = cubin_result.cubin;
1132 auto link_state = cubin_result.link_state;
1133 const auto num_options = option_keys.size();
1135 auto func_name = wrapper_func->getName().str();
// One GpuDeviceCompilationContext is added per device id below the device
// count reported by the CUDA manager.
1136 auto gpu_compilation_context = std::make_shared<GpuCompilationContext>();
1137 for (
int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
1139 gpu_compilation_context->addDeviceCode(
1140 std::make_unique<GpuDeviceCompilationContext>(cubin,
1141 cubin_result.cubin_size,
1144 gpu_target.cuda_mgr,
1147 &option_values[0]));
1151 return gpu_compilation_context;
// Visible parameters: the query function, the multi-fragment query function,
// the mutable set of live functions, and the no_inline / is_gpu_smem_used flags.
// NOTE(review): further parameters and the return type are not visible here.
1158 llvm::Function* query_func,
1159 llvm::Function* multifrag_query_func,
1160 std::unordered_set<llvm::Function*>& live_funcs,
1161 const bool no_inline,
1163 const bool is_gpu_smem_used,
// NOTE(review): the bodies of the filter_func_ check and of the helper loop are
// not visible.
1171 if (cgen_state_->filter_func_) {
1174 for (
const auto helper : cgen_state_->helper_functions_) {
// row_func_not_inlined becomes true when row_func_ contains a call whose name
// is array_size or linear_probabilistic_count.
// NOTE(review): the first part of that condition is not visible.
1182 bool row_func_not_inlined =
false;
1184 for (
auto it = llvm::inst_begin(cgen_state_->row_func_),
1185 e = llvm::inst_end(cgen_state_->row_func_);
1188 if (llvm::isa<llvm::CallInst>(*it)) {
1189 auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
1192 (*func_name ==
"array_size" || *func_name ==
"linear_probabilistic_count")) {
1194 row_func_not_inlined =
true;
// The NVPTX backend is initialized before the target description is assembled.
1201 initializeNVPTXBackend();
// Tail of a brace initializer: target machine, CUDA manager, codegen state and
// the flag computed above.
// NOTE(review): presumably this initializes a GPUTarget - confirm.
1203 nvptx_target_machine_.get(), cuda_mgr, cgen_state_.get(), row_func_not_inlined};
1204 std::shared_ptr<GpuCompilationContext> compilation_context;
1209 multifrag_query_func,
// A CUDA error with status CUDA_ERROR_OUT_OF_MEMORY leads to eviction of code
// cache entries; multifrag_query_func then appears in another argument list.
// NOTE(review): presumably the same code generation call is retried after the
// eviction - confirm.
1214 }
catch (CudaMgr_Namespace::CudaErrorException& cuda_error) {
1215 if (cuda_error.getStatus() == CUDA_ERROR_OUT_OF_MEMORY) {
1219 auto const num_entries_to_evict =
1221 code_cache_accessor->evictEntries(num_entries_to_evict);
1224 multifrag_query_func,
// Parses NVVM IR text (cuda_llir) into a module in `context` and emits assembly
// for it with the given NVPTX target machine. Returns the emitted text as a
// std::string.
1241 llvm::TargetMachine* nvptx_target_machine,
1242 llvm::LLVMContext& context) {
// Wrap the IR text in a memory buffer with an empty buffer name; the last
// argument is the RequiresNullTerminator flag of getMemBuffer.
1244 auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir,
"",
false);
1246 llvm::SMDiagnostic parse_error;
1248 auto llvm_module = llvm::parseIR(mem_buff->getMemBufferRef(), parse_error, context);
// NOTE(review): handling of a failed parse is not visible between the parse and
// this log statement. The NNVM spelling in the message looks like a typo for
// NVVM.
1250 LOG(
IR) <<
"CodeGenerator::generatePTX:NVVM IR:\n" << cuda_llir <<
"\nEnd of NNVM IR";
// The assembly is written into code_str through formatted_os.
1254 llvm::SmallString<256> code_str;
1255 llvm::raw_svector_ostream formatted_os(code_str);
1256 CHECK(nvptx_target_machine);
// The module takes the data layout of the target machine before emission.
1258 llvm::legacy::PassManager ptxgen_pm;
1259 llvm_module->setDataLayout(nvptx_target_machine->createDataLayout());
// The assembly file-type enumerator is spelled differently depending on the
// LLVM version.
1261 #if LLVM_VERSION_MAJOR >= 10
1262 nvptx_target_machine->addPassesToEmitFile(
1263 ptxgen_pm, formatted_os,
nullptr, llvm::CGFT_AssemblyFile);
1265 nvptx_target_machine->addPassesToEmitFile(
1266 ptxgen_pm, formatted_os,
nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
// Running the pass manager performs the emission into code_str.
1268 ptxgen_pm.run(*llvm_module);
// LLVM >= 11: explicit conversion to std::string; the other return uses str().
1271 #if LLVM_VERSION_MAJOR >= 11
1272 return std::string(code_str);
1274 return code_str.str();
// Registers all LLVM targets, target MC layers and asm printers, looks up the
// nvptx64 target, and returns a unique_ptr to a TargetMachine created for the
// nvptx64-nvidia-cuda triple with default TargetOptions and static relocation.
// NOTE(review): the CPU and feature arguments of createTargetMachine and the
// handling of a failed lookup (`err`) are not visible here.
1286 llvm::InitializeAllTargets();
1287 llvm::InitializeAllTargetMCs();
1288 llvm::InitializeAllAsmPrinters();
1290 auto target = llvm::TargetRegistry::lookupTarget(
"nvptx64", err);
1294 return std::unique_ptr<llvm::TargetMachine>(
1295 target->createTargetMachine(
"nvptx64-nvidia-cuda",
1298 llvm::TargetOptions(),
1299 llvm::Reloc::Static));
1304 cuda_llir, nvptx_target_machine_.get(), cgen_state_->context_);
1308 if (nvptx_target_machine_) {
1311 const auto arch = cudaMgr()->getDeviceArch();
// A brace-enclosed list of runtime function names, followed by an equality
// comparison of the name of `func` against a candidate name in a lambda.
// NOTE(review): presumably the lambda is the predicate of a search over this
// list and the function reports whether the name of `func` is in it; the
// enclosing declaration and call are not visible - confirm.
1318 {
"query_stub_hoisted_literals",
1319 "multifrag_query_hoisted_literals",
1322 "fixed_width_int_decode",
1323 "fixed_width_unsigned_decode",
1324 "diff_fixed_width_int_decode",
1325 "fixed_width_double_decode",
1326 "fixed_width_float_decode",
1327 "fixed_width_small_date_decode",
1328 "record_error_code",
1332 "group_buff_idx_impl",
1334 "init_shared_mem_nop",
// The name is copied into a std::string once and captured by value.
1337 auto const candidate_func_name = func->getName().str();
1340 [candidate_func_name](std::string_view func_name) {
1341 return candidate_func_name == func_name;
// Reads the bitcode file `bc_filename` and parses it into a module that lives
// in `context`. CHECKs guard three failure points: the file could not be read,
// parsing reported an error, or the parsed module pointer is null.
1346 const std::string& bc_filename,
1347 llvm::LLVMContext& context) {
// NOTE(review): `err` is not used by any statement visible here.
1348 llvm::SMDiagnostic err;
1350 auto buffer_or_error = llvm::MemoryBuffer::getFile(bc_filename);
1351 CHECK(!buffer_or_error.getError()) <<
"bc_filename=" << bc_filename;
// Non-owning pointer; buffer_or_error keeps ownership of the buffer.
1352 llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1354 auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1355 CHECK(!owner.takeError());
1356 CHECK(owner->get());
// owner.get() yields a reference to the held unique_ptr, so it is moved out.
1357 return std::move(owner.get());
// Parses the LLVM IR file `udf_ir_filename` into a module in `ctx`.
// `is_gpu` defaults to false.
// NOTE(review): presumably the target triple check below only applies when
// is_gpu is true, and parse failures are handled on lines not visible here -
// confirm.
1361 const std::string& udf_ir_filename,
1362 llvm::LLVMContext& ctx,
1363 bool is_gpu =
false) {
1364 llvm::SMDiagnostic parse_error;
1366 llvm::StringRef file_name_arg(udf_ir_filename);
1368 auto owner = llvm::parseIRFile(file_name_arg, parse_error, ctx);
// A module whose triple is not NVPTX is reported in a log message and an empty
// unique_ptr is returned instead of the module.
1374 llvm::Triple gpu_triple(owner->getTargetTriple());
1375 if (!gpu_triple.isNVPTX()) {
1377 <<
"Expected triple nvptx64-nvidia-cuda for NVVM IR of loadtime UDFs but got "
1378 << gpu_triple.str() <<
". Disabling the NVVM IR module.";
1379 return std::unique_ptr<llvm::Module>();
// Parses LLVM IR held in the string `udf_ir_string` into a module in `ctx`.
// `is_gpu` defaults to false.
// NOTE(review): presumably the triple check below only applies when is_gpu is
// true, and parse failures are handled on lines not visible here - confirm.
1386 const std::string& udf_ir_string,
1387 llvm::LLVMContext& ctx,
1388 bool is_gpu =
false) {
1389 llvm::SMDiagnostic parse_error;
// The buffer reference points at udf_ir_string itself (no copy is made by
// MemoryBufferRef); the second argument is the buffer identifier.
1391 auto buf = std::make_unique<llvm::MemoryBufferRef>(udf_ir_string,
1392 "Runtime UDF/UDTF LLVM/NVVM IR");
1394 auto owner = llvm::parseIR(*buf, parse_error, ctx);
// The IR text is written to the IR log channel.
1396 LOG(
IR) <<
"read_llvm_module_from_ir_string:\n"
1397 << udf_ir_string <<
"\nEnd of LLVM/NVVM IR";
// A non-NVPTX triple: the IR is logged again, a warning is emitted and an empty
// unique_ptr is returned.
// NOTE(review): the NNVM spelling in the message below looks like a typo for
// NVVM.
1402 llvm::Triple gpu_triple(owner->getTargetTriple());
1403 if (!gpu_triple.isNVPTX()) {
1404 LOG(
IR) <<
"read_llvm_module_from_ir_string:\n"
1405 << udf_ir_string <<
"\nEnd of NNVM IR";
1406 LOG(
WARNING) <<
"Expected triple nvptx64-nvidia-cuda for NVVM IR but got "
1408 <<
". Executing runtime UDF/UDTFs on GPU will be disabled.";
1409 return std::unique_ptr<llvm::Module>();
// Scans the call instructions of query_func and replaces calls whose name
// equals pos_fn_name with a call to the module function named
// pos_fn_name + _impl.
// NOTE(review): pos_fn_name is used below but declared outside the visible
// lines; presumably the first parameter - confirm.
1419 const bool use_resume_param,
1420 llvm::Function* query_func,
1421 llvm::Module* llvm_module) {
1422 for (
auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
// Only call instructions are considered.
1424 if (!llvm::isa<llvm::CallInst>(*it)) {
1427 auto& pos_call = llvm::cast<llvm::CallInst>(*it);
1429 if (func_name && *func_name == pos_fn_name) {
// With use_resume_param the query_func argument named row_index_resume is
// fetched first.
// NOTE(review): presumably it is passed to the _impl call; the remaining
// arguments of that call are not visible - confirm.
1430 if (use_resume_param) {
1431 auto*
const row_index_resume =
get_arg_by_name(query_func,
"row_index_resume");
1432 llvm::ReplaceInstWithInst(
1434 llvm::CallInst::Create(llvm_module->getFunction(pos_fn_name +
"_impl"),
// Otherwise the _impl function is called without arguments.
1437 llvm::ReplaceInstWithInst(
1439 llvm::CallInst::Create(llvm_module->getFunction(pos_fn_name +
"_impl")));
// Gives names to the arguments of row_func, walking them with arg_it.
// NOTE(review): row_func is declared outside the visible lines, and the
// iterator increments between the setName calls are not visible; presumably
// arg_it advances after each name - confirm.
1447 const size_t in_col_count,
1448 const size_t agg_col_count,
1449 const bool hoist_literals) {
1450 auto arg_it = row_func->arg_begin();
// With a non-zero agg_col_count there is a loop of agg_col_count iterations
// naming arguments out.
1452 if (agg_col_count) {
1453 for (
size_t i = 0; i < agg_col_count; ++i) {
1454 arg_it->setName(
"out");
// NOTE(review): presumably the next six names belong to the alternative branch
// taken when agg_col_count is zero; the branch keyword is not visible - confirm.
1458 arg_it->setName(
"group_by_buff");
1460 arg_it->setName(
"varlen_output_buff");
1462 arg_it->setName(
"crt_matched");
1464 arg_it->setName(
"total_matched");
1466 arg_it->setName(
"old_total_matched");
1468 arg_it->setName(
"max_matched");
1472 arg_it->setName(
"agg_init_val");
1475 arg_it->setName(
"pos");
1478 arg_it->setName(
"frag_row_off");
1481 arg_it->setName(
"num_rows_per_scan");
// The literals argument is only named when literals are hoisted.
1484 if (hoist_literals) {
1485 arg_it->setName(
"literals");
// One iteration per input column.
// NOTE(review): the name given to each of these arguments is not visible.
1489 for (
size_t i = 0; i < in_col_count; ++i) {
1494 arg_it->setName(
"join_hash_tables");
1496 arg_it->setName(
"row_func_mgr");
// Builds the argument type list of the row function, creates a function type
// returning a 32-bit integer, and creates an externally linked function named
// row_func in llvm_module.
// NOTE(review): in_col_count is used below but declared outside the visible
// lines; presumably the first parameter. The return statement is not visible.
1500 const size_t agg_col_count,
1501 const bool hoist_literals,
1502 llvm::Module* llvm_module,
1503 llvm::LLVMContext& context) {
1504 std::vector<llvm::Type*> row_process_arg_types;
// With a non-zero agg_col_count: one i64 pointer per aggregate column.
1506 if (agg_col_count) {
1508 for (
size_t i = 0; i < agg_col_count; ++i) {
1509 row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
// NOTE(review): presumably the next six entries (two i64 pointers, four i32
// pointers) belong to the alternative branch for agg_col_count == 0 - confirm.
1513 row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1515 row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1517 row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1519 row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1521 row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1523 row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
// Four more entries: i64 pointer, i64 value, i64 pointer, i64 pointer.
1527 row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1530 row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));
1533 row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1536 row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
// An extra i8 pointer is added only when literals are hoisted.
1539 if (hoist_literals) {
1540 row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
// One i8 pointer per input column.
1544 for (
size_t i = 0; i < in_col_count; ++i) {
1545 row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
// Two trailing entries: an i64 pointer and an i8 pointer.
1549 row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1552 row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
// Non-variadic function type returning a 32-bit integer.
1556 llvm::FunctionType::get(
get_int_type(32, context), row_process_arg_types,
false);
1558 auto row_func = llvm::Function::Create(
1559 ft, llvm::Function::ExternalLinkage,
"row_func", llvm_module);
// Finds the calls inside multifrag_query_func whose name equals query_fname
// and replaces each of them with a call to query_func that takes the same
// arguments.
// NOTE(review): query_func is used below but declared outside the visible
// lines; presumably the first parameter - confirm.
1569 const std::string& query_fname,
1570 llvm::Function* multifrag_query_func,
1571 llvm::Module* llvm_module) {
// Phase 1: collect the matching call instructions. They are replaced in a
// separate loop, not while the instructions are being iterated.
1572 std::vector<llvm::CallInst*> query_stubs;
1573 for (
auto it = llvm::inst_begin(multifrag_query_func),
1574 e = llvm::inst_end(multifrag_query_func);
1577 if (!llvm::isa<llvm::CallInst>(*it)) {
1580 auto& query_call = llvm::cast<llvm::CallInst>(*it);
1582 if (call_func_name && *call_func_name == query_fname) {
1583 query_stubs.push_back(&query_call);
// Phase 2: rebuild each collected call against query_func.
1586 for (
auto& S : query_stubs) {
// All operands except the last one are copied as arguments.
// NOTE(review): presumably the excluded last operand is the callee - confirm.
1588 for (
size_t i = 0; i < S->getNumOperands() - 1; ++i) {
1589 args.push_back(S->getArgOperand(i));
1591 llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args,
""));
// Builds the list of aggregate runtime function names for target_exprs.
// Each target contributes one or more names to `result`, chosen from the
// aggregate kind and the type of the target or of the aggregate argument.
// Throws std::runtime_error for the unsupported combinations reported below.
// NOTE(review): the declaration of agg_expr, the switch over the aggregate kind
// and the return statement are not visible here.
1595 std::vector<std::string>
get_agg_fnames(
const std::vector<Analyzer::Expr*>& target_exprs,
1596 const bool is_group_by) {
1597 std::vector<std::string>
result;
// target_idx and agg_col_idx advance together in the loop header.
1598 for (
size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
1599 ++target_idx, ++agg_col_idx) {
1600 const auto target_expr = target_exprs[target_idx];
1602 const auto target_type_info = target_expr->get_type_info();
// NOTE(review): the middle of the is_varlen condition is not visible; it
// involves is_string() and is_array() of the target type.
1604 const bool is_varlen =
1605 (target_type_info.is_string() &&
1607 target_type_info.is_array();
// Targets that are not aggregates, and SAMPLE aggregates, use the agg_id family:
// agg_id_double for floating point targets, agg_id otherwise.
1608 if (!agg_expr || agg_expr->get_aggtype() ==
kSAMPLE) {
1609 result.emplace_back(target_type_info.is_fp() ?
"agg_id_double" :
"agg_id");
// NOTE(review): presumably this extra agg_id is added for variable-length
// targets; the guarding condition is not visible - confirm.
1611 result.emplace_back(
"agg_id");
// Geometry targets add one agg_id plus one more for every i in
// [2, 2 * get_physical_coord_cols()).
1613 if (target_type_info.is_geometry()) {
1614 result.emplace_back(
"agg_id");
1615 for (
auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
1616 result.emplace_back(
"agg_id");
// agg_type_info is taken either from the target itself or from the argument of
// the aggregate.
// NOTE(review): the condition selecting between the two is not visible.
1626 agg_type_info = target_type_info;
1629 agg_type_info = agg_expr->get_arg()->get_type_info();
// AVG: rejected for types that are not accepted; contributes a sum function and
// a count function (double variants shown below).
1635 !agg_type_info.
is_fp()) {
1636 throw std::runtime_error(
"AVG is only valid on integer and floating point");
1640 :
"agg_sum_double");
1643 :
"agg_count_double");
// MIN
1649 throw std::runtime_error(
1650 "MIN on strings, arrays or geospatial types not supported yet");
1654 :
"agg_min_double");
// MAX
1660 throw std::runtime_error(
1661 "MAX on strings, arrays or geospatial types not supported yet");
1665 :
"agg_max_double");
// SUM / SUM_IF
1671 !agg_type_info.
is_fp()) {
1672 throw std::runtime_error(
1673 "SUM and SUM_IF is only valid on integer and floating point");
1681 result.emplace_back(func_name);
// COUNT: the distinct variant is chosen when the aggregate is distinct.
1685 result.emplace_back(agg_expr->get_is_distinct() ?
"agg_count_distinct"
1689 result.emplace_back(
"agg_count_if");
// Two aggregate kinds map to the agg_id family based on is_fp().
// NOTE(review): which kinds these are is not visible here.
1692 result.emplace_back(agg_type_info.
is_fp() ?
"agg_id_double" :
"agg_id");
1697 result.emplace_back(agg_type_info.
is_fp() ?
"agg_id_double" :
"agg_id");
1701 result.emplace_back(
"agg_approximate_count_distinct");
1704 result.emplace_back(
"agg_approx_quantile");
1707 result.emplace_back(
"agg_mode_func");
// NOTE(review): Usupported in the message below looks like a typo for
// Unsupported.
1710 UNREACHABLE() <<
"Usupported agg_type: " << agg_type;
1719 const bool is_cuda_ir) {
// Computes a set of live functions and gives internal linkage to every defined
// function of llvm_module that is not in that set.
// NOTE(review): the return statement is not visible; presumably live_funcs is
// returned - confirm.
1727 llvm::Module& llvm_module,
1728 const std::vector<llvm::Function*>& roots,
1729 const std::vector<llvm::Function*>& leaves) {
// Live set starts with all roots and all leaves.
1731 std::unordered_set<llvm::Function*> live_funcs;
1732 live_funcs.insert(roots.begin(), roots.end());
1733 live_funcs.insert(leaves.begin(), leaves.end());
// Two functions are always kept when they exist in the module.
1735 if (
auto F = llvm_module.getFunction(
"init_shared_mem_nop")) {
1736 live_funcs.insert(F);
1738 if (
auto F = llvm_module.getFunction(
"write_back_nop")) {
1739 live_funcs.insert(F);
// Direct callees of the roots are added. Only the bodies of the roots are
// scanned here; callees of callees are not followed by this loop.
// NOTE(review): getCalledFunction() can be null for indirect calls, in which
// case a null pointer is inserted; the lookup below only queries real
// functions, so this looks harmless - confirm.
1742 for (
const llvm::Function* F : roots) {
1743 for (
const llvm::BasicBlock& BB : *F) {
1744 for (
const llvm::Instruction& I : BB) {
1745 if (
const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
1746 live_funcs.insert(CI->getCalledFunction());
// Defined functions outside the live set get internal linkage.
1752 for (llvm::Function& F : llvm_module) {
1753 if (!live_funcs.count(&F) && !F.isDeclaration()) {
1754 F.setLinkage(llvm::GlobalValue::InternalLinkage);
// Searches `func` for an instruction of kind InstType whose name equals
// variable_name. When bb_name is non-empty, only basic blocks with that name
// are considered.
// NOTE(review): the function name, the `func` parameter and the return
// statement are not visible; presumably `result` is returned and stays null
// when nothing matches - confirm.
1764 template <
typename InstType>
1766 std::string bb_name,
1767 std::string variable_name) {
1768 llvm::Value* result =
nullptr;
// A null function or an empty variable name is handled up front.
1769 if (func ==
nullptr || variable_name.empty()) {
// The outer loop stops as soon as is_found becomes true.
1772 bool is_found =
false;
1773 for (
auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
// Filter on the block name only when one was requested.
1774 if (!bb_name.empty() && bb_it->getName() != bb_name) {
1777 for (
auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
1778 if (llvm::isa<InstType>(*inst_it)) {
1779 if (inst_it->getName() == variable_name) {
// Rewrites query_func so that the value produced by the call named row_process
// is checked right after the call: the basic block is split after the call, an
// .error_exit block that calls record_error_code and returns is created, and a
// conditional branch selects between .error_exit and the rest of the block.
// Optionally a dynamic watchdog check or a runtime interrupt check is woven in
// between. A CHECK at the end requires that the split happened.
// NOTE(review): some parameters used below (for example the one that
// distinguishes the two ways of computing the watchdog / interrupt predicate)
// are declared outside the visible lines.
1792 llvm::Function* query_func,
1793 bool run_with_dynamic_watchdog,
1794 bool run_with_allowing_runtime_interrupt,
1795 const std::vector<JoinLoop>& join_loops,
1797 const std::vector<InputTableInfo>& input_table_infos) {
// When both checks are requested, the runtime interrupt check is switched off.
1803 if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
1806 run_with_allowing_runtime_interrupt =
false;
// NOTE(review): the line below is the tail of a statement that references
// executor_session_mutex_; presumably a lock taken before reading
// current_query_session_ - confirm.
1812 executor_session_mutex_);
// Without a current query session the interrupt check is disabled too.
1813 if (current_query_session_.empty()) {
1814 run_with_allowing_runtime_interrupt =
false;
// row_count is looked up as a load instruction named row_count in the block
// named .entry, when one of the checks is enabled.
// NOTE(review): the rest of the guarding condition is not visible.
1818 llvm::Value* row_count =
nullptr;
1819 if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1822 find_variable_in_basic_block<llvm::LoadInst>(query_func,
".entry",
"row_count");
// Walk the basic blocks until the split has been performed once.
1825 bool done_splitting =
false;
1826 for (
auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1828 llvm::Value* pos =
nullptr;
1829 for (
auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
// With a check enabled, a PHI node named pos is looked for.
// NOTE(review): presumably it is stored into `pos`; the assignment is not
// visible - confirm.
1830 if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1831 llvm::isa<llvm::PHINode>(*inst_it)) {
1832 if (inst_it->getName() ==
"pos") {
// Everything that is not a call instruction is of no further interest.
1837 if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1840 auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
1842 if (row_func_name && *row_func_name ==
"row_process") {
// Split the block at next_inst_it.
// NOTE(review): presumably next_inst_it is advanced past the call before the
// split; the increment is not visible - confirm.
1843 auto next_inst_it = inst_it;
1845 auto new_bb = bb_it->splitBasicBlock(next_inst_it);
// br_instr is the last instruction of the original block after the split; new
// instructions are inserted in front of it.
1846 auto& br_instr = bb_it->back();
1847 llvm::IRBuilder<> ir_builder(&br_instr);
// err_lv starts out as the row_process call instruction itself.
1848 llvm::Value* err_lv = &*inst_it;
1849 llvm::Value* err_lv_returned_from_row_func =
nullptr;
1850 if (run_with_dynamic_watchdog) {
1852 llvm::Value* call_watchdog_lv =
nullptr;
// crit_edge_rem: when blockSize() is not a power of two (b & (b - 1) is
// non-zero) a signed remainder by blockSize() is used, otherwise an AND with
// blockSize() - 1.
// NOTE(review): the first operand of both operations is not visible;
// presumably row_count - confirm.
1858 auto crit_edge_rem =
1859 (blockSize() & (blockSize() - 1))
1860 ? ir_builder.CreateSRem(
1862 cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1863 : ir_builder.CreateAnd(
1865 cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1866 auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1867 crit_edge_threshold->setName(
"crit_edge_threshold");
// Signed comparison pos < crit_edge_threshold.
// NOTE(review): presumably assigned to call_watchdog_lv in one branch, with the
// every-64th-position predicate below used in the other branch - confirm.
1872 ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
// True when the low six bits of pos are all zero.
1875 auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1876 call_watchdog_lv = ir_builder.CreateICmp(
1877 llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1879 CHECK(call_watchdog_lv);
// Split again at br_instr to obtain .error_check, then create .watchdog_check
// and place it in front of .error_check.
1880 auto error_check_bb = bb_it->splitBasicBlock(
1881 llvm::BasicBlock::iterator(br_instr),
".error_check");
1882 auto& watchdog_br_instr = bb_it->back();
1884 auto watchdog_check_bb = llvm::BasicBlock::Create(
1885 cgen_state_->context_,
".watchdog_check", query_func, error_check_bb);
// The watchdog block calls dynamic_watchdog, selects an error value from the
// result and branches to .error_check.
1886 llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1887 auto detected_timeout = watchdog_ir_builder.CreateCall(
1888 cgen_state_->module_->getFunction(
"dynamic_watchdog"), {});
1889 auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1891 watchdog_ir_builder.CreateBr(error_check_bb);
// A conditional branch takes the watchdog block when call_watchdog_lv is true
// and goes straight to .error_check otherwise.
1893 llvm::ReplaceInstWithInst(
1895 llvm::BranchInst::Create(
1896 watchdog_check_bb, error_check_bb, call_watchdog_lv));
// A PHI merges the error value arriving from the watchdog block with the one
// arriving from the original block.
1897 ir_builder.SetInsertPoint(&br_instr);
1898 auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1900 unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1901 unified_err_lv->addIncoming(err_lv, &*bb_it);
1902 err_lv = unified_err_lv;
1903 }
else if (run_with_allowing_runtime_interrupt) {
// State filled in by codegen_interrupt_checker below.
1905 llvm::Value* call_check_interrupt_lv{
nullptr};
1906 llvm::Value* interrupt_err_lv{
nullptr};
1907 llvm::BasicBlock* error_check_bb{
nullptr};
1908 llvm::BasicBlock* interrupt_check_bb{
nullptr};
1909 llvm::Instruction* check_interrupt_br_instr{
nullptr};
// NOTE(review): presumably this predicate feeds a search over join_loops that
// sets has_loop_join; the enclosing call is not visible - confirm.
1912 join_loops.begin(), join_loops.end(), [](
const JoinLoop& join_loop) {
1913 return join_loop.isNestedLoopJoin();
// Helper lambda: splits off the error check block, creates .interrupt_check in
// front of it, calls check_interrupt there, selects an error value and branches
// to the error check block.
1915 auto codegen_interrupt_checker = [&]() {
1916 error_check_bb = bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr),
1918 check_interrupt_br_instr = &bb_it->back();
1920 interrupt_check_bb = llvm::BasicBlock::Create(
1921 cgen_state_->context_,
".interrupt_check", query_func, error_check_bb);
1922 llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
1923 auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
1924 cgen_state_->module_->getFunction(
"check_interrupt"), {});
1925 interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
1929 interrupt_checker_ir_builder.CreateBr(error_check_bb);
// With a nested loop join the interrupt check block is entered unconditionally
// and its error value is used directly.
1931 if (has_loop_join) {
1932 codegen_interrupt_checker();
1933 CHECK(interrupt_check_bb);
1934 CHECK(check_interrupt_br_instr);
1935 llvm::ReplaceInstWithInst(check_interrupt_br_instr,
1936 llvm::BranchInst::Create(interrupt_check_bb));
1937 ir_builder.SetInsertPoint(&br_instr);
1938 err_lv = interrupt_err_lv;
// Otherwise the check runs only for some positions. The shift amount is the sum
// of two values computed on lines not visible here; the frequency starts at 32.
1950 int64_t total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
1951 uint64_t interrupt_checking_freq = 32;
// The frequency is recalibrated from the tuple upper bound of the first input
// table when that bound is positive.
1955 if (!input_table_infos.empty()) {
1956 const auto& outer_table_info = *input_table_infos.begin();
1957 auto num_outer_table_tuples =
1958 outer_table_info.info.getFragmentNumTuplesUpperBound();
1959 if (num_outer_table_tuples > 0) {
// NOTE(review): if all operands of this division are integers it already
// truncates and floor() has no effect - confirm the operand types.
1967 auto max_inc = uint64_t(
1968 floor(num_outer_table_tuples / (gridSize() * blockSize() * 2)));
1974 auto calibrated_inc =
1975 uint64_t(floor(max_inc * (1 - freq_control_knob)));
1976 interrupt_checking_freq =
// Clamp: values above max_inc become max_inc / 2, and values below 8 become 8.
1981 if (interrupt_checking_freq > max_inc) {
1982 interrupt_checking_freq = max_inc / 2;
1984 if (interrupt_checking_freq < 8) {
1987 interrupt_checking_freq = 8;
1991 VLOG(1) << 
"Set the running query interrupt checking frequency: "
1992 << interrupt_checking_freq;
// pos is shifted right by total_num_shift and AND-ed with the frequency; the
// check runs when the result is zero.
// NOTE(review): the frequency is used as a bit mask here, so the check fires
// only when none of its set bits are set in the shifted position; for values
// that are not of the form 2^k - 1 this is not a plain every-Nth test - confirm
// this is intended.
1994 llvm::Value* pos_shifted_per_iteration =
1995 ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
1996 auto interrupt_predicate = ir_builder.CreateAnd(pos_shifted_per_iteration,
1997 interrupt_checking_freq);
1998 call_check_interrupt_lv =
1999 ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
2000 interrupt_predicate,
2001 cgen_state_->llInt(int64_t(0LL)));
// Alternative predicate: true when the low six bits of pos are all zero.
// NOTE(review): the condition choosing between the two predicates is not
// visible.
2004 auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
2005 call_check_interrupt_lv =
2006 ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
2007 interrupt_predicate,
2008 cgen_state_->llInt(int64_t(0LL)));
// Build the interrupt check blocks and verify that the lambda filled in all of
// its outputs.
2010 codegen_interrupt_checker();
2011 CHECK(call_check_interrupt_lv);
2012 CHECK(interrupt_err_lv);
2013 CHECK(interrupt_check_bb);
2014 CHECK(error_check_bb);
2015 CHECK(check_interrupt_br_instr);
// Conditional branch into the interrupt check block, then a PHI that merges the
// two incoming error values.
2016 llvm::ReplaceInstWithInst(
2017 check_interrupt_br_instr,
2018 llvm::BranchInst::Create(
2019 interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
2020 ir_builder.SetInsertPoint(&br_instr);
2021 auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
2023 unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
2024 unified_err_lv->addIncoming(err_lv, &*bb_it);
2025 err_lv = unified_err_lv;
// The value handed to record_error_code defaults to the current err_lv.
2028 if (!err_lv_returned_from_row_func) {
2029 err_lv_returned_from_row_func = err_lv;
// err_lv is turned into a boolean by a comparison; one visible variant tests
// for equality, the other for inequality with the 32-bit constant 0.
// NOTE(review): the condition choosing the variant and the operands of the
// first comparison are not visible.
2035 err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
2039 err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
2041 cgen_state_->llInt(static_cast<int32_t>(0)));
// .error_exit is placed before new_bb; it calls record_error_code with the
// error value and the error_code argument of query_func, then returns.
2043 auto error_bb = llvm::BasicBlock::Create(
2044 cgen_state_->context_,
".error_exit", query_func, new_bb);
2045 const auto error_code_arg =
get_arg_by_name(query_func,
"error_code");
2046 llvm::CallInst::Create(
2047 cgen_state_->module_->getFunction(
"record_error_code"),
2048 std::vector<llvm::Value*>{err_lv_returned_from_row_func, error_code_arg},
2051 llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
// The original branch becomes conditional: .error_exit when err_lv is true,
// otherwise the remainder of the split block.
2052 llvm::ReplaceInstWithInst(&br_instr,
2053 llvm::BranchInst::Create(error_bb, new_bb, err_lv));
2054 done_splitting =
true;
// The row_process call must have been found and the block split.
2059 CHECK(done_splitting);
// Inserts a call to register_buffer_with_executor_rsm after calls to
// allocate_varlen_buffer in the module held by the codegen state, passing the
// address of this object as a 64-bit integer constant together with the
// result of the allocation call.
// NOTE(review): the enclosing function header is not visible here.
2063 llvm::Module* M = cgen_state_->module_;
// Modules that have no allocate_varlen_buffer function are handled up front.
2064 if (M->getFunction(
"allocate_varlen_buffer") ==
nullptr) {
// Tracking is requested by the module flag manage_memory_buffer holding a
// constant integer equal to 1.
2069 bool should_track =
false;
2070 auto* flag = M->getModuleFlag(
"manage_memory_buffer");
2071 if (
auto* cnt = llvm::mdconst::extract_or_null<llvm::ConstantInt>(flag)) {
2072 if (cnt->getZExtValue() == 1) {
2073 should_track =
true;
2077 if (!should_track) {
2082 LOG(
INFO) <<
"Found 'manage_memory_buffer' metadata.";
// Collect every call in the module whose name is allocate_varlen_buffer.
2083 llvm::SmallVector<llvm::CallInst*, 4> calls_to_analyze;
2085 for (llvm::Function& F : *M) {
2086 for (llvm::BasicBlock& BB : F) {
2087 for (llvm::Instruction& I : BB) {
2088 if (llvm::CallInst* CI = llvm::dyn_cast<llvm::CallInst>(&I)) {
2091 if (called_func_name && *called_func_name ==
"allocate_varlen_buffer") {
2092 calls_to_analyze.push_back(CI);
// Declare (or reuse) register_buffer_with_executor_rsm as a non-variadic
// function returning void with parameter types {i64, i8p}.
// NOTE(review): the definitions of i64 and i8p are not visible here.
2101 llvm::IRBuilder<> Builder(cgen_state_->context_);
2104 auto void_ = llvm::Type::getVoidTy(cgen_state_->context_);
2105 llvm::FunctionType* fnty = llvm::FunctionType::get(void_, {i64, i8p},
false);
2106 llvm::FunctionCallee register_buffer_fn =
2107 M->getOrInsertFunction(
"register_buffer_with_executor_rsm", fnty, {});
// The address of this object is embedded in the generated call as an integer.
2109 int64_t executor_addr =
reinterpret_cast<int64_t
>(
this);
2110 for (llvm::CallInst* CI : calls_to_analyze) {
// Users of the allocation call are inspected for an existing call to
// register_buffer_with_executor_rsm.
// NOTE(review): presumably an allocation that already has such a user is not
// registered a second time; the skipping logic is not visible - confirm.
2115 for (llvm::User* U : CI->users()) {
2116 if (llvm::CallInst* call = llvm::dyn_cast<llvm::CallInst>(U)) {
2118 if (func_name && *func_name ==
"register_buffer_with_executor_rsm") {
// The registration call is inserted right after the allocation call.
2125 Builder.SetInsertPoint(CI->getNextNode());
2126 Builder.CreateCall(register_buffer_fn,
2127 {
ll_int(executor_addr, cgen_state_->context_), CI});
2135 std::vector<llvm::Value*> hoisted_literals;
2139 std::vector<llvm::Type*> row_process_arg_types;
2141 for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
2142 E = cgen_state_->row_func_->arg_end();
2145 row_process_arg_types.push_back(I->getType());
2148 for (
auto& element : cgen_state_->query_func_literal_loads_) {
2149 for (
auto value : element.second) {
2150 row_process_arg_types.push_back(value->getType());
2154 auto ft = llvm::FunctionType::get(
2155 get_int_type(32, cgen_state_->context_), row_process_arg_types,
false);
2156 auto row_func_with_hoisted_literals =
2157 llvm::Function::Create(ft,
2158 llvm::Function::ExternalLinkage,
2159 "row_func_hoisted_literals",
2160 cgen_state_->row_func_->getParent());
2162 auto row_func_arg_it = row_func_with_hoisted_literals->arg_begin();
2163 for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
2164 E = cgen_state_->row_func_->arg_end();
2168 row_func_arg_it->setName(I->getName());
2173 decltype(row_func_with_hoisted_literals) filter_func_with_hoisted_literals{
nullptr};
2174 decltype(row_func_arg_it) filter_func_arg_it{
nullptr};
2175 if (cgen_state_->filter_func_) {
2178 std::vector<llvm::Type*> filter_func_arg_types;
2180 for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2181 E = cgen_state_->filter_func_->arg_end();
2184 filter_func_arg_types.push_back(I->getType());
2187 for (
auto& element : cgen_state_->query_func_literal_loads_) {
2188 for (
auto value : element.second) {
2189 filter_func_arg_types.push_back(value->getType());
2193 auto ft2 = llvm::FunctionType::get(
2194 get_int_type(32, cgen_state_->context_), filter_func_arg_types,
false);
2195 filter_func_with_hoisted_literals =
2196 llvm::Function::Create(ft2,
2197 llvm::Function::ExternalLinkage,
2198 "filter_func_hoisted_literals",
2199 cgen_state_->filter_func_->getParent());
2201 filter_func_arg_it = filter_func_with_hoisted_literals->arg_begin();
2202 for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2203 E = cgen_state_->filter_func_->arg_end();
2207 filter_func_arg_it->setName(I->getName());
2209 ++filter_func_arg_it;
2213 std::unordered_map<int, std::vector<llvm::Value*>>
2214 query_func_literal_loads_function_arguments,
2215 query_func_literal_loads_function_arguments2;
2217 for (
auto& element : cgen_state_->query_func_literal_loads_) {
2218 std::vector<llvm::Value*> argument_values, argument_values2;
2220 for (
auto value : element.second) {
2221 hoisted_literals.push_back(value);
2222 argument_values.push_back(&*row_func_arg_it);
2223 if (cgen_state_->filter_func_) {
2224 argument_values2.push_back(&*filter_func_arg_it);
2225 cgen_state_->filter_func_args_[&*row_func_arg_it] = &*filter_func_arg_it;
2227 if (value->hasName()) {
2228 row_func_arg_it->setName(
"arg_" + value->getName());
2229 if (cgen_state_->filter_func_) {
2230 filter_func_arg_it->getContext();
2231 filter_func_arg_it->setName(
"arg_" + value->getName());
2235 ++filter_func_arg_it;
2238 query_func_literal_loads_function_arguments[element.first] = argument_values;
2239 query_func_literal_loads_function_arguments2[element.first] = argument_values2;
2245 row_func_with_hoisted_literals->getBasicBlockList().splice(
2246 row_func_with_hoisted_literals->begin(),
2247 cgen_state_->row_func_->getBasicBlockList());
2250 for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
2251 E = cgen_state_->row_func_->arg_end(),
2252 I2 = row_func_with_hoisted_literals->arg_begin();
2255 I->replaceAllUsesWith(&*I2);
2257 cgen_state_->filter_func_args_.replace(&*I, &*I2);
2261 cgen_state_->row_func_ = row_func_with_hoisted_literals;
2264 std::vector<llvm::Instruction*> placeholders;
2265 std::string prefix(
"__placeholder__literal_");
2266 for (
auto it = llvm::inst_begin(row_func_with_hoisted_literals),
2267 e = llvm::inst_end(row_func_with_hoisted_literals);
2270 if (it->hasName() && it->getName().startswith(prefix)) {
2271 auto offset_and_index_entry =
2272 cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
2273 CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2275 int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2276 int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2278 it->replaceAllUsesWith(
2279 query_func_literal_loads_function_arguments[lit_off][lit_idx]);
2280 placeholders.push_back(&*it);
2283 for (
auto placeholder : placeholders) {
2284 placeholder->removeFromParent();
2287 if (cgen_state_->filter_func_) {
2291 filter_func_with_hoisted_literals->getBasicBlockList().splice(
2292 filter_func_with_hoisted_literals->begin(),
2293 cgen_state_->filter_func_->getBasicBlockList());
2297 for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2298 E = cgen_state_->filter_func_->arg_end(),
2299 I2 = filter_func_with_hoisted_literals->arg_begin();
2302 I->replaceAllUsesWith(&*I2);
2307 cgen_state_->filter_func_ = filter_func_with_hoisted_literals;
2310 std::vector<llvm::Instruction*> placeholders;
2311 std::string prefix(
"__placeholder__literal_");
2312 for (
auto it = llvm::inst_begin(filter_func_with_hoisted_literals),
2313 e = llvm::inst_end(filter_func_with_hoisted_literals);
2316 if (it->hasName() && it->getName().startswith(prefix)) {
2317 auto offset_and_index_entry = cgen_state_->row_func_hoisted_literals_.find(
2318 llvm::dyn_cast<llvm::Value>(&*it));
2319 CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2321 int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2322 int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2324 it->replaceAllUsesWith(
2325 query_func_literal_loads_function_arguments2[lit_off][lit_idx]);
2326 placeholders.push_back(&*it);
2329 for (
auto placeholder : placeholders) {
2330 placeholder->removeFromParent();
2334 return hoisted_literals;
2341 return shared_mem_used
2348 if (
auto const agg_expr = dynamic_cast<Analyzer::AggExpr*>(expr)) {
2349 if (shared::is_any<SQLAgg::kCOUNT, SQLAgg::kCOUNT_IF>(agg_expr->get_aggtype())) {
2359 CaseExprDetector() : detect_case_expr_(
false) {}
2363 return detect_case_expr_;
2368 detect_case_expr_ =
true;
2373 mutable bool detect_case_expr_;
2380 CaseExprDetector detector;
2382 if (detector.detectCaseExpr(expr.get())) {
2393 const unsigned cuda_blocksize,
2394 const unsigned num_blocks_per_mp) {
2401 CHECK(query_mem_desc_ptr);
2425 if (cuda_blocksize < query_mem_desc_ptr->getEntryCount()) {
2430 const auto target_infos =
2433 if (std::find_if(target_infos.begin(),
2436 if (ti.sql_type.is_varlen() ||
2437 !supported_aggs.count(ti.agg_kind)) {
2442 }) == target_infos.end()) {
2457 if (cuda_blocksize < query_mem_desc_ptr->getEntryCount()) {
2469 const size_t shared_memory_threshold_bytes = std::min(
2472 const auto output_buffer_size =
2474 if (output_buffer_size > shared_memory_threshold_bytes) {
2481 const auto target_infos =
2487 if (std::find_if(target_infos.begin(),
2490 if (ti.sql_type.is_varlen() ||
2491 !supported_aggs.count(ti.agg_kind)) {
2496 }) == target_infos.end()) {
2507 std::string llvm_ir;
2508 std::unordered_set<llvm::MDNode*> md;
2511 for (
auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
2512 for (
auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2513 llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2514 instr_it->getAllMetadata(imd);
2515 for (
auto [kind, node] : imd) {
2522 for (
auto bb_it = cgen_state->
row_func_->begin(); bb_it != cgen_state->
row_func_->end();
2524 for (
auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2525 llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2526 instr_it->getAllMetadata(imd);
2527 for (
auto [kind, node] : imd) {
2538 for (
auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2539 llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2540 instr_it->getAllMetadata(imd);
2541 for (
auto [kind, node] : imd) {
2550 std::map<size_t, std::string> sorted_strings;
2553 llvm::raw_string_ostream os(str);
2554 p->print(os, cgen_state->
module_,
true);
2556 auto fields =
split(str, {}, 1);
2557 if (fields.empty() || fields[0].empty()) {
2560 sorted_strings.emplace(std::stoul(fields[0].substr(1)), str);
2563 for (
auto [
id, text] : sorted_strings) {
2574 std::tuple<CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
2581 const bool allow_lazy_fetch,
2582 std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
2583 const size_t max_groups_buffer_entry_guess,
2584 const int8_t crt_min_byte_width,
2585 const bool has_cardinality_estimation,
2597 static std::uint64_t counter = 0;
2599 VLOG(1) <<
"CODEGEN #" << counter <<
":";
2600 LOG(
IR) <<
"CODEGEN #" << counter <<
":";
2602 LOG(
ASM) <<
"CODEGEN #" << counter <<
":";
2612 addTransientStringLiterals(ra_exe_unit, row_set_mem_owner);
2620 has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
2624 max_groups_buffer_entry_guess,
2631 !has_cardinality_estimation && (!render_info || !render_info->
isInSitu()) &&
2633 const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
2637 const bool output_columnar =
query_mem_desc->didOutputColumnar();
2638 const bool gpu_shared_mem_optimization =
2643 cuda_mgr ? this->blockSize() : 1,
2644 cuda_mgr ?
this->numBlocksPerMP() : 1);
2645 if (gpu_shared_mem_optimization) {
2648 LOG(
DEBUG1) <<
"GPU shared memory is used for the " +
2659 const size_t num_count_distinct_descs =
2661 for (
size_t i = 0; i < num_count_distinct_descs; i++) {
2662 const auto& count_distinct_descriptor =
2675 if (
auto gby_expr = dynamic_cast<Analyzer::AggExpr*>(expr)) {
2676 bool has_multiple_gpus = cuda_mgr ? cuda_mgr->getDeviceCount() > 1 :
false;
2677 if (gby_expr->get_aggtype() ==
SQLAgg::kSAMPLE && has_multiple_gpus &&
2680 bool (*)(
const Analyzer::ColumnVar*,
const Analyzer::ColumnVar*)>
2683 for (
const auto cv : colvar_set) {
2684 if (cv->get_type_info().is_varlen()) {
2685 const auto tbl_key = cv->getTableKey();
2686 std::for_each(query_infos.begin(),
2689 if (input_table_info.table_key == tbl_key &&
2690 input_table_info.info.fragments.size() > 1) {
2704 CHECK(cgen_state_->module_ ==
nullptr);
2705 cgen_state_->set_module_shallow_copy(get_rt_module(),
true);
2712 if (has_udf_module(is_gpu)) {
2714 get_udf_module(is_gpu), *cgen_state_->module_, cgen_state_.get());
2716 if (has_rt_udf_module(is_gpu)) {
2718 get_rt_udf_module(is_gpu), *cgen_state_->module_, cgen_state_.get());
2726 const auto agg_slot_count = ra_exe_unit.
estimator ? size_t(1) : agg_fnames.size();
2729 auto [query_func, row_func_call] = is_group_by
2739 !!ra_exe_unit.estimator,
2745 cgen_state_->query_func_ = query_func;
2746 cgen_state_->row_func_call_ = row_func_call;
2747 cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
2748 &query_func->getEntryBlock().front());
2752 auto& fetch_bb = query_func->front();
2753 llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
2754 fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
2758 cgen_state_->context_);
2762 is_group_by ? 0 : agg_slot_count,
2764 cgen_state_->module_,
2765 cgen_state_->context_);
2766 CHECK(cgen_state_->row_func_);
2767 cgen_state_->row_func_bb_ =
2768 llvm::BasicBlock::Create(cgen_state_->context_,
"entry", cgen_state_->row_func_);
2771 auto filter_func_ft =
2772 llvm::FunctionType::get(
get_int_type(32, cgen_state_->context_), {},
false);
2773 cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
2774 llvm::Function::ExternalLinkage,
2776 cgen_state_->module_);
2777 CHECK(cgen_state_->filter_func_);
2778 cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
2779 cgen_state_->context_,
"entry", cgen_state_->filter_func_);
2782 cgen_state_->current_func_ = cgen_state_->row_func_;
2783 cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2785 preloadFragOffsets(ra_exe_unit.
input_descs, query_infos);
2787 const auto join_loops =
2788 buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
2792 plan_state_->addSimpleQual(simple_qual);
2794 const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
2795 if (is_not_deleted_bb) {
2796 cgen_state_->row_func_bb_ = is_not_deleted_bb;
2798 if (!join_loops.empty()) {
2799 codegenJoinLoops(join_loops,
2800 body_execution_unit,
2801 group_by_and_aggregate,
2803 cgen_state_->row_func_bb_,
2808 const bool can_return_error = compileBody(
2809 ra_exe_unit, group_by_and_aggregate, *
query_mem_desc, co, gpu_smem_context);
2812 createErrorCheckControlFlow(query_func,
2817 group_by_and_aggregate.query_infos_);
2820 std::vector<llvm::Value*> hoisted_literals;
2823 VLOG(1) <<
"number of hoisted literals: "
2824 << cgen_state_->query_func_literal_loads_.size()
2825 <<
" / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
2829 if (co.
hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
2831 hoisted_literals = inlineHoistedLiterals();
2835 std::vector<llvm::Value*> row_func_args;
2836 for (
size_t i = 0; i < cgen_state_->row_func_call_->getNumOperands() - 1; ++i) {
2837 row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
2839 row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
2840 row_func_args.push_back(
get_arg_by_name(query_func,
"join_hash_tables"));
2843 row_func_args.insert(
2844 row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
2845 llvm::ReplaceInstWithInst(
2846 cgen_state_->row_func_call_,
2847 llvm::CallInst::Create(cgen_state_->row_func_, row_func_args,
""));
2850 if (cgen_state_->filter_func_) {
2851 std::vector<llvm::Value*> filter_func_args;
2852 for (
auto arg_it = cgen_state_->filter_func_args_.begin();
2853 arg_it != cgen_state_->filter_func_args_.end();
2855 filter_func_args.push_back(arg_it->first);
2857 llvm::ReplaceInstWithInst(
2858 cgen_state_->filter_func_call_,
2859 llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args,
""));
2863 plan_state_->init_agg_vals_ =
2873 if (gpu_smem_context.isSharedMemoryUsed()) {
2877 cgen_state_->module_,
2878 cgen_state_->context_,
2881 plan_state_->init_agg_vals_,
2883 gpu_smem_code.codegen();
2884 gpu_smem_code.injectFunctionsInto(query_func);
2887 cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
2888 cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
2889 LOG(
IR) << gpu_smem_code.toString();
2893 auto multifrag_query_func = cgen_state_->module_->getFunction(
2894 "multifrag_query" + std::string(co.
hoist_literals ?
"_hoisted_literals" :
""));
2895 CHECK(multifrag_query_func);
2898 insertErrorCodeChecker(multifrag_query_func,
2905 "query_stub" + std::string(co.
hoist_literals ?
"_hoisted_literals" :
""),
2906 multifrag_query_func,
2907 cgen_state_->module_);
2909 std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
2910 if (cgen_state_->filter_func_) {
2911 root_funcs.push_back(cgen_state_->filter_func_);
2914 *cgen_state_->module_, root_funcs, {multifrag_query_func});
2921 if (cgen_state_->filter_func_) {
2932 std::string llvm_ir =
2936 VLOG(3) <<
"Unoptimized IR for the " << device_str <<
"\n" << llvm_ir <<
"\nEnd of IR";
2938 #ifdef WITH_JIT_DEBUG
2939 throw std::runtime_error(
2940 "Explain optimized not available when JIT runtime debug symbols are enabled");
2944 llvm::legacy::PassManager pass_manager;
2946 cgen_state_->module_,
2949 gpu_smem_context.isSharedMemoryUsed(),
2951 #endif // WITH_JIT_DEBUG
2962 LOG(
IR) <<
"IR for the " << device_str;
2974 AutoTrackBuffersInRuntimeIR();
2978 if (cgen_state_->filter_func_) {
2983 return std::make_tuple(
2986 ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
2987 : optimizeAndCodegenGPU(query_func,
2988 multifrag_query_func,
2990 is_group_by || ra_exe_unit.estimator,
2992 gpu_smem_context.isSharedMemoryUsed(),
2994 cgen_state_->getLiterals(),
2997 std::move(gpu_smem_context)},
3002 unsigned const error_code_idx,
3003 bool hoist_literals,
3004 bool allow_runtime_query_interrupt) {
3005 auto query_stub_func_name =
3006 "query_stub" + std::string(hoist_literals ?
"_hoisted_literals" :
"");
3007 for (
auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
3008 for (
auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
3009 if (!llvm::isa<llvm::CallInst>(*inst_it)) {
3012 auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
3014 if (row_func_name && *row_func_name == query_stub_func_name) {
3015 auto next_inst_it = inst_it;
3017 auto new_bb = bb_it->splitBasicBlock(next_inst_it);
3018 auto& br_instr = bb_it->back();
3019 llvm::IRBuilder<> ir_builder(&br_instr);
3020 llvm::Value* err_lv = &*inst_it;
3021 auto error_check_bb =
3022 bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr),
".error_check");
3024 llvm::Value*
const error_code_arg =
get_arg_by_index(query_func, error_code_idx);
3025 CHECK(error_code_arg) << error_code_idx <<
'/' << query_func->arg_size();
3026 llvm::Value* err_code =
nullptr;
3027 if (allow_runtime_query_interrupt) {
3029 auto& check_interrupt_br_instr = bb_it->back();
3030 auto interrupt_check_bb = llvm::BasicBlock::Create(
3031 cgen_state_->context_,
".interrupt_check", query_func, error_check_bb);
3032 llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
3033 auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
3034 cgen_state_->module_->getFunction(
"check_interrupt"), {});
3035 auto detected_error = interrupt_checker_ir_builder.CreateCall(
3036 cgen_state_->module_->getFunction(
"get_error_code"),
3037 std::vector<llvm::Value*>{error_code_arg});
3038 err_code = interrupt_checker_ir_builder.CreateSelect(
3042 interrupt_checker_ir_builder.CreateBr(error_check_bb);
3043 llvm::ReplaceInstWithInst(&check_interrupt_br_instr,
3044 llvm::BranchInst::Create(interrupt_check_bb));
3045 ir_builder.SetInsertPoint(&br_instr);
3048 ir_builder.SetInsertPoint(&br_instr);
3050 ir_builder.CreateCall(cgen_state_->module_->getFunction(
"get_error_code"),
3051 std::vector<llvm::Value*>{error_code_arg});
3053 err_lv = ir_builder.CreateICmp(
3054 llvm::ICmpInst::ICMP_NE, err_code, cgen_state_->llInt(0));
3055 auto error_bb = llvm::BasicBlock::Create(
3056 cgen_state_->context_,
".error_exit", query_func, new_bb);
3057 llvm::CallInst::Create(cgen_state_->module_->getFunction(
"record_error_code"),
3058 std::vector<llvm::Value*>{err_code, error_code_arg},
3061 llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
3062 llvm::ReplaceInstWithInst(&br_instr,
3063 llvm::BranchInst::Create(error_bb, new_bb, err_lv));
3078 const auto& outer_input_desc = ra_exe_unit.
input_descs[0];
3082 const auto& table_key = outer_input_desc.getTableKey();
3083 const auto deleted_cd = plan_state_->getDeletedColForTable(table_key);
3087 CHECK(deleted_cd->columnType.is_boolean());
3088 const auto deleted_expr =
3089 makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
3091 outer_input_desc.getNestLevel());
3093 const auto is_deleted =
3094 code_generator.toBool(code_generator.codegen(deleted_expr.get(),
true, co).front());
3095 const auto is_deleted_bb = llvm::BasicBlock::Create(
3096 cgen_state_->context_,
"is_deleted", cgen_state_->row_func_);
3097 llvm::BasicBlock* bb = llvm::BasicBlock::Create(
3098 cgen_state_->context_,
"is_not_deleted", cgen_state_->row_func_);
3099 cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
3100 cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
3101 cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
3102 cgen_state_->ir_builder_.SetInsertPoint(bb);
3117 cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
3118 llvm::Value* loop_done{
nullptr};
3119 std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
3120 if (cgen_state_->filter_func_) {
3121 if (cgen_state_->row_func_bb_->getName() ==
"loop_body") {
3122 auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
3123 cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
3124 row_func_entry_bb->begin());
3125 loop_done = cgen_state_->ir_builder_.CreateAlloca(
3126 get_int_type(1, cgen_state_->context_),
nullptr,
"loop_done");
3127 cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
3128 cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(
true), loop_done);
3130 cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
3131 cgen_state_->current_func_ = cgen_state_->filter_func_;
3132 fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
3136 std::vector<Analyzer::Expr*> primary_quals;
3137 std::vector<Analyzer::Expr*> deferred_quals;
3139 ra_exe_unit, primary_quals, deferred_quals, plan_state_->hoisted_filters_);
3140 if (short_circuited) {
3142 <<
"short-circuited and deferred " <<
std::to_string(deferred_quals.size())
3145 llvm::Value* filter_lv = cgen_state_->llBool(
true);
3147 for (
auto expr : primary_quals) {
3149 auto cond = code_generator.toBool(code_generator.codegen(expr,
true, co).front());
3150 filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
3152 CHECK(filter_lv->getType()->isIntegerTy(1));
3153 llvm::BasicBlock* sc_false{
nullptr};
3154 if (!deferred_quals.empty()) {
3155 auto sc_true = llvm::BasicBlock::Create(
3156 cgen_state_->context_,
"sc_true", cgen_state_->current_func_);
3157 sc_false = llvm::BasicBlock::Create(
3158 cgen_state_->context_,
"sc_false", cgen_state_->current_func_);
3159 cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
3160 cgen_state_->ir_builder_.SetInsertPoint(sc_false);
3162 cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
3164 cgen_state_->ir_builder_.SetInsertPoint(sc_true);
3165 filter_lv = cgen_state_->llBool(
true);
3167 for (
auto expr : deferred_quals) {
3168 filter_lv = cgen_state_->ir_builder_.CreateAnd(
3169 filter_lv, code_generator.toBool(code_generator.codegen(expr,
true, co).front()));
3172 CHECK(filter_lv->getType()->isIntegerTy(1));
3173 auto ret = group_by_and_aggregate.
codegen(
3174 filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);
3178 if (cgen_state_->filter_func_) {
3179 if (cgen_state_->row_func_bb_->getName() ==
"loop_body") {
3180 cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(
false), loop_done);
3181 cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
3184 cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
3185 cgen_state_->current_func_ = cgen_state_->row_func_;
3186 cgen_state_->filter_func_call_ =
3187 cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});
3191 redeclareFilterFunction();
3193 if (cgen_state_->row_func_bb_->getName() ==
"loop_body") {
3194 auto loop_done_true = llvm::BasicBlock::Create(
3195 cgen_state_->context_,
"loop_done_true", cgen_state_->row_func_);
3196 auto loop_done_false = llvm::BasicBlock::Create(
3197 cgen_state_->context_,
"loop_done_false", cgen_state_->row_func_);
3198 auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(
3199 loop_done->getType()->getPointerElementType(), loop_done);
3200 cgen_state_->ir_builder_.CreateCondBr(
3201 loop_done_flag, loop_done_true, loop_done_false);
3202 cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
3203 cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
3204 cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
3206 cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
3213 llvm::Value* byte_stream_arg,
3214 llvm::IRBuilder<>& ir_builder,
3215 llvm::LLVMContext& ctx) {
3216 CHECK(byte_stream_arg);
3217 const auto max_col_local_id = num_columns - 1;
3219 std::vector<llvm::Value*> col_heads;
3220 for (
int col_id = 0; col_id <= max_col_local_id; ++col_id) {
3221 auto* gep = ir_builder.CreateGEP(
3222 byte_stream_arg->getType()->getScalarType()->getPointerElementType(),
3224 llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), col_id));
3225 auto* load_gep = ir_builder.CreateLoad(gep->getType()->getPointerElementType(), gep);
3226 load_gep->setName(byte_stream_arg->getName() +
"_" +
std::to_string(col_id) +
"_ptr");
3227 col_heads.emplace_back(load_gep);
3231 void createErrorCheckControlFlow(llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, const std::vector< JoinLoop > &join_loops, ExecutorDeviceType device_type, const std::vector< InputTableInfo > &input_table_infos)
std::optional< std::string_view > getCalledFunctionName(llvm::CallInst &call_inst)
std::vector< Analyzer::Expr * > target_exprs
double g_running_query_interrupt_freq
llvm::Value * find_variable_in_basic_block(llvm::Function *func, std::string bb_name, std::string variable_name)
bool g_enable_smem_group_by
std::string get_cuda_libdevice_dir(void)
bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor *query_mem_desc_ptr, const RelAlgExecutionUnit &ra_exe_unit, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const ExecutorDeviceType device_type, const unsigned cuda_blocksize, const unsigned num_blocks_per_mp)
std::string gen_translate_null_key_sigs()
bool countDistinctDescriptorsLogicallyEmpty() const
size_t getEntryCount() const
static const int32_t ERR_INTERRUPTED
std::unordered_map< shared::TableKey, const ColumnDescriptor * > DeletedColumnsMap
static bool colvar_comp(const ColumnVar *l, const ColumnVar *r)
void mark_function_never_inline(llvm::Function *func)
bool codegen(llvm::Value *filter_result, llvm::BasicBlock *sc_false, QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context)
void collect_column_var(std::set< const ColumnVar *, bool(*)(const ColumnVar *, const ColumnVar *)> &colvar_set, bool include_agg) const override
void optimize_ir(llvm::Function *query_func, llvm::Module *llvm_module, llvm::legacy::PassManager &pass_manager, const std::unordered_set< llvm::Function * > &live_funcs, const bool is_gpu_smem_used, const CompilationOptions &co)
bool with_dynamic_watchdog
Streaming Top N algorithm.
void eliminate_dead_self_recursive_funcs(llvm::Module &M, const std::unordered_set< llvm::Function * > &live_funcs)
void AutoTrackBuffersInRuntimeIR()
void checkCudaErrors(CUresult err)
void mark_function_always_inline(llvm::Function *func)
llvm::StringRef get_gpu_data_layout()
llvm::ConstantInt * ll_int(const T v, llvm::LLVMContext &context)
std::string assemblyForCPU(ExecutionEngineWrapper &execution_engine, llvm::Module *llvm_module)
std::vector< InputDescriptor > input_descs
std::string serialize_llvm_metadata_footnotes(llvm::Function *query_func, CgenState *cgen_state)
std::unique_ptr< llvm::Module > read_llvm_module_from_ir_string(const std::string &udf_ir_string, llvm::LLVMContext &ctx, bool is_gpu=false)
std::tuple< llvm::Function *, llvm::CallInst * > query_template(llvm::Module *mod, const size_t aggr_col_count, const bool hoist_literals, const bool is_estimate_query, const GpuSharedMemoryContext &gpu_smem_context)
bool hasKeylessHash() const
void insertErrorCodeChecker(llvm::Function *query_func, unsigned const error_code_idx, bool hoist_literals, bool allow_runtime_query_interrupt)
std::vector< std::string > CodeCacheKey
ExecutorOptLevel opt_level
bool g_enable_dynamic_watchdog
static ExecutionEngineWrapper generateNativeCPUCode(llvm::Function *func, const std::unordered_set< llvm::Function * > &live_funcs, const CompilationOptions &co)
const std::list< std::shared_ptr< Analyzer::Expr > > groupby_exprs
T visit(const Analyzer::Expr *expr) const
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static std::string generatePTX(const std::string &cuda_llir, llvm::TargetMachine *nvptx_target_machine, llvm::LLVMContext &context)
ExecutionEngineWrapper & operator=(const ExecutionEngineWrapper &other)=delete
std::tuple< llvm::Function *, llvm::CallInst * > query_group_by_template(llvm::Module *mod, const bool hoist_literals, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const bool check_scan_limit, const GpuSharedMemoryContext &gpu_smem_context)
std::vector< std::string > get_agg_fnames(const std::vector< Analyzer::Expr * > &target_exprs, const bool is_group_by)
bool filter_on_deleted_column
size_t getRowSize() const
void throw_parseIR_error(const llvm::SMDiagnostic &parse_error, std::string src="", const bool is_gpu=false)
llvm::Function * row_func_
bool g_enable_smem_non_grouped_agg
std::shared_lock< T > shared_lock
unsigned getExpOfTwo(unsigned n)
bool output_columnar_hint
llvm::StringRef get_gpu_target_triple_string()
Supported runtime functions management and retrieval.
std::tuple< CompilationResult, std::unique_ptr< QueryMemoryDescriptor > > compileWorkUnit(const std::vector< InputTableInfo > &query_infos, const PlanState::DeletedColumnsMap &deleted_cols_map, const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const bool allow_lazy_fetch, std::shared_ptr< RowSetMemoryOwner >, const size_t max_groups_buffer_entry_count, const int8_t crt_min_byte_width, const bool has_cardinality_estimation, ColumnCacheMap &column_cache, RenderInfo *render_info=nullptr)
void scan_function_calls(llvm::Function &F, std::unordered_set< std::string > &defined, std::unordered_set< std::string > &undefined, const std::unordered_set< std::string > &ignored)
void verify_function_ir(const llvm::Function *func)
bool compileBody(const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
llvm::Value * get_arg_by_name(llvm::Function *func, const std::string &name)
bool useStreamingTopN() const
static std::unordered_set< llvm::Function * > markDeadRuntimeFuncs(llvm::Module &module, const std::vector< llvm::Function * > &roots, const std::vector< llvm::Function * > &leaves)
std::string generatePTX(const std::string &) const
ExecutionEngineWrapper create_execution_engine(llvm::Module *llvm_module, llvm::EngineBuilder &eb, const CompilationOptions &co)
std::unique_ptr< llvm::JITEventListener > intel_jit_listener_
const JoinQualsPerNestingLevel join_quals
std::unique_ptr< llvm::Module > read_llvm_module_from_ir_file(const std::string &udf_ir_filename, llvm::LLVMContext &ctx, bool is_gpu=false)
ExecutorExplainType explain_type
unsigned get_index_by_name(llvm::Function *func, const std::string &name)
std::shared_ptr< CompilationContext > optimizeAndCodegenCPU(llvm::Function *, llvm::Function *, const std::unordered_set< llvm::Function * > &, const CompilationOptions &)
static const int32_t ERR_OUT_OF_TIME
void initializeNVPTXBackend() const
size_t getMinSharedMemoryPerBlockForAllDevices() const
const std::string cuda_rt_decls
static void link_udf_module(const std::unique_ptr< llvm::Module > &udf_module, llvm::Module &module, CgenState *cgen_state, llvm::Linker::Flags flags=llvm::Linker::Flags::None)
const std::shared_ptr< Analyzer::Estimator > estimator
CubinResult ptx_to_cubin(const std::string &ptx, const CudaMgr_Namespace::CudaMgr *cuda_mgr)
QueryDescriptionType getQueryDescriptionType() const
static std::mutex initialize_cpu_backend_mutex_
std::map< std::string, std::string > get_device_parameters(bool cpu_only)
static std::string deviceArchToSM(const NvidiaDeviceArch arch)
llvm::Function * create_row_function(const size_t in_col_count, const size_t agg_col_count, const bool hoist_literals, llvm::Module *llvm_module, llvm::LLVMContext &context)
ExecutorDeviceType device_type
void bind_pos_placeholders(const std::string &pos_fn_name, const bool use_resume_param, llvm::Function *query_func, llvm::Module *llvm_module)
llvm::Function * filter_func_
std::unique_ptr< llvm::ExecutionEngine > execution_engine_
static void addUdfIrToModule(const std::string &udf_ir_filename, const bool is_cuda_ir)
bool isArchMaxwellOrLaterForAll() const
llvm::BasicBlock * codegenSkipDeletedOuterTableRow(const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
void bind_query(llvm::Function *query_func, const std::string &query_fname, llvm::Function *multifrag_query_func, llvm::Module *llvm_module)
void set_row_func_argnames(llvm::Function *row_func, const size_t in_col_count, const size_t agg_col_count, const bool hoist_literals)
std::string cpp_to_llvm_name(const std::string &s)
std::string serialize_llvm_object(const T *llvm_obj)
void clear_function_attributes(llvm::Function *func)
std::shared_ptr< CompilationContext > optimizeAndCodegenGPU(llvm::Function *, llvm::Function *, std::unordered_set< llvm::Function * > &, const bool no_inline, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const bool is_gpu_smem_used, const CompilationOptions &)
static std::shared_ptr< GpuCompilationContext > generateNativeGPUCode(Executor *executor, llvm::Function *func, llvm::Function *wrapper_func, const std::unordered_set< llvm::Function * > &live_funcs, const bool is_gpu_smem_used, const CompilationOptions &co, const GPUTarget &gpu_target)
bool g_enable_smem_grouped_non_count_agg
static bool alwaysCloneRuntimeFunction(const llvm::Function *func)
std::unordered_map< shared::TableKey, std::unordered_map< int, std::shared_ptr< const ColumnarResults >>> ColumnCacheMap
bool has_count_expr(RelAlgExecutionUnit const &ra_exe_unit)
std::vector< llvm::Value * > generate_column_heads_load(const int num_columns, llvm::Value *byte_stream_arg, llvm::IRBuilder<> &ir_builder, llvm::LLVMContext &ctx)
static std::map< ExtModuleKinds, std::string > extension_module_sources
void show_defined(llvm::Module &llvm_module)
torch::Tensor f(torch::Tensor x, torch::Tensor W_target, torch::Tensor b_target)
bool g_enable_filter_function
static void linkModuleWithLibdevice(Executor *executor, llvm::Module &module, llvm::PassManagerBuilder &pass_manager_builder, const GPUTarget &gpu_target)
virtual T visitCaseExpr(const Analyzer::CaseExpr *case_) const
float g_fraction_code_cache_to_evict
static bool prioritizeQuals(const RelAlgExecutionUnit &ra_exe_unit, std::vector< Analyzer::Expr * > &primary_quals, std::vector< Analyzer::Expr * > &deferred_quals, const PlanState::HoistedFiltersSet &hoisted_quals)
SQLAgg get_aggtype() const
std::string filename(char const *path)
std::list< std::shared_ptr< Analyzer::Expr > > quals
std::string gen_array_any_all_sigs()
bool didOutputColumnar() const
bool g_enable_watchdog false
#define DEBUG_TIMER(name)
llvm::ValueToValueMapTy vmap_
std::vector< llvm::Value * > inlineHoistedLiterals()
static std::shared_ptr< QueryEngine > getInstance()
std::vector< TargetInfo > target_exprs_to_infos(const std::vector< Analyzer::Expr * > &targets, const QueryMemoryDescriptor &query_mem_desc)
bool register_intel_jit_listener
bool isArchPascal() const
bool any_of(std::vector< Analyzer::Expr * > const &target_exprs)
std::list< std::shared_ptr< const InputColDescriptor > > input_col_descs
static std::vector< std::string > getLLVMDeclarations(const std::unordered_set< std::string > &udf_decls, const bool is_gpu=false)
bool allow_runtime_query_interrupt
std::vector< int64_t > init_agg_val_vec(const std::vector< TargetInfo > &targets, const QueryMemoryDescriptor &query_mem_desc)
llvm::Type * get_int_ptr_type(const int width, llvm::LLVMContext &context)
constexpr std::array< std::string_view, 18 > TARGET_RUNTIME_FUNCTIONS_FOR_MODULE_CLONING
size_t get_shared_memory_size(const bool shared_mem_used, const QueryMemoryDescriptor *query_mem_desc_ptr)
std::list< std::shared_ptr< Analyzer::Expr > > simple_quals
llvm::Value * get_arg_by_index(llvm::Function *func, unsigned const index)
std::unique_ptr< llvm::Module > read_llvm_module_from_bc_file(const std::string &udf_ir_filename, llvm::LLVMContext &ctx)
static std::unique_ptr< llvm::TargetMachine > initializeNVPTXBackend(const CudaMgr_Namespace::NvidiaDeviceArch arch)
bool has_case_expr_within_groupby_expr(RelAlgExecutionUnit const &ra_exe_unit)
static std::mutex initialize_nvptx_mutex_
size_t g_gpu_smem_threshold