#if LLVM_VERSION_MAJOR < 9
static_assert(false, "LLVM Version >= 9 is required.");
#endif
#include <llvm/Analysis/ScopedNoAliasAA.h>
#include <llvm/Analysis/TypeBasedAliasAnalysis.h>
#include <llvm/Bitcode/BitcodeReader.h>
#include <llvm/Bitcode/BitcodeWriter.h>
#include <llvm/ExecutionEngine/MCJIT.h>
#include <llvm/IR/Attributes.h>
#include <llvm/IR/GlobalValue.h>
#include <llvm/IR/InstIterator.h>
#include <llvm/IR/IntrinsicInst.h>
#include <llvm/IR/Intrinsics.h>
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/IR/Verifier.h>
#include <llvm/IRReader/IRReader.h>
#if 14 <= LLVM_VERSION_MAJOR
#include <llvm/MC/TargetRegistry.h>
#else
#include <llvm/Support/TargetRegistry.h>
#endif
#include <llvm/Support/Casting.h>
#include <llvm/Support/FileSystem.h>
#include <llvm/Support/FormattedStream.h>
#include <llvm/Support/MemoryBuffer.h>
#include <llvm/Support/SourceMgr.h>
#include <llvm/Support/TargetSelect.h>
#include <llvm/Support/raw_os_ostream.h>
#include <llvm/Support/raw_ostream.h>
#include <llvm/Transforms/IPO.h>
#include <llvm/Transforms/IPO/AlwaysInliner.h>
#include <llvm/Transforms/IPO/InferFunctionAttrs.h>
#include <llvm/Transforms/IPO/PassManagerBuilder.h>
#include <llvm/Transforms/InstCombine/InstCombine.h>
#include <llvm/Transforms/Instrumentation.h>
#include <llvm/Transforms/Scalar.h>
#include <llvm/Transforms/Scalar/GVN.h>
#include <llvm/Transforms/Scalar/InstSimplifyPass.h>
#include <llvm/Transforms/Utils.h>
#include <llvm/Transforms/Utils/BasicBlockUtils.h>
#include <llvm/Transforms/Utils/Cloning.h>
#if LLVM_VERSION_MAJOR >= 11
#include <llvm/Support/Host.h>
#endif

#include <llvm/Support/DynamicLibrary.h>
#ifndef GEOS_LIBRARY_FILENAME
#error Configuration should include GEOS library file name
#endif

std::unique_ptr<std::string> g_libgeos_so_filename(
    new std::string(GEOS_LIBRARY_FILENAME));
static llvm::sys::DynamicLibrary geos_dynamic_library;
static std::mutex geos_init_mutex;
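
// Opens the GEOS shared library at most once, guarded by geos_init_mutex.
// If the configured filename is missing or empty, fall back to "libgeos_c.so".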
void load_geos_dynamic_library() {
  std::lock_guard<std::mutex> guard(geos_init_mutex);

  if (!geos_dynamic_library.isValid()) {
    if (!g_libgeos_so_filename || g_libgeos_so_filename->empty()) {
      LOG(WARNING) << "Misconfigured GEOS library file name, trying 'libgeos_c.so'";
      g_libgeos_so_filename.reset(new std::string("libgeos_c.so"));
    }
    auto filename = *g_libgeos_so_filename;
    std::string error_message;
    geos_dynamic_library =
        llvm::sys::DynamicLibrary::getPermanentLibrary(filename.c_str(), &error_message);
    if (!geos_dynamic_library.isValid()) {
      std::string exception_message = "Failed to load GEOS library: " + error_message;
      throw std::runtime_error(exception_message.c_str());
    }
  }
}
void throw_parseIR_error(const llvm::SMDiagnostic& parse_error,
                         std::string src = "",
                         const bool is_gpu = false) {
  std::string excname = (is_gpu ? "NVVM IR ParseError: " : "LLVM IR ParseError: ");
  llvm::raw_string_ostream ss(excname);
  parse_error.print(src.c_str(), ss, false, false);
  throw ParseIRError(ss.str());
}
#define SHOW_DEFINED(MODULE)                                         \
  {                                                                  \
    std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
    ::show_defined(MODULE);                                          \
  }

#define SHOW_FUNCTIONS(MODULE)                                       \
  {                                                                  \
    std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
    ::show_functions(MODULE);                                        \
  }
template <typename T = void>
void show_defined(llvm::Module& llvm_module) {
  std::cout << "defines: ";
  for (auto& f : llvm_module.getFunctionList()) {
    if (!f.isDeclaration()) {
      std::cout << f.getName().str() << ", ";
    }
  }
  std::cout << std::endl;
}
template <typename T = void>
void show_defined(llvm::Module* llvm_module) {
  if (llvm_module == nullptr) {
    std::cout << "is null" << std::endl;
  } else {
    show_defined(*llvm_module);
  }
}
template <typename T = void>
void show_defined(std::unique_ptr<llvm::Module>& llvm_module) {
  show_defined(llvm_module.get());
}
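
// Walks the call graph rooted at F and partitions the names of called
// functions into those defined in the module and those that remain undefined
// declarations, skipping double-underscore symbols, LLVM intrinsics, and
// anything in `ignored`.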
template <typename T = void>
void scan_function_calls(llvm::Function& F,
                         std::unordered_set<std::string>& defined,
                         std::unordered_set<std::string>& undefined,
                         const std::unordered_set<std::string>& ignored) {
  for (llvm::inst_iterator I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
    if (auto* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
      auto* F2 = CI->getCalledFunction();
      if (F2 != nullptr) {
        auto F2name = F2->getName().str();
        if (F2->isDeclaration()) {
          if (F2name.rfind("__", 0) != 0        // assume internal symbols are defined
              && F2name.rfind("llvm.", 0) != 0  // LLVM intrinsics are defined elsewhere
              && ignored.find(F2name) == ignored.end()) {
            undefined.emplace(F2name);
          }
        } else {
          if (defined.find(F2name) == defined.end()) {
            defined.emplace(F2name);
            scan_function_calls<T>(*F2, defined, undefined, ignored);
          }
        }
      }
    }
  }
}
template <typename T = void>
void scan_function_calls(llvm::Module& llvm_module,
                         std::unordered_set<std::string>& defined,
                         std::unordered_set<std::string>& undefined,
                         const std::unordered_set<std::string>& ignored) {
  for (auto& F : llvm_module) {
    if (!F.isDeclaration()) {
      scan_function_calls(F, defined, undefined, ignored);
    }
  }
}
template <typename T = void>
std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>>
scan_function_calls(llvm::Module& llvm_module,
                    const std::unordered_set<std::string>& ignored = {}) {
  std::unordered_set<std::string> defined, undefined;
  scan_function_calls(llvm_module, defined, undefined, ignored);
  return std::make_tuple(defined, undefined);
}
#if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
void eliminate_dead_self_recursive_funcs(
    llvm::Module& M,
    const std::unordered_set<llvm::Function*>& live_funcs) {
  std::vector<llvm::Function*> dead_funcs;
  for (auto& F : M) {
    bool bAlive = false;
    if (live_funcs.count(&F)) {
      continue;
    }
    for (auto U : F.users()) {
      auto* C = llvm::dyn_cast<const llvm::CallInst>(U);
      if (!C || C->getParent()->getParent() != &F) {
        bAlive = true;
        break;
      }
    }
    if (!bAlive) {
      dead_funcs.push_back(&F);
    }
  }
  for (auto pFn : dead_funcs) {
    pFn->eraseFromParent();
  }
}
#endif
bool check_module_requires_libdevice(llvm::Module* llvm_module) {
  for (llvm::Function& F : *llvm_module) {
    if (F.hasName() && F.getName().startswith("__nv_")) {
      LOG(INFO) << "Module requires linking with libdevice: " << std::string(F.getName());
      return true;
    }
  }
  LOG(DEBUG1) << "module does not require linking against libdevice";
  return false;
}
// Rebind intrinsic call sites to declarations instantiated in this module so
// the verifier is satisfied after libdevice linking.
void add_intrinsics_to_module(llvm::Module* llvm_module) {
  for (llvm::Function& F : *llvm_module) {
    for (llvm::Instruction& I : instructions(F)) {
      if (llvm::IntrinsicInst* ii = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
        if (llvm::Intrinsic::isOverloaded(ii->getIntrinsicID())) {
          llvm::Type* Tys[] = {ii->getFunctionType()->getReturnType()};
          llvm::Function& decl_fn =
              *llvm::Intrinsic::getDeclaration(llvm_module, ii->getIntrinsicID(), Tys);
          ii->setCalledFunction(&decl_fn);
        } else {
          // Non-overloaded intrinsic: just make sure a declaration exists.
          llvm::Intrinsic::getDeclaration(llvm_module, ii->getIntrinsicID());
        }
      }
    }
  }
}
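
// Legacy optimization pipeline run over the whole module. Jump threading is
// skipped for GPU shared-memory queries: reordering jumps around
// __syncthreads()-guarded sections could introduce race conditions.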
void optimize_ir(llvm::Function* query_func,
                 llvm::Module* llvm_module,
                 llvm::legacy::PassManager& pass_manager,
                 const std::unordered_set<llvm::Function*>& live_funcs,
                 const bool is_gpu_smem_used,
                 const CompilationOptions& co) {
  pass_manager.add(llvm::createVerifierPass());
  pass_manager.add(llvm::createAlwaysInlinerLegacyPass());
  pass_manager.add(llvm::createSROAPass());
  pass_manager.add(llvm::createEarlyCSEPass(/*UseMemorySSA=*/true));
  if (!is_gpu_smem_used) {
    pass_manager.add(llvm::createJumpThreadingPass());
  }
  pass_manager.add(llvm::createCFGSimplificationPass());
  pass_manager.add(llvm::createNewGVNPass());
  pass_manager.add(llvm::createDeadStoreEliminationPass());
  pass_manager.add(llvm::createLICMPass());
  pass_manager.add(llvm::createInstructionCombiningPass());
  pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
  pass_manager.add(llvm::createGlobalOptimizerPass());
  pass_manager.add(llvm::createCFGSimplificationPass());  // cleanup after everything

  pass_manager.run(*llvm_module);
}
ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine)
    : execution_engine_(execution_engine) {}

ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine,
                                               const CompilationOptions& co)
    : execution_engine_(execution_engine) {
#ifdef ENABLE_INTEL_JIT_LISTENER
  intel_jit_listener_.reset(llvm::JITEventListener::createIntelJITEventListener());
  CHECK(intel_jit_listener_);
  execution_engine_->RegisterJITEventListener(intel_jit_listener_.get());
  LOG(INFO) << "Registered IntelJITEventListener";
#else
  LOG(WARNING) << "This build is not Intel JIT Listener enabled. Ignoring Intel JIT "
                  "listener configuration parameter.";
#endif  // ENABLE_INTEL_JIT_LISTENER
}
ExecutionEngineWrapper& ExecutionEngineWrapper::operator=(
    llvm::ExecutionEngine* execution_engine) {
  execution_engine_.reset(execution_engine);
  intel_jit_listener_ = nullptr;
  return *this;
}

void verify_function_ir(const llvm::Function* func) {
  std::stringstream err_ss;
  llvm::raw_os_ostream err_os(err_ss);
  err_os << "\n-----\n";
  if (llvm::verifyFunction(*func, &err_os)) {
    err_os << "\n-----\n";
    func->print(err_os, nullptr);
    err_os << "\n-----\n";
    LOG(FATAL) << err_ss.str();
  }
}
std::string assemblyForCPU(ExecutionEngineWrapper& execution_engine,
                           llvm::Module* llvm_module) {
  llvm::legacy::PassManager pass_manager;
  auto cpu_target_machine = execution_engine->getTargetMachine();
  CHECK(cpu_target_machine);
  llvm::SmallString<256> code_str;
  llvm::raw_svector_ostream os(code_str);
#if LLVM_VERSION_MAJOR >= 10
  cpu_target_machine->addPassesToEmitFile(
      pass_manager, os, nullptr, llvm::CGFT_AssemblyFile);
#else
  cpu_target_machine->addPassesToEmitFile(
      pass_manager, os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
#endif
  pass_manager.run(*llvm_module);
  return "Assembly for the CPU:\n" + std::string(code_str.str()) + "\nEnd of assembly";
}
ExecutionEngineWrapper create_execution_engine(llvm::Module* llvm_module,
                                               llvm::EngineBuilder& eb,
                                               const CompilationOptions& co) {
  ExecutionEngineWrapper execution_engine(eb.create(), co);
  CHECK(execution_engine.get());
  llvm_module->setDataLayout(execution_engine->getDataLayout());
  execution_engine->finalizeObject();
  return execution_engine;
}
ExecutionEngineWrapper CodeGenerator::generateNativeCPUCode(
    llvm::Function* func,
    const std::unordered_set<llvm::Function*>& live_funcs,
    const CompilationOptions& co) {
  llvm::Module* llvm_module = func->getParent();
  // run optimizations
#ifndef WITH_JIT_DEBUG
  llvm::legacy::PassManager pass_manager;
  optimize_ir(
      func, llvm_module, pass_manager, live_funcs, /*is_gpu_smem_used=*/false, co);
#endif  // WITH_JIT_DEBUG

  auto init_err = llvm::InitializeNativeTarget();
  CHECK(!init_err);

  llvm::InitializeAllTargetMCs();
  llvm::InitializeNativeTargetAsmPrinter();
  llvm::InitializeNativeTargetAsmParser();

  std::string err_str;
  std::unique_ptr<llvm::Module> owner(llvm_module);
  llvm::EngineBuilder eb(std::move(owner));
  eb.setErrorStr(&err_str);
  eb.setEngineKind(llvm::EngineKind::JIT);
  llvm::TargetOptions to;
  to.EnableFastISel = true;
  eb.setTargetOptions(to);

  return create_execution_engine(llvm_module, eb, co);
}
std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenCPU(
    llvm::Function* query_func,
    llvm::Function* multifrag_query_func,
    const std::unordered_set<llvm::Function*>& live_funcs,
    const CompilationOptions& co) {
  CodeCacheKey key{serialize_llvm_object(query_func),
                   serialize_llvm_object(cgen_state_->row_func_)};

  llvm::Module* M = query_func->getParent();
  auto* flag = llvm::mdconst::extract_or_null<llvm::ConstantInt>(
      M->getModuleFlag("manage_memory_buffer"));
  if (flag and flag->getZExtValue() == 1 and M->getFunction("allocate_varlen_buffer") and
      M->getFunction("register_buffer_with_executor_rsm")) {
    LOG(INFO) << "including executor addr to cache key\n";
    key.push_back(std::to_string(reinterpret_cast<int64_t>(this)));
  }
  if (cgen_state_->filter_func_) {
    key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
  }
  for (const auto helper : cgen_state_->helper_functions_) {
    key.push_back(serialize_llvm_object(helper));
  }

  if (cgen_state_->needs_geos_) {
#ifdef ENABLE_GEOS
    auto llvm_module = multifrag_query_func->getParent();
    load_geos_dynamic_library();

    // Read the GEOS runtime module and bind GEOS API function references
    auto rt_geos_module_copy = llvm::CloneModule(
        *get_geos_module(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
          auto func = llvm::dyn_cast<llvm::Function>(gv);
          if (!func) {
            return true;
          }
          return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
                  func->getLinkage() ==
                      llvm::GlobalValue::LinkageTypes::InternalLinkage ||
                  func->getLinkage() == llvm::GlobalValue::LinkageTypes::ExternalLinkage);
        });
    CodeGenerator::link_udf_module(rt_geos_module_copy,
                                   *llvm_module,
                                   cgen_state_.get(),
                                   llvm::Linker::Flags::LinkOnlyNeeded);
#else
    throw std::runtime_error("GEOS is disabled in this build");
#endif
  }

  auto execution_engine =
      CodeGenerator::generateNativeCPUCode(query_func, live_funcs, co);
  auto cpu_compilation_context =
      std::make_shared<CpuCompilationContext>(std::move(execution_engine));
  cpu_compilation_context->setFunctionPointer(multifrag_query_func);
  return cpu_compilation_context;
}
void CodeGenerator::link_udf_module(const std::unique_ptr<llvm::Module>& udf_module,
                                    llvm::Module& llvm_module,
                                    CgenState* cgen_state,
                                    llvm::Linker::Flags flags) {
  // throw a runtime error if the target module contains functions
  // with the same name as in the module of UDF functions
  for (auto& f : *udf_module) {
    auto func = llvm_module.getFunction(f.getName());
    if (func != nullptr && !f.isDeclaration() && flags == llvm::Linker::Flags::None) {
      LOG(ERROR) << " Attempt to overwrite " << f.getName().str() << " in "
                 << llvm_module.getModuleIdentifier() << " from `"
                 << udf_module->getModuleIdentifier() << "`" << std::endl;
      throw std::runtime_error(
          "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
          "function ***");
    } else {
      VLOG(1) << " Adding " << f.getName().str() << " to "
              << llvm_module.getModuleIdentifier() << " from `"
              << udf_module->getModuleIdentifier() << "`" << std::endl;
    }
  }

  auto udf_module_copy = llvm::CloneModule(*udf_module, cgen_state->vmap_);

  udf_module_copy->setDataLayout(llvm_module.getDataLayout());
  udf_module_copy->setTargetTriple(llvm_module.getTargetTriple());

  // Initialize the linker with the target module
  llvm::Linker ld(llvm_module);
  bool link_error = false;

  link_error = ld.linkInModule(std::move(udf_module_copy), flags);

  if (link_error) {
    throw std::runtime_error("link_udf_module: *** error linking module ***");
  }
}
std::string cpp_to_llvm_name(const std::string& s) {
  if (s == "int8_t") {
    return "i8";
  }
  if (s == "int16_t") {
    return "i16";
  }
  if (s == "int32_t") {
    return "i32";
  }
  if (s == "int64_t") {
    return "i64";
  }
  CHECK(s == "float" || s == "double");
  return s;
}
std::string gen_array_any_all_sigs() {
  std::string result;
  for (const std::string any_or_all : {"any", "all"}) {
    for (const std::string elem_type :
         {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
      for (const std::string needle_type :
           {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
        for (const std::string op_name : {"eq", "ne", "lt", "le", "gt", "ge"}) {
          result += ("declare i1 @array_" + any_or_all + "_" + op_name + "_" + elem_type +
                     "_" + needle_type + "(i8*, i64, " + cpp_to_llvm_name(needle_type) +
                     ", " + cpp_to_llvm_name(needle_type) + ");\n");
        }
      }
    }
  }
  return result;
}

std::string gen_translate_null_key_sigs() {
  std::string result;
  for (const std::string key_type : {"int8_t", "int16_t", "int32_t", "int64_t"}) {
    const auto key_llvm_type = cpp_to_llvm_name(key_type);
    result += "declare i64 @translate_null_key_" + key_type + "(" + key_llvm_type +
              ", " + key_llvm_type + ", i64);\n";
  }
  return result;
}
const std::string cuda_rt_decls = R"(
declare void @llvm.dbg.declare(metadata, metadata, metadata)
declare void @llvm.dbg.value(metadata, metadata, metadata)
declare double @llvm.fmuladd.f64(double, double, double)
declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
declare i64 @get_thread_index();
declare i64 @get_block_index();
declare i32 @pos_start_impl(i32*);
declare i32 @group_buff_idx_impl();
declare i32 @pos_step_impl();
declare i8 @thread_warp_idx(i8);
declare i64* @init_shared_mem(i64*, i32);
declare i64* @init_shared_mem_nop(i64*, i32);
declare i64* @declare_dynamic_shared_memory();
declare void @write_back_nop(i64*, i64*, i32);
declare void @write_back_non_grouped_agg(i64*, i64*, i32);
declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8);
declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32);
declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32);
declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32);
declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32);
declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32);
declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32);
declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64);
declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64);
declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64);
declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64);
declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64);
declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double);
declare i64 @get_bucket_key_for_range_double(i8*, i64, double);
declare i32 @get_num_buckets_for_bounds(i8*, i32, double, double);
declare i64 @get_candidate_rows(i32*, i32, i8*, i32, double, double, i32, i64, i64*, i64, i64, i64);
declare i64 @agg_count_shared(i64*, i64);
declare i64 @agg_count_skip_val_shared(i64*, i64, i64);
declare i32 @agg_count_int32_shared(i32*, i32);
declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32);
declare i64 @agg_count_double_shared(i64*, double);
declare i64 @agg_count_double_skip_val_shared(i64*, double, double);
declare i32 @agg_count_float_shared(i32*, float);
declare i32 @agg_count_float_skip_val_shared(i32*, float, float);
declare i64 @agg_count_if_shared(i64*, i64);
declare i64 @agg_count_if_skip_val_shared(i64*, i64, i64);
declare i32 @agg_count_if_int32_shared(i32*, i32);
declare i32 @agg_count_if_int32_skip_val_shared(i32*, i32, i32);
declare i64 @agg_sum_shared(i64*, i64);
declare i64 @agg_sum_skip_val_shared(i64*, i64, i64);
declare i32 @agg_sum_int32_shared(i32*, i32);
declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_sum_double_shared(i64*, double);
declare void @agg_sum_double_skip_val_shared(i64*, double, double);
declare void @agg_sum_float_shared(i32*, float);
declare void @agg_sum_float_skip_val_shared(i32*, float, float);
declare i64 @agg_sum_if_shared(i64*, i64, i8);
declare i64 @agg_sum_if_skip_val_shared(i64*, i64, i64, i8);
declare i32 @agg_sum_if_int32_shared(i32*, i32, i8);
declare i32 @agg_sum_if_int32_skip_val_shared(i32*, i32, i32, i8);
declare void @agg_sum_if_double_shared(i64*, double, i8);
declare void @agg_sum_if_double_skip_val_shared(i64*, double, double, i8);
declare void @agg_sum_if_float_shared(i32*, float, i8);
declare void @agg_sum_if_float_skip_val_shared(i32*, float, float, i8);
declare void @agg_max_shared(i64*, i64);
declare void @agg_max_skip_val_shared(i64*, i64, i64);
declare void @agg_max_int32_shared(i32*, i32);
declare void @agg_max_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_max_int16_shared(i16*, i16);
declare void @agg_max_int16_skip_val_shared(i16*, i16, i16);
declare void @agg_max_int8_shared(i8*, i8);
declare void @agg_max_int8_skip_val_shared(i8*, i8, i8);
declare void @agg_max_double_shared(i64*, double);
declare void @agg_max_double_skip_val_shared(i64*, double, double);
declare void @agg_max_float_shared(i32*, float);
declare void @agg_max_float_skip_val_shared(i32*, float, float);
declare void @agg_min_shared(i64*, i64);
declare void @agg_min_skip_val_shared(i64*, i64, i64);
declare void @agg_min_int32_shared(i32*, i32);
declare void @agg_min_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_min_int16_shared(i16*, i16);
declare void @agg_min_int16_skip_val_shared(i16*, i16, i16);
declare void @agg_min_int8_shared(i8*, i8);
declare void @agg_min_int8_skip_val_shared(i8*, i8, i8);
declare void @agg_min_double_shared(i64*, double);
declare void @agg_min_double_skip_val_shared(i64*, double, double);
declare void @agg_min_float_shared(i32*, float);
declare void @agg_min_float_skip_val_shared(i32*, float, float);
declare void @agg_id_shared(i64*, i64);
declare i8* @agg_id_varlen_shared(i8*, i64, i8*, i64);
declare void @agg_id_int32_shared(i32*, i32);
declare void @agg_id_int16_shared(i16*, i16);
declare void @agg_id_int8_shared(i8*, i8);
declare void @agg_id_double_shared(i64*, double);
declare void @agg_id_double_shared_slow(i64*, double*);
declare void @agg_id_float_shared(i32*, float);
declare i32 @checked_single_agg_id_shared(i64*, i64, i64);
declare i32 @checked_single_agg_id_double_shared(i64*, double, double);
declare i32 @checked_single_agg_id_double_shared_slow(i64*, double*, double);
declare i32 @checked_single_agg_id_float_shared(i32*, float, float);
declare i1 @slotEmptyKeyCAS(i64*, i64, i64);
declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32);
declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16);
declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8);
declare i64 @datetrunc_century(i64);
declare i64 @datetrunc_day(i64);
declare i64 @datetrunc_decade(i64);
declare i64 @datetrunc_hour(i64);
declare i64 @datetrunc_millennium(i64);
declare i64 @datetrunc_minute(i64);
declare i64 @datetrunc_month(i64);
declare i64 @datetrunc_quarter(i64);
declare i64 @datetrunc_quarterday(i64);
declare i64 @datetrunc_week_monday(i64);
declare i64 @datetrunc_week_sunday(i64);
declare i64 @datetrunc_week_saturday(i64);
declare i64 @datetrunc_year(i64);
declare i64 @extract_epoch(i64);
declare i64 @extract_dateepoch(i64);
declare i64 @extract_quarterday(i64);
declare i64 @extract_hour(i64);
declare i64 @extract_minute(i64);
declare i64 @extract_second(i64);
declare i64 @extract_millisecond(i64);
declare i64 @extract_microsecond(i64);
declare i64 @extract_nanosecond(i64);
declare i64 @extract_dow(i64);
declare i64 @extract_isodow(i64);
declare i64 @extract_day(i64);
declare i64 @extract_week_monday(i64);
declare i64 @extract_week_sunday(i64);
declare i64 @extract_week_saturday(i64);
declare i64 @extract_day_of_year(i64);
declare i64 @extract_month(i64);
declare i64 @extract_quarter(i64);
declare i64 @extract_year(i64);
declare i64 @ExtractTimeFromHPTimestamp(i64,i64);
declare i64 @ExtractTimeFromHPTimestampNullable(i64,i64,i64);
declare i64 @ExtractTimeFromLPTimestamp(i64);
declare i64 @ExtractTimeFromLPTimestampNullable(i64,i64);
declare i64 @DateTruncateHighPrecisionToDate(i64, i64);
declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64);
declare i64 @DateDiff(i32, i64, i64);
declare i64 @DateDiffNullable(i32, i64, i64, i64);
declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i32);
declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i32, i64);
declare i64 @DateAdd(i32, i64, i64);
declare i64 @DateAddNullable(i32, i64, i64, i64);
declare i64 @DateAddHighPrecision(i32, i64, i64, i32);
declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i32, i64);
declare {i8*,i64} @string_decode(i8*, i64);
declare i32 @array_size(i8*, i64, i32);
declare i32 @array_size_nullable(i8*, i64, i32, i32);
declare i32 @array_size_1_nullable(i8*, i64, i32);
declare i32 @fast_fixlen_array_size(i8*, i32);
declare i1 @array_is_null(i8*, i64);
declare i1 @point_coord_array_is_null(i8*, i64);
declare i8* @array_buff(i8*, i64);
declare i8* @fast_fixlen_array_buff(i8*, i64);
declare i8 @array_at_int8_t(i8*, i64, i32);
declare i16 @array_at_int16_t(i8*, i64, i32);
declare i32 @array_at_int32_t(i8*, i64, i32);
declare i64 @array_at_int64_t(i8*, i64, i32);
declare float @array_at_float(i8*, i64, i32);
declare double @array_at_double(i8*, i64, i32);
declare i8 @varlen_array_at_int8_t(i8*, i64, i32);
declare i16 @varlen_array_at_int16_t(i8*, i64, i32);
declare i32 @varlen_array_at_int32_t(i8*, i64, i32);
declare i64 @varlen_array_at_int64_t(i8*, i64, i32);
declare float @varlen_array_at_float(i8*, i64, i32);
declare double @varlen_array_at_double(i8*, i64, i32);
declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32);
declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32);
declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32);
declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32);
declare float @varlen_notnull_array_at_float(i8*, i64, i32);
declare double @varlen_notnull_array_at_double(i8*, i64, i32);
declare i8 @array_at_int8_t_checked(i8*, i64, i64, i8);
declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16);
declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32);
declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64);
declare float @array_at_float_checked(i8*, i64, i64, float);
declare double @array_at_double_checked(i8*, i64, i64, double);
declare i32 @char_length(i8*, i32);
declare i32 @char_length_nullable(i8*, i32, i32);
declare i32 @char_length_encoded(i8*, i32);
declare i32 @char_length_encoded_nullable(i8*, i32, i32);
declare i32 @key_for_string_encoded(i32);
declare i1 @sample_ratio(double, i64);
declare double @width_bucket(double, double, double, double, i32);
declare double @width_bucket_reverse(double, double, double, double, i32);
declare double @width_bucket_nullable(double, double, double, double, i32, double);
declare double @width_bucket_reversed_nullable(double, double, double, double, i32, double);
declare double @width_bucket_no_oob_check(double, double, double);
declare double @width_bucket_reverse_no_oob_check(double, double, double);
declare double @width_bucket_expr(double, i1, double, double, i32);
declare double @width_bucket_expr_nullable(double, i1, double, double, i32, double);
declare double @width_bucket_expr_no_oob_check(double, i1, double, double, i32);
declare i1 @string_like(i8*, i32, i8*, i32, i8);
declare i1 @string_ilike(i8*, i32, i8*, i32, i8);
declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8);
declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8);
declare i1 @string_like_simple(i8*, i32, i8*, i32);
declare i1 @string_ilike_simple(i8*, i32, i8*, i32);
declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8);
declare i1 @string_lt(i8*, i32, i8*, i32);
declare i1 @string_le(i8*, i32, i8*, i32);
declare i1 @string_gt(i8*, i32, i8*, i32);
declare i1 @string_ge(i8*, i32, i8*, i32);
declare i1 @string_eq(i8*, i32, i8*, i32);
declare i1 @string_ne(i8*, i32, i8*, i32);
declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8);
declare i1 @regexp_like(i8*, i32, i8*, i32, i8);
declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8);
declare void @linear_probabilistic_count(i8*, i32, i8*, i32);
declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, i64);
declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64);
declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64);
declare void @record_error_code(i32, i32*);
declare i32 @get_error_code(i32*);
declare i1 @dynamic_watchdog();
declare i1 @check_interrupt();
declare void @force_sync();
declare void @sync_warp();
declare void @sync_warp_protected(i64, i64);
declare void @sync_threadblock();
declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32);
declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64);
declare i64* @get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, float, float);
declare i64* @get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double);
declare double @decompress_x_coord_geoint(i32);
declare double @decompress_y_coord_geoint(i32);
declare i32 @compress_x_coord_geoint(double);
declare i32 @compress_y_coord_geoint(double);
)" + gen_array_any_all_sigs() +
631 std::string extension_function_decls(
const std::unordered_set<std::string>& udf_decls) {
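
// NVVM rejects some constructs that host-side passes may introduce.
// legalize_nvvm_ir collects and erases stacksave/stackrestore and lifetime
// intrinsics; stackrestore consumes the stacksave result, so it goes first.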
void legalize_nvvm_ir(llvm::Function* query_func) {
  std::vector<llvm::Instruction*> stackrestore_intrinsics;
  std::vector<llvm::Instruction*> stacksave_intrinsics;
  std::vector<llvm::Instruction*> lifetime;
  for (auto& BB : *query_func) {
    for (llvm::Instruction& I : BB) {
      if (const llvm::IntrinsicInst* II = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
        if (II->getIntrinsicID() == llvm::Intrinsic::stacksave) {
          stacksave_intrinsics.push_back(&I);
        } else if (II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
          stackrestore_intrinsics.push_back(&I);
        } else if (II->getIntrinsicID() == llvm::Intrinsic::lifetime_start ||
                   II->getIntrinsicID() == llvm::Intrinsic::lifetime_end) {
          lifetime.push_back(&I);
        }
      }
    }
  }

  for (auto& II : stackrestore_intrinsics) {
    II->eraseFromParent();
  }
  for (auto& II : stacksave_intrinsics) {
    II->eraseFromParent();
  }
  for (auto& II : lifetime) {
    II->eraseFromParent();
  }
}
llvm::StringRef get_gpu_target_triple_string() {
  return llvm::StringRef("nvptx64-nvidia-cuda");
}

llvm::StringRef get_gpu_data_layout() {
  return llvm::StringRef(
      "e-p:64:64:64-i1:8:8-i8:8:8-"
      "i16:16:16-i32:32:32-i64:64:64-"
      "f32:32:32-f64:64:64-v16:16:16-"
      "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
}
std::map<std::string, std::string> get_device_parameters(bool cpu_only) {
  std::map<std::string, std::string> result;

  result.insert(std::make_pair("cpu_name", llvm::sys::getHostCPUName()));
  result.insert(std::make_pair("cpu_triple", llvm::sys::getProcessTriple()));
  result.insert(
      std::make_pair("cpu_cores", std::to_string(llvm::sys::getHostNumPhysicalCores())));

  std::string sizeof_types;
  sizeof_types += "ssize_t:" + std::to_string(sizeof(ssize_t)) + ";";
  sizeof_types += "uchar:" + std::to_string(sizeof(unsigned char)) + ";";
  sizeof_types += "ushort:" + std::to_string(sizeof(unsigned short int)) + ";";
  sizeof_types += "uint:" + std::to_string(sizeof(unsigned int)) + ";";
  sizeof_types += "ulong:" + std::to_string(sizeof(unsigned long int)) + ";";
  sizeof_types += "longlong:" + std::to_string(sizeof(long long int)) + ";";
  sizeof_types += "ulonglong:" + std::to_string(sizeof(unsigned long long int)) + ";";
  sizeof_types += "longdouble:" + std::to_string(sizeof(long double)) + ";";
  result.insert(std::make_pair("type_sizeof", sizeof_types));

  std::string null_values;
  null_values += "boolean1:" + std::to_string(serialized_null_value<bool>()) + ";";
  null_values += "boolean8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
  null_values += "int8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
  null_values += "int16:" + std::to_string(serialized_null_value<int16_t>()) + ";";
  null_values += "int32:" + std::to_string(serialized_null_value<int32_t>()) + ";";
  null_values += "int64:" + std::to_string(serialized_null_value<int64_t>()) + ";";
  null_values += "uint8:" + std::to_string(serialized_null_value<uint8_t>()) + ";";
  null_values += "uint16:" + std::to_string(serialized_null_value<uint16_t>()) + ";";
  null_values += "uint32:" + std::to_string(serialized_null_value<uint32_t>()) + ";";
  null_values += "uint64:" + std::to_string(serialized_null_value<uint64_t>()) + ";";
  null_values += "float32:" + std::to_string(serialized_null_value<float>()) + ";";
  null_values += "float64:" + std::to_string(serialized_null_value<double>()) + ";";
  null_values +=
      "Array<boolean8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
  null_values +=
      "Array<int8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
  null_values +=
      "Array<int16>:" + std::to_string(serialized_null_value<int16_t, true>()) + ";";
  null_values +=
      "Array<int32>:" + std::to_string(serialized_null_value<int32_t, true>()) + ";";
  null_values +=
      "Array<int64>:" + std::to_string(serialized_null_value<int64_t, true>()) + ";";
  null_values +=
      "Array<float32>:" + std::to_string(serialized_null_value<float, true>()) + ";";
  null_values +=
      "Array<float64>:" + std::to_string(serialized_null_value<double, true>()) + ";";
  result.insert(std::make_pair("null_values", null_values));

  llvm::StringMap<bool> cpu_features;
  if (llvm::sys::getHostCPUFeatures(cpu_features)) {
    std::string features_str = "";
    for (auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
      features_str += (it->getValue() ? " +" : " -");
      features_str += it->getKey().str();
    }
    result.insert(std::make_pair("cpu_features", features_str));
  }

  result.insert(std::make_pair("llvm_version",
                               std::to_string(LLVM_VERSION_MAJOR) + "." +
                                   std::to_string(LLVM_VERSION_MINOR) + "." +
                                   std::to_string(LLVM_VERSION_PATCH)));
#ifdef HAVE_CUDA
  if (!cpu_only) {
    int device_count = 0;
    checkCudaErrors(cuDeviceGetCount(&device_count));
    if (device_count) {
      CUdevice device{};
      char device_name[256];
      int major = 0, minor = 0;
      int driver_version = 0;
      checkCudaErrors(cuDeviceGet(&device, 0));  // assumes a homogeneous multi-GPU system
      checkCudaErrors(cuDeviceGetName(device_name, 256, device));
      checkCudaErrors(cuDeviceGetAttribute(
          &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
      checkCudaErrors(cuDeviceGetAttribute(
          &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
      checkCudaErrors(cuDriverGetVersion(&driver_version));

      result.insert(std::make_pair("gpu_name", device_name));
      result.insert(std::make_pair("gpu_count", std::to_string(device_count)));
      result.insert(std::make_pair("gpu_compute_capability",
                                   std::to_string(major) + "." + std::to_string(minor)));
      result.insert(std::make_pair("gpu_driver",
                                   "CUDA " + std::to_string(driver_version / 1000) + "." +
                                       std::to_string((driver_version % 1000) / 10)));
      result.insert(
          std::make_pair("gpu_has_libdevice",
                         std::to_string(!get_cuda_libdevice_dir().empty())));
    }
  }
#endif  // HAVE_CUDA

  return result;
}
std::unordered_set<llvm::Function*> findAliveRuntimeFuncs(
    llvm::Module& llvm_module,
    const std::vector<llvm::Function*>& roots) {
  std::queue<llvm::Function*> queue;
  std::unordered_set<llvm::Function*> visited;
  for (llvm::Function* F : roots) {
    queue.push(F);
  }

  while (!queue.empty()) {
    llvm::Function* F = queue.front();
    queue.pop();
    if (visited.find(F) != visited.end()) {
      continue;
    }
    visited.insert(F);

    for (llvm::inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
      if (llvm::CallInst* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
        if (CI->isInlineAsm()) {  // libdevice calls inline assembly code
          continue;
        }
        llvm::Function* called = CI->getCalledFunction();
        if (!called || visited.find(called) != visited.end()) {
          continue;
        }
        queue.push(called);
      }
    }
  }
  return visited;
}
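
// Links libdevice into the query module, drops every function that did not
// become reachable from the pre-link definitions, enables the
// nvvm-reflect-ftz flag, and runs the NVVM reflect pass over each function.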
void CodeGenerator::linkModuleWithLibdevice(
    Executor* executor,
    llvm::Module& llvm_module,
    llvm::PassManagerBuilder& pass_manager_builder,
    const GPUTarget& gpu_target) {
  if (!executor->has_libdevice_module()) {
    // raise error
    throw std::runtime_error(
        "libdevice library is not available but required by the UDF module");
  }

  // Save the functions currently defined in the module
  std::vector<llvm::Function*> roots;
  for (llvm::Function& fn : llvm_module) {
    if (!fn.isDeclaration()) {
      roots.emplace_back(&fn);
    }
  }

  // Bind libdevice to the current module
  CodeGenerator::link_udf_module(executor->get_libdevice_module(),
                                 llvm_module,
                                 gpu_target.cgen_state,
                                 llvm::Linker::Flags::OverrideFromSrc);

  std::unordered_set<llvm::Function*> live_funcs =
      findAliveRuntimeFuncs(llvm_module, roots);

  std::vector<llvm::Function*> funcs_to_delete;
  for (llvm::Function& fn : llvm_module) {
    if (!live_funcs.count(&fn)) {
      funcs_to_delete.emplace_back(&fn);
    }
  }

  for (llvm::Function* f : funcs_to_delete) {
    f->eraseFromParent();
  }

  // activate nvvm-reflect-ftz flag on the module
#if LLVM_VERSION_MAJOR >= 11
  llvm::LLVMContext& ctx = llvm_module.getContext();
  llvm_module.setModuleFlag(llvm::Module::Override,
                            "nvvm-reflect-ftz",
                            llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
                                llvm::Type::getInt32Ty(ctx), uint32_t(1))));
#else
  llvm_module.addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", uint32_t(1));
#endif
  for (llvm::Function& fn : llvm_module) {
    fn.addFnAttr("nvptx-f32ftz", "true");
  }

  // add the NVVM reflect pass replacing any NVVM conditionals with constants
  gpu_target.nvptx_target_machine->adjustPassManager(pass_manager_builder);
  llvm::legacy::FunctionPassManager FPM(&llvm_module);
  pass_manager_builder.populateFunctionPassManager(FPM);

  // Run the NVVMReflectPass here rather than inside optimize_ir
  FPM.doInitialization();
  for (auto& F : llvm_module) {
    FPM.run(F);
  }
  FPM.doFinalization();
}
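
// GPU compilation entry point: legalizes and optimizes the module, prints it
// to NVVM IR with the runtime declarations appended, lowers it to PTX via the
// NVPTX backend, and JIT-links the resulting cubin once per device.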
std::shared_ptr<GpuCompilationContext> CodeGenerator::generateNativeGPUCode(
    Executor* executor,
    llvm::Function* func,
    llvm::Function* wrapper_func,
    const std::unordered_set<llvm::Function*>& live_funcs,
    const bool is_gpu_smem_used,
    const CompilationOptions& co,
    const GPUTarget& gpu_target) {
  auto llvm_module = func->getParent();

  CHECK(gpu_target.cgen_state->module_ == llvm_module);
  CHECK(func->getParent() == wrapper_func->getParent());
  llvm_module->setDataLayout(
      "e-p:64:64:64-i1:8:8-i8:8:8-"
      "i16:16:16-i32:32:32-i64:64:64-"
      "f32:32:32-f64:64:64-v16:16:16-"
      "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
  llvm_module->setTargetTriple("nvptx64-nvidia-cuda");
  CHECK(gpu_target.nvptx_target_machine);
  llvm::PassManagerBuilder pass_manager_builder = llvm::PassManagerBuilder();

  pass_manager_builder.OptLevel = 0;
  llvm::legacy::PassManager module_pass_manager;
  pass_manager_builder.populateModulePassManager(module_pass_manager);

  bool requires_libdevice = check_module_requires_libdevice(llvm_module);

  if (requires_libdevice) {
    linkModuleWithLibdevice(executor, *llvm_module, pass_manager_builder, gpu_target);
  }

  // run optimizations
  optimize_ir(func, llvm_module, module_pass_manager, live_funcs, is_gpu_smem_used, co);
  legalize_nvvm_ir(func);

  std::stringstream ss;
  llvm::raw_os_ostream os(ss);

  llvm::LLVMContext& ctx = llvm_module->getContext();
  // Get the "nvvm.annotations" metadata node
  llvm::NamedMDNode* md = llvm_module->getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
                               llvm::MDString::get(ctx, "kernel"),
                               llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
                                   llvm::Type::getInt32Ty(ctx), 1))};

  // Append metadata to nvvm.annotations
  md->addOperand(llvm::MDNode::get(ctx, md_vals));
  std::unordered_set<llvm::Function*> roots{wrapper_func, func};
  if (gpu_target.row_func_not_inlined) {
    roots.insert(gpu_target.cgen_state->row_func_);
    if (gpu_target.cgen_state->filter_func_) {
      roots.insert(gpu_target.cgen_state->filter_func_);
    }
  }

  // prevent helper functions from being removed
  for (auto f : gpu_target.cgen_state->helper_functions_) {
    roots.insert(f);
  }

  if (requires_libdevice) {
    for (llvm::Function& F : *llvm_module) {
      // Some libdevice functions call other functions with an "__internal"
      // prefix; keep those definitions around and legalize them in place.
      if (F.hasName() && F.getName().startswith("__internal") && !F.isDeclaration()) {
        roots.insert(&F);
      }
      legalize_nvvm_ir(&F);
    }
  }
  std::unordered_set<std::string> udf_declarations;

  if (executor->has_udf_module(/*is_gpu=*/true)) {
    for (auto& f : executor->get_udf_module(/*is_gpu=*/true)->getFunctionList()) {
      llvm::Function* udf_function = llvm_module->getFunction(f.getName());

      if (udf_function) {
        legalize_nvvm_ir(udf_function);
        roots.insert(udf_function);

        // If the UDF declares an external function, note it so we can avoid
        // duplicate declarations later.
        if (f.isDeclaration()) {
          udf_declarations.insert(f.getName().str());
        }
      }
    }
  }

  if (executor->has_rt_udf_module(/*is_gpu=*/true)) {
    for (auto& f : executor->get_rt_udf_module(/*is_gpu=*/true)->getFunctionList()) {
      llvm::Function* udf_function = llvm_module->getFunction(f.getName());
      if (udf_function) {
        legalize_nvvm_ir(udf_function);
        roots.insert(udf_function);

        // Same bookkeeping for runtime UDF declarations.
        if (f.isDeclaration()) {
          udf_declarations.insert(f.getName().str());
        }
      }
    }
  }
  // Temporarily move all functions outside the root set out of the module so
  // the printed NVVM IR only contains what the kernel actually needs.
  std::vector<llvm::Function*> rt_funcs;
  for (auto& Fn : *llvm_module) {
    if (roots.count(&Fn)) {
      continue;
    }
    rt_funcs.push_back(&Fn);
  }
  for (auto& pFn : rt_funcs) {
    pFn->removeFromParent();
  }

  if (requires_libdevice) {
    add_intrinsics_to_module(llvm_module);
  }

  llvm_module->print(os, nullptr);
  os.flush();

  for (auto& pFn : rt_funcs) {
    llvm_module->getFunctionList().push_back(pFn);
  }
  llvm_module->eraseNamedMetadata(md);
  auto cuda_llir = ss.str() + cuda_rt_decls + extension_function_decls(udf_declarations);
  std::string ptx;
  try {
    ptx = generatePTX(
        cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state->context_);
  } catch (ParseIRError& e) {
    LOG(WARNING) << "Failed to generate PTX: " << e.what()
                 << ". Switching to CPU execution target.";
    throw QueryMustRunOnCpu();
  }
  LOG(PTX) << "PTX for the GPU:\n" << ptx << "\nEnd of PTX";

  auto cubin_result = ptx_to_cubin(ptx, gpu_target.cuda_mgr);
  auto& option_keys = cubin_result.option_keys;
  auto& option_values = cubin_result.option_values;
  auto cubin = cubin_result.cubin;
  auto link_state = cubin_result.link_state;
  const auto num_options = option_keys.size();

  auto func_name = wrapper_func->getName().str();
  auto gpu_compilation_context = std::make_shared<GpuCompilationContext>();
  for (int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
       ++device_id) {
    gpu_compilation_context->addDeviceCode(
        std::make_unique<GpuDeviceCompilationContext>(cubin,
                                                      func_name,
                                                      device_id,
                                                      gpu_target.cuda_mgr,
                                                      num_options,
                                                      &option_keys[0],
                                                      &option_values[0]));
  }

  checkCudaErrors(cuLinkDestroy(link_state));
  return gpu_compilation_context;
}
std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenGPU(
    llvm::Function* query_func,
    llvm::Function* multifrag_query_func,
    std::unordered_set<llvm::Function*>& live_funcs,
    const bool no_inline,
    const CudaMgr_Namespace::CudaMgr* cuda_mgr,
    const bool is_gpu_smem_used,
    const CompilationOptions& co) {
  CHECK(cuda_mgr);
  CodeCacheKey key{serialize_llvm_object(query_func),
                   serialize_llvm_object(cgen_state_->row_func_)};
  if (cgen_state_->filter_func_) {
    key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
  }
  for (const auto helper : cgen_state_->helper_functions_) {
    key.push_back(serialize_llvm_object(helper));
  }

  bool row_func_not_inlined = false;
  if (no_inline) {
    for (auto it = llvm::inst_begin(cgen_state_->row_func_),
              e = llvm::inst_end(cgen_state_->row_func_);
         it != e;
         ++it) {
      if (llvm::isa<llvm::CallInst>(*it)) {
        auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
        if (get_gv_call.getCalledFunction()->getName() == "array_size" ||
            get_gv_call.getCalledFunction()->getName() == "linear_probabilistic_count") {
          row_func_not_inlined = true;
          break;
        }
      }
    }
  }

  initializeNVPTXBackend();
  CodeGenerator::GPUTarget gpu_target{
      nvptx_target_machine_.get(), cuda_mgr, cgen_state_.get(), row_func_not_inlined};
  std::shared_ptr<GpuCompilationContext> compilation_context;

  try {
    compilation_context = CodeGenerator::generateNativeGPUCode(this,
                                                               query_func,
                                                               multifrag_query_func,
                                                               live_funcs,
                                                               is_gpu_smem_used,
                                                               co,
                                                               gpu_target);
  } catch (CudaMgr_Namespace::CudaErrorException& cuda_error) {
    if (cuda_error.getStatus() == CUDA_ERROR_OUT_OF_MEMORY) {
      // Thrown if memory was not able to be allocated on the GPU;
      // retry once after evicting part of the code cache.
      LOG(WARNING) << "Failed to allocate GPU memory for generated code. Evicting "
                   << g_fraction_code_cache_to_evict * 100.
                   << "% of GPU code cache and re-trying.";
      compilation_context = CodeGenerator::generateNativeGPUCode(this,
                                                                 query_func,
                                                                 multifrag_query_func,
                                                                 live_funcs,
                                                                 is_gpu_smem_used,
                                                                 co,
                                                                 gpu_target);
    } else {
      throw;
    }
  }

  return compilation_context;
}
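
// Parses the NVVM IR string back into a module and lowers it to PTX text via
// the NVPTX target machine; parse failures surface the offending IR in the log.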
std::string CodeGenerator::generatePTX(const std::string& cuda_llir,
                                       llvm::TargetMachine* nvptx_target_machine,
                                       llvm::LLVMContext& context) {
  auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir, "", false);

  llvm::SMDiagnostic parse_error;

  auto llvm_module = llvm::parseIR(mem_buff->getMemBufferRef(), parse_error, context);
  if (!llvm_module) {
    LOG(IR) << "CodeGenerator::generatePTX:NVVM IR:\n" << cuda_llir << "\nEnd of NVVM IR";
    throw_parseIR_error(parse_error, "", /*is_gpu=*/true);
  }

  llvm::SmallString<256> code_str;
  llvm::raw_svector_ostream formatted_os(code_str);
  CHECK(nvptx_target_machine);
  {
    llvm::legacy::PassManager ptxgen_pm;
    llvm_module->setDataLayout(nvptx_target_machine->createDataLayout());

#if LLVM_VERSION_MAJOR >= 10
    nvptx_target_machine->addPassesToEmitFile(
        ptxgen_pm, formatted_os, nullptr, llvm::CGFT_AssemblyFile);
#else
    nvptx_target_machine->addPassesToEmitFile(
        ptxgen_pm, formatted_os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
#endif
    ptxgen_pm.run(*llvm_module);
  }

#if LLVM_VERSION_MAJOR >= 11
  return std::string(code_str);
#else
  return code_str.str();
#endif
}
std::unique_ptr<llvm::TargetMachine> CodeGenerator::initializeNVPTXBackend(
    const CudaMgr_Namespace::NvidiaDeviceArch arch) {
  llvm::InitializeAllTargets();
  llvm::InitializeAllTargetMCs();
  llvm::InitializeAllAsmPrinters();
  std::string err;
  auto target = llvm::TargetRegistry::lookupTarget("nvptx64", err);
  if (nullptr == target) {
    LOG(FATAL) << err;
  }
  return std::unique_ptr<llvm::TargetMachine>(
      target->createTargetMachine("nvptx64-nvidia-cuda",
                                  CudaMgr_Namespace::CudaMgr::deviceArchToSM(arch),
                                  "",
                                  llvm::TargetOptions(),
                                  llvm::Reloc::Static));
}
std::string Executor::generatePTX(const std::string& cuda_llir) const {
  return CodeGenerator::generatePTX(
      cuda_llir, nvptx_target_machine_.get(), cgen_state_->context_);
}
void Executor::initializeNVPTXBackend() const {
  if (nvptx_target_machine_) {
    return;
  }
  const auto arch = cudaMgr()->getDeviceArch();
  nvptx_target_machine_ = CodeGenerator::initializeNVPTXBackend(arch);
}
bool CodeGenerator::alwaysCloneRuntimeFunction(const llvm::Function* func) {
  return func->getName() == "query_stub_hoisted_literals" ||
         func->getName() == "multifrag_query_hoisted_literals" ||
         func->getName() == "query_stub" || func->getName() == "multifrag_query" ||
         func->getName() == "fixed_width_int_decode" ||
         func->getName() == "fixed_width_unsigned_decode" ||
         func->getName() == "diff_fixed_width_int_decode" ||
         func->getName() == "fixed_width_double_decode" ||
         func->getName() == "fixed_width_float_decode" ||
         func->getName() == "fixed_width_small_date_decode" ||
         func->getName() == "fixed_width_date_encode" ||
         func->getName() == "record_error_code" || func->getName() == "get_error_code" ||
         func->getName() == "pos_start_impl" || func->getName() == "pos_step_impl" ||
         func->getName() == "group_buff_idx_impl" ||
         func->getName() == "init_shared_mem" ||
         func->getName() == "init_shared_mem_nop" || func->getName() == "write_back_nop";
}
std::unique_ptr<llvm::Module> read_llvm_module_from_bc_filename(
    const std::string& bc_filename,
    llvm::LLVMContext& context) {
  llvm::SMDiagnostic err;

  auto buffer_or_error = llvm::MemoryBuffer::getFile(bc_filename);
  CHECK(!buffer_or_error.getError()) << "bc_filename=" << bc_filename;

  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();

  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
  CHECK(!owner.takeError());
  CHECK(owner->get());
  return std::move(owner.get());
}
std::unique_ptr<llvm::Module> read_llvm_module_from_ir_file(
    const std::string& udf_ir_filename,
    llvm::LLVMContext& ctx,
    bool is_gpu = false) {
  llvm::SMDiagnostic parse_error;

  llvm::StringRef file_name_arg(udf_ir_filename);

  auto owner = llvm::parseIRFile(file_name_arg, parse_error, ctx);
  if (!owner) {
    throw_parseIR_error(parse_error, udf_ir_filename, is_gpu);
  }

  if (is_gpu) {
    llvm::Triple gpu_triple(owner->getTargetTriple());
    if (!gpu_triple.isNVPTX()) {
      LOG(WARNING)
          << "Expected triple nvptx64-nvidia-cuda for NVVM IR of loadtime UDFs but got "
          << gpu_triple.str() << ". Disabling the NVVM IR module.";
      return std::unique_ptr<llvm::Module>();
    }
  }
  return owner;
}
std::unique_ptr<llvm::Module> read_llvm_module_from_ir_string(
    const std::string& udf_ir_string,
    llvm::LLVMContext& ctx,
    bool is_gpu = false) {
  llvm::SMDiagnostic parse_error;

  auto buf = std::make_unique<llvm::MemoryBufferRef>(udf_ir_string,
                                                     "Runtime UDF/UDTF LLVM/NVVM IR");

  auto owner = llvm::parseIR(*buf, parse_error, ctx);
  if (!owner) {
    LOG(IR) << "read_llvm_module_from_ir_string:\n"
            << udf_ir_string << "\nEnd of LLVM/NVVM IR";
    throw_parseIR_error(parse_error, "", is_gpu);
  }

  if (is_gpu) {
    llvm::Triple gpu_triple(owner->getTargetTriple());
    if (!gpu_triple.isNVPTX()) {
      LOG(IR) << "read_llvm_module_from_ir_string:\n"
              << udf_ir_string << "\nEnd of NVVM IR";
      LOG(WARNING) << "Expected triple nvptx64-nvidia-cuda for NVVM IR but got "
                   << gpu_triple.str()
                   << ". Executing runtime UDF/UDTFs on GPU will be disabled.";
      return std::unique_ptr<llvm::Module>();
    }
  }
  return owner;
}
void bind_pos_placeholders(const std::string& pos_fn_name,
                           const bool use_resume_param,
                           llvm::Function* query_func,
                           llvm::Module* llvm_module) {
  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
       ++it) {
    if (!llvm::isa<llvm::CallInst>(*it)) {
      continue;
    }
    auto& pos_call = llvm::cast<llvm::CallInst>(*it);
    if (std::string(pos_call.getCalledFunction()->getName()) == pos_fn_name) {
      if (use_resume_param) {
        const auto error_code_arg = get_arg_by_name(query_func, "error_code");
        llvm::ReplaceInstWithInst(
            &pos_call,
            llvm::CallInst::Create(llvm_module->getFunction(pos_fn_name + "_impl"),
                                   error_code_arg));
      } else {
        llvm::ReplaceInstWithInst(
            &pos_call,
            llvm::CallInst::Create(llvm_module->getFunction(pos_fn_name + "_impl")));
      }
      break;
    }
  }
}
void set_row_func_argnames(llvm::Function* row_func,
                           const size_t in_col_count,
                           const size_t agg_col_count,
                           const bool hoist_literals) {
  auto arg_it = row_func->arg_begin();

  if (agg_col_count) {
    for (size_t i = 0; i < agg_col_count; ++i) {
      arg_it->setName("out");
      ++arg_it;
    }
  } else {
    arg_it->setName("group_by_buff");
    ++arg_it;
    arg_it->setName("varlen_output_buff");
    ++arg_it;
    arg_it->setName("crt_matched");
    ++arg_it;
    arg_it->setName("total_matched");
    ++arg_it;
    arg_it->setName("old_total_matched");
    ++arg_it;
    arg_it->setName("max_matched");
    ++arg_it;
  }

  arg_it->setName("agg_init_val");
  ++arg_it;

  arg_it->setName("pos");
  ++arg_it;

  arg_it->setName("frag_row_off");
  ++arg_it;

  arg_it->setName("num_rows_per_scan");
  ++arg_it;

  if (hoist_literals) {
    arg_it->setName("literals");
    ++arg_it;
  }

  for (size_t i = 0; i < in_col_count; ++i) {
    arg_it->setName("col_buf" + std::to_string(i));
    ++arg_it;
  }

  arg_it->setName("join_hash_tables");
  ++arg_it;
  arg_it->setName("row_func_mgr");
}
llvm::Function* create_row_function(const size_t in_col_count,
                                    const size_t agg_col_count,
                                    const bool hoist_literals,
                                    llvm::Module* llvm_module,
                                    llvm::LLVMContext& context) {
  std::vector<llvm::Type*> row_process_arg_types;

  if (agg_col_count) {
    // output (aggregate) arguments
    for (size_t i = 0; i < agg_col_count; ++i) {
      row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
    }
  } else {
    // group by buffer
    row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
    // varlen output buffer
    row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
    // current match count
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
    // total match count passed from the caller
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
    // old total match count returned to the caller
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
    // max matched (total number of slots in the output buffer)
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
  }

  // aggregate init values
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));

  // position argument
  row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));

  // fragment row offset argument
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));

  // number of rows for each scan
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));

  // literals buffer argument
  if (hoist_literals) {
    row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
  }

  // column buffer arguments
  for (size_t i = 0; i < in_col_count; ++i) {
    row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
  }

  // join hash table argument
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));

  // row function manager
  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));

  // generate the function
  auto ft =
      llvm::FunctionType::get(get_int_type(32, context), row_process_arg_types, false);

  auto row_func = llvm::Function::Create(
      ft, llvm::Function::ExternalLinkage, "row_func", llvm_module);

  // set the row function argument names for debugging convenience
  set_row_func_argnames(row_func, in_col_count, agg_col_count, hoist_literals);

  return row_func;
}
void bind_query(llvm::Function* query_func,
                const std::string& query_fname,
                llvm::Function* multifrag_query_func,
                llvm::Module* llvm_module) {
  std::vector<llvm::CallInst*> query_stubs;
  for (auto it = llvm::inst_begin(multifrag_query_func),
            e = llvm::inst_end(multifrag_query_func);
       it != e;
       ++it) {
    if (!llvm::isa<llvm::CallInst>(*it)) {
      continue;
    }
    auto& query_call = llvm::cast<llvm::CallInst>(*it);
    if (std::string(query_call.getCalledFunction()->getName()) == query_fname) {
      query_stubs.push_back(&query_call);
    }
  }
  for (auto& S : query_stubs) {
    std::vector<llvm::Value*> args;
    for (size_t i = 0; i < S->getNumOperands() - 1; ++i) {
      args.push_back(S->getArgOperand(i));
    }
    llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args, ""));
  }
}
std::vector<std::string> get_agg_fnames(const std::vector<Analyzer::Expr*>& target_exprs,
                                        const bool is_group_by) {
  std::vector<std::string> result;
  for (size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
       ++target_idx, ++agg_col_idx) {
    const auto target_expr = target_exprs[target_idx];
    CHECK(target_expr);
    const auto target_type_info = target_expr->get_type_info();
    const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
    const bool is_varlen =
        (target_type_info.is_string() &&
         target_type_info.get_compression() == kENCODING_NONE) ||
        target_type_info.is_array();
    if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
      result.emplace_back(target_type_info.is_fp() ? "agg_id_double" : "agg_id");
      if (is_varlen) {
        result.emplace_back("agg_id");
      }
      if (target_type_info.is_geometry()) {
        result.emplace_back("agg_id");
        for (auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
          result.emplace_back("agg_id");
        }
      }
      continue;
    }
    const auto agg_type = agg_expr->get_aggtype();
    SQLTypeInfo agg_type_info;
    switch (agg_type) {
      case kCOUNT:
      case kCOUNT_IF:
        agg_type_info = target_type_info;
        break;
      default:
        agg_type_info = agg_expr->get_arg()->get_type_info();
        break;
    }
    switch (agg_type) {
      case kAVG: {
        if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
            !agg_type_info.is_fp()) {
          throw std::runtime_error("AVG is only valid on integer and floating point");
        }
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_sum"
                                : "agg_sum_double");
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_count"
                                : "agg_count_double");
        break;
      }
      case kMIN: {
        if (agg_type_info.is_string() || agg_type_info.is_array() ||
            agg_type_info.is_geometry()) {
          throw std::runtime_error(
              "MIN on strings, arrays or geospatial types not supported yet");
        }
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_min"
                                : "agg_min_double");
        break;
      }
      case kMAX: {
        if (agg_type_info.is_string() || agg_type_info.is_array() ||
            agg_type_info.is_geometry()) {
          throw std::runtime_error(
              "MAX on strings, arrays or geospatial types not supported yet");
        }
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_max"
                                : "agg_max_double");
        break;
      }
      case kSUM:
      case kSUM_IF: {
        if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
            !agg_type_info.is_fp()) {
          throw std::runtime_error(
              "SUM and SUM_IF is only valid on integer and floating point");
        }
        std::string func_name = (agg_type_info.is_integer() || agg_type_info.is_time())
                                    ? "agg_sum"
                                    : "agg_sum_double";
        if (agg_type == kSUM_IF) {
          func_name += "_if";
        }
        result.emplace_back(func_name);
        break;
      }
      case kCOUNT:
        result.emplace_back(agg_expr->get_is_distinct() ? "agg_count_distinct"
                                                        : "agg_count");
        break;
      case kCOUNT_IF:
        result.emplace_back("agg_count_if");
        break;
      case kSINGLE_VALUE:
        result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
        break;
      case kSAMPLE:
        result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
        break;
      case kAPPROX_COUNT_DISTINCT:
        result.emplace_back("agg_approximate_count_distinct");
        break;
      case kAPPROX_QUANTILE:
        result.emplace_back("agg_approx_quantile");
        break;
      case kMODE:
        result.emplace_back("agg_mode_func");
        break;
      default:
        UNREACHABLE() << "Unsupported agg_type: " << agg_type;
    }
  }
  return result;
}
void Executor::addUdfIrToModule(const std::string& udf_ir_filename,
                                const bool is_cuda_ir) {
  Executor::extension_module_sources
      [is_cuda_ir ? Executor::ExtModuleKinds::udf_gpu_module
                  : Executor::ExtModuleKinds::udf_cpu_module] = udf_ir_filename;
}
std::unordered_set<llvm::Function*> CodeGenerator::markDeadRuntimeFuncs(
    llvm::Module& llvm_module,
    const std::vector<llvm::Function*>& roots,
    const std::vector<llvm::Function*>& leaves) {
  std::unordered_set<llvm::Function*> live_funcs;
  live_funcs.insert(roots.begin(), roots.end());
  live_funcs.insert(leaves.begin(), leaves.end());

  if (auto F = llvm_module.getFunction("init_shared_mem_nop")) {
    live_funcs.insert(F);
  }
  if (auto F = llvm_module.getFunction("write_back_nop")) {
    live_funcs.insert(F);
  }

  for (const llvm::Function* F : roots) {
    for (const llvm::BasicBlock& BB : *F) {
      for (const llvm::Instruction& I : BB) {
        if (const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
          live_funcs.insert(CI->getCalledFunction());
        }
      }
    }
  }

  for (llvm::Function& F : llvm_module) {
    if (!live_funcs.count(&F) && !F.isDeclaration()) {
      F.setLinkage(llvm::GlobalValue::InternalLinkage);
    }
  }

  return live_funcs;
}
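
// Scans the (optionally named) basic blocks of `func` for the first
// instruction of type InstType whose value name matches `variable_name`.
// Used below to locate the row count load in the entry block, e.g.:
//   auto row_count = find_variable_in_basic_block<llvm::LoadInst>(
//       query_func, ".entry", "row_count");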
template <typename InstType>
llvm::Value* find_variable_in_basic_block(llvm::Function* func,
                                          std::string bb_name,
                                          std::string variable_name) {
  llvm::Value* result = nullptr;
  if (func == nullptr || variable_name.empty()) {
    return result;
  }
  bool is_found = false;
  for (auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
    if (!bb_name.empty() && bb_it->getName() != bb_name) {
      continue;
    }
    for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
      if (llvm::isa<InstType>(*inst_it)) {
        if (inst_it->getName() == variable_name) {
          result = &*inst_it;
          is_found = true;
          break;
        }
      }
    }
  }
  return result;
}
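
// Splits each basic block after its row_process call and routes the returned
// error code through an .error_check / .error_exit control-flow graph. On
// request it also wires in dynamic-watchdog or runtime-interrupt probes; the
// two are mutually exclusive, with the watchdog taking precedence.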
void Executor::createErrorCheckControlFlow(
    llvm::Function* query_func,
    bool run_with_dynamic_watchdog,
    bool run_with_allowing_runtime_interrupt,
    const std::vector<JoinLoop>& join_loops,
    ExecutorDeviceType device_type,
    const std::vector<InputTableInfo>& input_table_infos) {
  // check whether the row processing was successful; currently, it can
  // fail by running out of group by buffer slots

  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
    // when both the dynamic watchdog and the runtime interrupt are enabled,
    // use the dynamic watchdog
    run_with_allowing_runtime_interrupt = false;
  }

  {
    // disable injecting the query interrupt checker if the session is invalid
    mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
    if (current_query_session_.empty()) {
      run_with_allowing_runtime_interrupt = false;
    }
  }

  llvm::Value* row_count = nullptr;
  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
      device_type == ExecutorDeviceType::GPU) {
    row_count =
        find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
  }
  bool done_splitting = false;
  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
       ++bb_it) {
    llvm::Value* pos = nullptr;
    for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
      if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
          llvm::isa<llvm::PHINode>(*inst_it)) {
        if (inst_it->getName() == "pos") {
          pos = &*inst_it;
        }
        continue;
      }
      if (!llvm::isa<llvm::CallInst>(*inst_it)) {
        continue;
      }
      auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
      if (std::string(row_func_call.getCalledFunction()->getName()) == "row_process") {
        auto next_inst_it = inst_it;
        ++next_inst_it;
        auto new_bb = bb_it->splitBasicBlock(next_inst_it);
        auto& br_instr = bb_it->back();
        llvm::IRBuilder<> ir_builder(&br_instr);
        llvm::Value* err_lv = &*inst_it;
        llvm::Value* err_lv_returned_from_row_func = nullptr;
        if (run_with_dynamic_watchdog) {
          CHECK(pos);
          llvm::Value* call_watchdog_lv = nullptr;
          if (device_type == ExecutorDeviceType::GPU) {
            // In order to make sure all threads within a block see the same barrier,
            // only blocks where no thread has crossed the critical edge go through
            // the dynamic watchdog computation.
            CHECK(row_count);
            auto crit_edge_rem =
                (blockSize() & (blockSize() - 1))
                    ? ir_builder.CreateSRem(
                          row_count,
                          cgen_state_->llInt(static_cast<int64_t>(blockSize())))
                    : ir_builder.CreateAnd(
                          row_count,
                          cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
            auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
            crit_edge_threshold->setName("crit_edge_threshold");

            // only threads where pos < crit_edge_threshold go through the
            // dynamic watchdog computation
            call_watchdog_lv =
                ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
          } else {
            // CPU path: run the watchdog for every 64th row
            auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
            call_watchdog_lv = ir_builder.CreateICmp(
                llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
          }
          CHECK(call_watchdog_lv);
          auto error_check_bb = bb_it->splitBasicBlock(
              llvm::BasicBlock::iterator(br_instr), ".error_check");
          auto& watchdog_br_instr = bb_it->back();

          auto watchdog_check_bb = llvm::BasicBlock::Create(
              cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
          llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
          auto detected_timeout = watchdog_ir_builder.CreateCall(
              cgen_state_->module_->getFunction("dynamic_watchdog"), {});
          auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
              detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
          watchdog_ir_builder.CreateBr(error_check_bb);

          llvm::ReplaceInstWithInst(
              &watchdog_br_instr,
              llvm::BranchInst::Create(
                  watchdog_check_bb, error_check_bb, call_watchdog_lv));
          ir_builder.SetInsertPoint(&br_instr);
          auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);

          unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
          unified_err_lv->addIncoming(err_lv, &*bb_it);
          err_lv = unified_err_lv;
        } else if (run_with_allowing_runtime_interrupt) {
          CHECK(pos);
          llvm::Value* call_check_interrupt_lv{nullptr};
          llvm::Value* interrupt_err_lv{nullptr};
          llvm::BasicBlock* error_check_bb{nullptr};
          llvm::BasicBlock* interrupt_check_bb{nullptr};
          llvm::Instruction* check_interrupt_br_instr{nullptr};

          auto has_loop_join = std::any_of(
              join_loops.begin(), join_loops.end(), [](const JoinLoop& join_loop) {
                return join_loop.isNestedLoopJoin();
              });
          auto codegen_interrupt_checker = [&]() {
            error_check_bb = bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr),
                                                    ".error_check");
            check_interrupt_br_instr = &bb_it->back();

            interrupt_check_bb = llvm::BasicBlock::Create(
                cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
            llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
            auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
                cgen_state_->module_->getFunction("check_interrupt"), {});
            interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
                detected_interrupt,
                cgen_state_->llInt(Executor::ERR_INTERRUPTED),
                err_lv);
            interrupt_checker_ir_builder.CreateBr(error_check_bb);
          };
          if (has_loop_join) {
            codegen_interrupt_checker();
            CHECK(interrupt_check_bb);
            CHECK(check_interrupt_br_instr);
            llvm::ReplaceInstWithInst(check_interrupt_br_instr,
                                      llvm::BranchInst::Create(interrupt_check_bb));
            ir_builder.SetInsertPoint(&br_instr);
            err_lv = interrupt_err_lv;
          } else {
            if (device_type == ExecutorDeviceType::GPU) {
              // Each GPU thread strides by gridDim * blockDim, so shifting pos
              // right by the combined exponent counts completed iterations.
              auto num_shift_by_gridDim = shared::getExpOfTwo(gridSize());
              auto num_shift_by_blockDim = shared::getExpOfTwo(blockSize());
              int64_t total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
              uint64_t interrupt_checking_freq = 32;
              auto freq_control_knob = g_running_query_interrupt_freq;
              CHECK_GT(freq_control_knob, 0);
              CHECK_LE(freq_control_knob, 1.0);
              if (!input_table_infos.empty()) {
                const auto& outer_table_info = *input_table_infos.begin();
                auto num_outer_table_tuples =
                    outer_table_info.info.getFragmentNumTuplesUpperBound();
                if (num_outer_table_tuples > 0) {
                  // gridSize * blockSize --> the total number of threads;
                  // max_inc bounds how many checks each thread could perform
                  auto max_inc = uint64_t(
                      floor(num_outer_table_tuples / (gridSize() * blockSize() * 2)));
                  if (max_inc < 2) {
                    max_inc = 2;
                  }
                  // calibrate the frequency with the user-provided control knob
                  auto calibrated_inc =
                      uint64_t(floor(max_inc * (1 - freq_control_knob)));
                  interrupt_checking_freq =
                      uint64_t(pow(2, shared::getExpOfTwo(calibrated_inc)));
                  // keep the frequency within a sane range
                  if (interrupt_checking_freq > max_inc) {
                    interrupt_checking_freq = max_inc / 2;
                  }
                  if (interrupt_checking_freq < 8) {
                    // the minimum interrupt checking frequency is eight
                    interrupt_checking_freq = 8;
                  }
                }
              }
              VLOG(1) << "Set the running query interrupt checking frequency: "
                      << interrupt_checking_freq;
              // check the interrupt flag every interrupt_checking_freq-th iteration
              llvm::Value* pos_shifted_per_iteration =
                  ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
              auto interrupt_predicate = ir_builder.CreateAnd(pos_shifted_per_iteration,
                                                              interrupt_checking_freq);
              call_check_interrupt_lv =
                  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
                                        interrupt_predicate,
                                        cgen_state_->llInt(int64_t(0LL)));
            } else {
              // CPU path: check the interrupt flag for every 64th row
              auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
              call_check_interrupt_lv =
                  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
                                        interrupt_predicate,
                                        cgen_state_->llInt(int64_t(0LL)));
            }
            codegen_interrupt_checker();
            CHECK(call_check_interrupt_lv);
            CHECK(interrupt_err_lv);
            CHECK(interrupt_check_bb);
            CHECK(error_check_bb);
            CHECK(check_interrupt_br_instr);
            llvm::ReplaceInstWithInst(
                check_interrupt_br_instr,
                llvm::BranchInst::Create(
                    interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
            ir_builder.SetInsertPoint(&br_instr);
            auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);

            unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
            unified_err_lv->addIncoming(err_lv, &*bb_it);
            err_lv = unified_err_lv;
          }
        }
      if (!err_lv_returned_from_row_func) {
        err_lv_returned_from_row_func = err_lv;
      }
      if (device_type == ExecutorDeviceType::GPU && run_with_dynamic_watchdog) {
        // On the GPU, let the kernel finish regardless of the observed error
        // unless the dynamic watchdog fired, in which case all threads within
        // the block return together.
        err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
                                       err_lv,
                                       cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
      } else {
        err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
                                       err_lv,
                                       cgen_state_->llInt(static_cast<int32_t>(0)));
      }
      auto error_bb = llvm::BasicBlock::Create(
          cgen_state_->context_, ".error_exit", query_func, new_bb);
      const auto error_code_arg = get_arg_by_name(query_func, "error_code");
      llvm::CallInst::Create(
          cgen_state_->module_->getFunction("record_error_code"),
          std::vector<llvm::Value*>{err_lv_returned_from_row_func, error_code_arg},
          "",
          error_bb);
      llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
      llvm::ReplaceInstWithInst(&br_instr,
                                llvm::BranchInst::Create(error_bb, new_bb, err_lv));
      done_splitting = true;
      break;
    }
  }
  CHECK(done_splitting);
}
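// AutoTrackBuffersInRuntimeIR: when the runtime module carries the
// 'manage_memory_buffer' flag, every allocate_varlen_buffer call gets a
// companion register_buffer_with_executor_rsm call so the Executor's row set
// memory manager can reclaim the buffer after the query.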
void Executor::AutoTrackBuffersInRuntimeIR() {
  llvm::Module* M = cgen_state_->module_;
  if (M->getFunction("allocate_varlen_buffer") == nullptr) {
    return;
  }

  // Read the module metadata flag.
  bool should_track = false;
  auto* flag = M->getModuleFlag("manage_memory_buffer");
  if (auto* cnt = llvm::mdconst::extract_or_null<llvm::ConstantInt>(flag)) {
    if (cnt->getZExtValue() == 1) {
      should_track = true;
    }
  }
  if (!should_track) {
    return;
  }

  LOG(INFO) << "Found 'manage_memory_buffer' metadata.";
  llvm::SmallVector<llvm::CallInst*, 4> calls_to_analyze;
  for (llvm::Function& F : *M) {
    for (llvm::BasicBlock& BB : F) {
      for (llvm::Instruction& I : BB) {
        if (llvm::CallInst* CI = llvm::dyn_cast<llvm::CallInst>(&I)) {
          llvm::Function* called = CI->getCalledFunction();
          if (called && called->getName() == "allocate_varlen_buffer") {
            calls_to_analyze.push_back(CI);
          }
        }
      }
    }
  }

  // Declare void register_buffer_with_executor_rsm(i64 executor, i8* buffer).
  llvm::IRBuilder<> Builder(cgen_state_->context_);
  auto i64 = get_int_type(64, cgen_state_->context_);
  auto i8p = get_int_ptr_type(8, cgen_state_->context_);
  auto void_ = llvm::Type::getVoidTy(cgen_state_->context_);
  llvm::FunctionType* fnty = llvm::FunctionType::get(void_, {i64, i8p}, false);
  llvm::FunctionCallee register_buffer_fn =
      M->getOrInsertFunction("register_buffer_with_executor_rsm", fnty, {});

  int64_t executor_addr = reinterpret_cast<int64_t>(this);
  for (llvm::CallInst* CI : calls_to_analyze) {
    // Register the buffer right after the allocation, unless a registration
    // call is already present among the users of the allocation.
    bool already_registered = false;  // flag name reconstructed from context
    for (llvm::User* U : CI->users()) {
      if (llvm::CallInst* call = llvm::dyn_cast<llvm::CallInst>(U)) {
        if (call->getCalledFunction() &&
            call->getCalledFunction()->getName() ==
                "register_buffer_with_executor_rsm") {
          already_registered = true;
          break;
        }
      }
    }
    if (!already_registered) {
      Builder.SetInsertPoint(CI->getNextNode());
      Builder.CreateCall(register_buffer_fn,
                         {ll_int(executor_addr, cgen_state_->context_), CI});
    }
  }
}
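// inlineHoistedLiterals: literal loads are emitted in the query function; to
// make their values visible inside the row (and filter) function, both are
// recreated with one extra argument per hoisted literal, and the
// "__placeholder__literal_*" instructions in their bodies are rewired to
// those arguments.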
std::vector<llvm::Value*> Executor::inlineHoistedLiterals() {
  std::vector<llvm::Value*> hoisted_literals;

  // The row function uses literals whose definitions were hoisted up to the
  // query function; extend its signature to receive those literal values.
  std::vector<llvm::Type*> row_process_arg_types;

  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
                                    E = cgen_state_->row_func_->arg_end();
       I != E;
       ++I) {
    row_process_arg_types.push_back(I->getType());
  }

  for (auto& element : cgen_state_->query_func_literal_loads_) {
    for (auto value : element.second) {
      row_process_arg_types.push_back(value->getType());
    }
  }

  auto ft = llvm::FunctionType::get(
      get_int_type(32, cgen_state_->context_), row_process_arg_types, false);
  auto row_func_with_hoisted_literals =
      llvm::Function::Create(ft,
                             llvm::Function::ExternalLinkage,
                             "row_func_hoisted_literals",
                             cgen_state_->row_func_->getParent());

  // Carry the argument names over to the new function.
  auto row_func_arg_it = row_func_with_hoisted_literals->arg_begin();
  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
                                    E = cgen_state_->row_func_->arg_end();
       I != E;
       ++I) {
    row_func_arg_it->setName(I->getName());
    ++row_func_arg_it;
  }
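  // The filter function, when enabled, gets the same treatment below: a clone
  // whose signature is extended by one argument per hoisted literal.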
  decltype(row_func_with_hoisted_literals) filter_func_with_hoisted_literals{nullptr};
  decltype(row_func_arg_it) filter_func_arg_it{nullptr};
  if (cgen_state_->filter_func_) {
    // The filter function is in use; extend its signature as well.
    std::vector<llvm::Type*> filter_func_arg_types;

    for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
                                      E = cgen_state_->filter_func_->arg_end();
         I != E;
         ++I) {
      filter_func_arg_types.push_back(I->getType());
    }

    for (auto& element : cgen_state_->query_func_literal_loads_) {
      for (auto value : element.second) {
        filter_func_arg_types.push_back(value->getType());
      }
    }

    auto ft2 = llvm::FunctionType::get(
        get_int_type(32, cgen_state_->context_), filter_func_arg_types, false);
    filter_func_with_hoisted_literals =
        llvm::Function::Create(ft2,
                               llvm::Function::ExternalLinkage,
                               "filter_func_hoisted_literals",
                               cgen_state_->filter_func_->getParent());

    filter_func_arg_it = filter_func_with_hoisted_literals->arg_begin();
    for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
                                      E = cgen_state_->filter_func_->arg_end();
         I != E;
         ++I) {
      filter_func_arg_it->setName(I->getName());
      ++filter_func_arg_it;
    }
  }
  std::unordered_map<int, std::vector<llvm::Value*>>
      query_func_literal_loads_function_arguments,
      query_func_literal_loads_function_arguments2;

  for (auto& element : cgen_state_->query_func_literal_loads_) {
    std::vector<llvm::Value*> argument_values, argument_values2;

    for (auto value : element.second) {
      hoisted_literals.push_back(value);
      argument_values.push_back(&*row_func_arg_it);
      if (cgen_state_->filter_func_) {
        argument_values2.push_back(&*filter_func_arg_it);
        cgen_state_->filter_func_args_[&*row_func_arg_it] = &*filter_func_arg_it;
      }
      if (value->hasName()) {
        row_func_arg_it->setName("arg_" + value->getName());
        if (cgen_state_->filter_func_) {
          filter_func_arg_it->getContext();
          filter_func_arg_it->setName("arg_" + value->getName());
        }
      }
      ++row_func_arg_it;
      if (cgen_state_->filter_func_) {
        ++filter_func_arg_it;
      }
    }

    query_func_literal_loads_function_arguments[element.first] = argument_values;
    query_func_literal_loads_function_arguments2[element.first] = argument_values2;
  }
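  // At this point the hoisted-literals functions exist but are empty: move
  // the original bodies over, then rewrite the literal placeholders to the
  // newly added arguments.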
  row_func_with_hoisted_literals->getBasicBlockList().splice(
      row_func_with_hoisted_literals->begin(),
      cgen_state_->row_func_->getBasicBlockList());

  // Also replace the original arguments with the new ones.
  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
                                    E = cgen_state_->row_func_->arg_end(),
                                    I2 = row_func_with_hoisted_literals->arg_begin();
       I != E;
       ++I, ++I2) {
    I->replaceAllUsesWith(&*I2);
    cgen_state_->filter_func_args_.replace(&*I, &*I2);
  }

  cgen_state_->row_func_ = row_func_with_hoisted_literals;

  // Find and replace the placeholders with the hoisted literal arguments.
  std::vector<llvm::Instruction*> placeholders;
  std::string prefix("__placeholder__literal_");
  for (auto it = llvm::inst_begin(row_func_with_hoisted_literals),
            e = llvm::inst_end(row_func_with_hoisted_literals);
       it != e;
       ++it) {
    if (it->hasName() && it->getName().startswith(prefix)) {
      auto offset_and_index_entry =
          cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
      CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
      int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
      int lit_idx = offset_and_index_entry->second.index_of_literal_load;
      it->replaceAllUsesWith(
          query_func_literal_loads_function_arguments[lit_off][lit_idx]);
      placeholders.push_back(&*it);
    }
  }
  for (auto placeholder : placeholders) {
    placeholder->removeFromParent();
  }
  if (cgen_state_->filter_func_) {
    // Move the filter function body into its hoisted-literals variant and
    // rewrite the placeholders the same way.
    filter_func_with_hoisted_literals->getBasicBlockList().splice(
        filter_func_with_hoisted_literals->begin(),
        cgen_state_->filter_func_->getBasicBlockList());

    // Also replace the original arguments with the new ones.
    for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
                                      E = cgen_state_->filter_func_->arg_end(),
                                      I2 = filter_func_with_hoisted_literals->arg_begin();
         I != E;
         ++I, ++I2) {
      I->replaceAllUsesWith(&*I2);
    }

    cgen_state_->filter_func_ = filter_func_with_hoisted_literals;

    std::vector<llvm::Instruction*> placeholders;
    std::string prefix("__placeholder__literal_");
    for (auto it = llvm::inst_begin(filter_func_with_hoisted_literals),
              e = llvm::inst_end(filter_func_with_hoisted_literals);
         it != e;
         ++it) {
      if (it->hasName() && it->getName().startswith(prefix)) {
        auto offset_and_index_entry = cgen_state_->row_func_hoisted_literals_.find(
            llvm::dyn_cast<llvm::Value>(&*it));
        CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
        int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
        int lit_idx = offset_and_index_entry->second.index_of_literal_load;
        it->replaceAllUsesWith(
            query_func_literal_loads_function_arguments2[lit_off][lit_idx]);
        placeholders.push_back(&*it);
      }
    }
    for (auto placeholder : placeholders) {
      placeholder->removeFromParent();
    }
  }

  return hoisted_literals;
}
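// GPU shared-memory heuristics: get_shared_memory_size() sizes the shared
// output buffer as row size times entry count, and is_gpu_shared_mem_supported()
// gates the optimization on device capability, query shape, and buffer size.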
size_t get_shared_memory_size(const bool shared_mem_used,
                              const QueryMemoryDescriptor* query_mem_desc_ptr) {
  return shared_mem_used
             ? (query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount())
             : 0;
}

bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor* query_mem_desc_ptr,
                                 const RelAlgExecutionUnit& ra_exe_unit,
                                 const CudaMgr_Namespace::CudaMgr* cuda_mgr,
                                 const ExecutorDeviceType device_type,
                                 const unsigned cuda_blocksize,
                                 const unsigned num_blocks_per_mp) {
  CHECK(query_mem_desc_ptr);
  // ... non-grouped aggregates: every output entry must map onto a thread of
  // a single block, and all targets must be fixed-length with aggregate kinds
  // from the supported set (supported_aggs, declared in code elided here).
  if (cuda_blocksize < query_mem_desc_ptr->getEntryCount()) {
    return false;
  }
  const auto target_infos =
      target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
  if (std::find_if(target_infos.begin(), target_infos.end(),
                   [&supported_aggs](const TargetInfo& ti) {
                     return ti.sql_type.is_varlen() ||
                            !supported_aggs.count(ti.agg_kind);
                   }) == target_infos.end()) {
    return true;
  }
  // ... group-by aggregates: same entry-count constraint, and the output
  // buffer must also fit under the shared-memory threshold.
  if (cuda_blocksize < query_mem_desc_ptr->getEntryCount()) {
    return false;
  }
  const size_t shared_memory_threshold_bytes = std::min(/* ... */);
  const auto output_buffer_size = /* ... */;
  if (output_buffer_size > shared_memory_threshold_bytes) {
    return false;
  }
  const auto target_infos = /* ... */;
  if (std::find_if(target_infos.begin(), target_infos.end(),
                   [&supported_aggs](const TargetInfo& ti) {
                     return ti.sql_type.is_varlen() ||
                            !supported_aggs.count(ti.agg_kind);
                   }) == target_infos.end()) {
    return true;
  }
  // ...
}
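// serialize_llvm_metadata_footnotes: collects every metadata node referenced
// from the query, row, and filter functions and appends their textual
// definitions ("!N = ...") to the serialized IR, sorted by metadata id.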
std::string serialize_llvm_metadata_footnotes(llvm::Function* query_func,
                                              CgenState* cgen_state) {
  std::string llvm_ir;
  std::unordered_set<llvm::MDNode*> md;

  // Metadata attached to the query function's instructions.
  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
    for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
      llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
      instr_it->getAllMetadata(imd);
      for (auto [kind, node] : imd) {
        md.insert(node);
      }
    }
  }

  // Same for the row function.
  for (auto bb_it = cgen_state->row_func_->begin();
       bb_it != cgen_state->row_func_->end();
       ++bb_it) {
    for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
      llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
      instr_it->getAllMetadata(imd);
      for (auto [kind, node] : imd) {
        md.insert(node);
      }
    }
  }

  // And the filter function, when present.
  if (cgen_state->filter_func_) {
    for (auto bb_it = cgen_state->filter_func_->begin();
         bb_it != cgen_state->filter_func_->end();
         ++bb_it) {
      for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
        llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
        instr_it->getAllMetadata(imd);
        for (auto [kind, node] : imd) {
          md.insert(node);
        }
      }
    }
  }

  // Print each node keyed by its numeric id ("!N ..."), then emit in order.
  std::map<size_t, std::string> sorted_strings;
  for (auto p : md) {
    std::string str;
    llvm::raw_string_ostream os(str);
    p->print(os, cgen_state->module_, true);
    os.flush();
    auto fields = split(str, {}, 1);
    if (fields.empty() || fields[0].empty()) {
      continue;
    }
    sorted_strings.emplace(std::stoul(fields[0].substr(1)), str);
  }
  for (auto [id, text] : sorted_strings) {
    llvm_ir += text;
    llvm_ir += "\n";
  }

  return llvm_ir;
}
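// compileWorkUnit: top-level code generation entry point. Builds the
// query/row/filter function triple for one execution unit, adds error and
// interrupt checks, optimizes the module, and dispatches to the CPU or GPU
// backend.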
std::tuple<CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
Executor::compileWorkUnit(const std::vector<InputTableInfo>& query_infos,
                          const PlanState::DeletedColumnsMap& deleted_cols_map,
                          const RelAlgExecutionUnit& ra_exe_unit,
                          const CompilationOptions& co,
                          const ExecutionOptions& eo,
                          const CudaMgr_Namespace::CudaMgr* cuda_mgr,
                          const bool allow_lazy_fetch,
                          std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
                          const size_t max_groups_buffer_entry_guess,
                          const int8_t crt_min_byte_width,
                          const bool has_cardinality_estimation,
                          ColumnCacheMap& column_cache,
                          RenderInfo* render_info) {
  // ...
  static std::uint64_t counter = 0;
  ++counter;
  VLOG(1) << "CODEGEN #" << counter << ":";
  LOG(IR) << "CODEGEN #" << counter << ":";
  LOG(PTX) << "CODEGEN #" << counter << ":";
  LOG(ASM) << "CODEGEN #" << counter << ":";
  addTransientStringLiterals(ra_exe_unit, row_set_mem_owner);

  // ... (elided: GroupByAndAggregate setup and query memory descriptor
  // construction; the entry-count guess is only trusted when a cardinality
  // estimation is available:
  //   has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
  //                              : std::nullopt
  // together with max_groups_buffer_entry_guess and crt_min_byte_width.)

  // Baseline group-by without a cardinality estimation (and not an in-situ
  // render) must run the estimator first.
  if (/* ... */
      !has_cardinality_estimation && (!render_info || !render_info->isInSitu()) &&
      /* ... */) {
    const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
    // ... (throws CardinalityEstimationRequired)
  }

  const bool output_columnar = query_mem_desc->didOutputColumnar();
  const bool gpu_shared_mem_optimization =
      is_gpu_shared_mem_supported(query_mem_desc.get(),
                                  ra_exe_unit,
                                  cuda_mgr,
                                  co.device_type,
                                  cuda_mgr ? this->blockSize() : 1,
                                  cuda_mgr ? this->numBlocksPerMP() : 1);
  if (gpu_shared_mem_optimization) {
    // ...
    LOG(DEBUG1) << "GPU shared memory is used for the " + /* ... */;
  }
  // ... (gpu_smem_context is built here from get_shared_memory_size())

  // GPU-only check: count-distinct descriptors that cannot run on device
  // punt the query to CPU.
  const size_t num_count_distinct_descs = /* ... */;
  for (size_t i = 0; i < num_count_distinct_descs; i++) {
    const auto& count_distinct_descriptor = /* ... */;
    // ...
  }

  // Varlen projection via SAMPLE over a multi-fragment table on multiple
  // GPUs is unsupported; such queries are punted to CPU as well.
  for (const auto expr : ra_exe_unit.target_exprs) {
    if (auto gby_expr = dynamic_cast<Analyzer::AggExpr*>(expr)) {
      bool has_multiple_gpus = cuda_mgr ? cuda_mgr->getDeviceCount() > 1 : false;
      if (gby_expr->get_aggtype() == SQLAgg::kSAMPLE && has_multiple_gpus) {
        std::set<const Analyzer::ColumnVar*,
                 bool (*)(const Analyzer::ColumnVar*, const Analyzer::ColumnVar*)>
            colvar_set(Analyzer::ColumnVar::colvar_comp);
        gby_expr->collect_column_var(colvar_set, true);
        for (const auto cv : colvar_set) {
          if (cv->get_type_info().is_varlen()) {
            const auto tbl_key = cv->getTableKey();
            std::for_each(query_infos.begin(),
                          query_infos.end(),
                          [&tbl_key](const InputTableInfo& input_table_info) {
                            if (input_table_info.table_key == tbl_key &&
                                input_table_info.info.fragments.size() > 1) {
                              // ... (throws QueryMustRunOnCpu)
                            }
                          });
          }
        }
      }
    }
  }
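  // Codegen starts from a shallow copy of the pre-built runtime module, so
  // runtime functions can be specialized per query without re-parsing the
  // runtime bitcode.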
  CHECK(cgen_state_->module_ == nullptr);
  cgen_state_->set_module_shallow_copy(get_rt_module(), /*always_clone=*/true);
  // ... (is_gpu, agg_fnames, and is_group_by are derived in code elided here)
  if (has_udf_module(is_gpu)) {
    CodeGenerator::link_udf_module(
        get_udf_module(is_gpu), *cgen_state_->module_, cgen_state_.get());
  }
  if (has_rt_udf_module(is_gpu)) {
    CodeGenerator::link_udf_module(
        get_rt_udf_module(is_gpu), *cgen_state_->module_, cgen_state_.get());
  }

  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();

  // Instantiate the query template: the group-by variant or the plain
  // aggregation one.
  auto [query_func, row_func_call] = is_group_by
                                         ? query_group_by_template(/* ... */)
                                         : query_template(cgen_state_->module_,
                                                          agg_slot_count,
                                                          co.hoist_literals,
                                                          !!ra_exe_unit.estimator,
                                                          gpu_smem_context);
  // ...
  cgen_state_->query_func_ = query_func;
  cgen_state_->row_func_call_ = row_func_call;
  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
      &query_func->getEntryBlock().front());

  // Generate the column head fetches up front so that double indirection
  // isn't needed in the inner loop.
  auto& fetch_bb = query_func->front();
  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
                                              query_func->args().begin(),
                                              fetch_ir_builder,
                                              cgen_state_->context_);

  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
                                               is_group_by ? 0 : agg_slot_count,
                                               co.hoist_literals,
                                               cgen_state_->module_,
                                               cgen_state_->context_);
  CHECK(cgen_state_->row_func_);
  cgen_state_->row_func_bb_ =
      llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);

  // Create the (optional) filter function, an i32() helper that the row
  // function delegates qualifier evaluation to.
  auto filter_func_ft =
      llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
  cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
                                                     llvm::Function::ExternalLinkage,
                                                     "filter_func",
                                                     cgen_state_->module_);
  CHECK(cgen_state_->filter_func_);
  cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
      cgen_state_->context_, "entry", cgen_state_->filter_func_);
  cgen_state_->current_func_ = cgen_state_->row_func_;
  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);

  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
  // ...
  const auto join_loops =
      buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
  // ...
  for (const auto& simple_qual : ra_exe_unit.simple_quals) {
    plan_state_->addSimpleQual(simple_qual);
  }
  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
  if (is_not_deleted_bb) {
    cgen_state_->row_func_bb_ = is_not_deleted_bb;
  }
  if (!join_loops.empty()) {
    codegenJoinLoops(join_loops,
                     body_execution_unit,
                     group_by_and_aggregate,
                     query_func,
                     cgen_state_->row_func_bb_,
                     *query_mem_desc,
                     co,
                     eo);
  } else {
    const bool can_return_error = compileBody(
        ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
    if (can_return_error /* ... or watchdog / runtime interrupt enabled ... */) {
      createErrorCheckControlFlow(query_func,
                                  eo.with_dynamic_watchdog,
                                  eo.allow_runtime_query_interrupt,
                                  join_loops,
                                  co.device_type,
                                  group_by_and_aggregate.query_infos_);
    }
  }
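  // When literals are hoisted, the query function loads each literal from the
  // literal buffer once per kernel and forwards the values into the row
  // function as arguments (see inlineHoistedLiterals() above).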
  std::vector<llvm::Value*> hoisted_literals;

  if (co.hoist_literals) {
    VLOG(1) << "number of hoisted literals: "
            << cgen_state_->query_func_literal_loads_.size()
            << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0);
  }

  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
    // Replace the row function with a new function with hoisted literals.
    hoisted_literals = inlineHoistedLiterals();
  }

  // Replace the placeholder row-function call in the query template with the
  // real call: original arguments, then column heads, the join hash tables,
  // and any hoisted literals.
  std::vector<llvm::Value*> row_func_args;
  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumOperands() - 1; ++i) {
    row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
  }
  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
  row_func_args.insert(
      row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
  llvm::ReplaceInstWithInst(
      cgen_state_->row_func_call_,
      llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));
  // Same replacement for the filter-function call, when present.
  if (cgen_state_->filter_func_) {
    std::vector<llvm::Value*> filter_func_args;
    for (auto arg_it = cgen_state_->filter_func_args_.begin();
         arg_it != cgen_state_->filter_func_args_.end();
         ++arg_it) {
      filter_func_args.push_back(arg_it->first);
    }
    llvm::ReplaceInstWithInst(
        cgen_state_->filter_func_call_,
        llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
  }
  plan_state_->init_agg_vals_ =
      init_agg_val_vec(/* ... target infos and query memory descriptor ... */);
  // ...
  if (gpu_smem_context.isSharedMemoryUsed()) {
    // GPU shared memory optimization: build the init / reduction helpers and
    // inject them into the query function.
    GpuSharedMemCodeBuilder gpu_smem_code(
        cgen_state_->module_,
        cgen_state_->context_,
        /* ... */
        plan_state_->init_agg_vals_,
        /* ... */);
    gpu_smem_code.codegen();
    gpu_smem_code.injectFunctionsInto(query_func);

    // The helper functions participate in code caching.
    cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
    cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
    LOG(IR) << gpu_smem_code.toString();
  }
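  // The multifrag_query kernel entry point calls the query_stub placeholder
  // once per fragment; bind_query() rewires that stub to the generated query
  // function.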
  auto multifrag_query_func = cgen_state_->module_->getFunction(
      "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
  CHECK(multifrag_query_func);

  // ... (guarded: only when runtime interrupts are in play)
  insertErrorCodeChecker(
      multifrag_query_func, co.hoist_literals, eo.allow_runtime_query_interrupt);

  bind_query(query_func,
             "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
             multifrag_query_func,
             cgen_state_->module_);

  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
  if (cgen_state_->filter_func_) {
    root_funcs.push_back(cgen_state_->filter_func_);
  }
  auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
      *cgen_state_->module_, root_funcs, {multifrag_query_func});
  // ... (verification of the generated functions; device_str names the
  // target device in the log messages below)
  if (cgen_state_->filter_func_) {
    // ...
  }
  std::string llvm_ir =
      /* ... serialized query / row / filter functions for EXPLAIN ... */;
  VLOG(3) << "Unoptimized IR for the " << device_str << "\n"
          << llvm_ir << "\nEnd of IR";
  // "Explain optimized" runs the optimizer first, which is incompatible with
  // JIT debug symbols:
#ifdef WITH_JIT_DEBUG
  throw std::runtime_error(
      "Explain optimized not available when JIT runtime debug symbols are enabled");
#else
  llvm::legacy::PassManager pass_manager;
  optimize_ir(query_func,
              cgen_state_->module_,
              pass_manager,
              live_funcs,
              gpu_smem_context.isSharedMemoryUsed(),
              co);
#endif  // WITH_JIT_DEBUG
  // ...
  LOG(IR) << "IR for the " << device_str;
  // ...
  AutoTrackBuffersInRuntimeIR();
  // ...
  if (cgen_state_->filter_func_) {
    // ...
  }
  return std::make_tuple(
      CompilationResult{
          co.device_type == ExecutorDeviceType::CPU
              ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
              : optimizeAndCodegenGPU(query_func,
                                      multifrag_query_func,
                                      live_funcs,
                                      is_group_by || ra_exe_unit.estimator,
                                      cuda_mgr,
                                      gpu_smem_context.isSharedMemoryUsed(),
                                      co),
          cgen_state_->getLiterals(),
          output_columnar,
          llvm_ir,
          std::move(gpu_smem_context)},
      std::move(query_mem_desc));
}
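// insertErrorCodeChecker: post-processes the multifrag query function so the
// per-fragment query_stub call is followed by an error (and, optionally,
// interrupt) check, mirroring createErrorCheckControlFlow inside the query
// function itself.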
void Executor::insertErrorCodeChecker(llvm::Function* query_func,
                                      bool hoist_literals,
                                      bool allow_runtime_query_interrupt) {
  auto query_stub_func_name =
      "query_stub" + std::string(hoist_literals ? "_hoisted_literals" : "");
  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
    for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
      if (!llvm::isa<llvm::CallInst>(*inst_it)) {
        continue;
      }
      auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
      if (std::string(row_func_call.getCalledFunction()->getName()) ==
          query_stub_func_name) {
        // Split after the stub call so the error check runs on its result.
        auto next_inst_it = inst_it;
        ++next_inst_it;
        auto new_bb = bb_it->splitBasicBlock(next_inst_it);
        auto& br_instr = bb_it->back();
        llvm::IRBuilder<> ir_builder(&br_instr);
        llvm::Value* err_lv = &*inst_it;
        auto error_check_bb =
            bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr),
                                   ".error_check");
        // Locate the "error_code" argument by position; the index differs
        // between the hoisted-literals and plain signatures (the exact
        // indices are elided in this listing).
        llvm::Value* error_code_arg = nullptr;
        int arg_cnt = 0;
        for (auto arg_it = query_func->arg_begin(); arg_it != query_func->arg_end();
             arg_it++, ++arg_cnt) {
          if (hoist_literals) {
            if (/* arg_cnt matches the hoisted-literals position */) {
              error_code_arg = &*arg_it;
              break;
            }
          } else {
            if (/* arg_cnt matches the plain position */) {
              error_code_arg = &*arg_it;
              break;
            }
          }
        }
        CHECK(error_code_arg);
        llvm::Value* err_code = nullptr;
        if (allow_runtime_query_interrupt) {
          // Merge the interrupt status with the error code from the stub.
          auto& check_interrupt_br_instr = bb_it->back();
          auto interrupt_check_bb = llvm::BasicBlock::Create(
              cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
          llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
          auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
              cgen_state_->module_->getFunction("check_interrupt"), {});
          auto detected_error = interrupt_checker_ir_builder.CreateCall(
              cgen_state_->module_->getFunction("get_error_code"),
              std::vector<llvm::Value*>{error_code_arg});
          err_code = interrupt_checker_ir_builder.CreateSelect(
              detected_interrupt,
              cgen_state_->llInt(Executor::ERR_INTERRUPTED),
              detected_error);
          interrupt_checker_ir_builder.CreateBr(error_check_bb);
          llvm::ReplaceInstWithInst(&check_interrupt_br_instr,
                                    llvm::BranchInst::Create(interrupt_check_bb));
          ir_builder.SetInsertPoint(&br_instr);
        } else {
          // Interrupts disabled: only the error code is checked.
          ir_builder.SetInsertPoint(&br_instr);
          err_code =
              ir_builder.CreateCall(cgen_state_->module_->getFunction("get_error_code"),
                                    std::vector<llvm::Value*>{error_code_arg});
        }
        err_lv = ir_builder.CreateICmp(
            llvm::ICmpInst::ICMP_NE, err_code, cgen_state_->llInt(0));
        auto error_bb = llvm::BasicBlock::Create(
            cgen_state_->context_, ".error_exit", query_func, new_bb);
        llvm::CallInst::Create(cgen_state_->module_->getFunction("record_error_code"),
                               std::vector<llvm::Value*>{err_code, error_code_arg},
                               "",
                               error_bb);
        llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
        llvm::ReplaceInstWithInst(&br_instr,
                                  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
        break;
      }
    }
  }
}
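// codegenSkipDeletedOuterTableRow: when the outer table carries a delete
// column, emit an early return before any per-row work for rows whose delete
// flag is set.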
llvm::BasicBlock* Executor::codegenSkipDeletedOuterTableRow(
    const RelAlgExecutionUnit& ra_exe_unit,
    const CompilationOptions& co) {
  // ...
  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
  // ...
  const auto& table_key = outer_input_desc.getTableKey();
  const auto deleted_cd = plan_state_->getDeletedColForTable(table_key);
  if (!deleted_cd) {
    return nullptr;
  }
  CHECK(deleted_cd->columnType.is_boolean());
  const auto deleted_expr =
      makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
                                    /* ... column key elided ... */,
                                    outer_input_desc.getNestLevel());
  CodeGenerator code_generator(this);
  const auto is_deleted =
      code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
  const auto is_deleted_bb = llvm::BasicBlock::Create(
      cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
      cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
  cgen_state_->ir_builder_.SetInsertPoint(bb);
  return bb;
}
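// compileBody: generates the per-row filter and aggregation logic. With the
// filter function enabled, qualifier evaluation runs inside filter_func_ and
// the row function keeps only the fetch code plus a call into it.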
bool Executor::compileBody(const RelAlgExecutionUnit& ra_exe_unit,
                           GroupByAndAggregate& group_by_and_aggregate,
                           QueryMemoryDescriptor& query_mem_desc,
                           const CompilationOptions& co,
                           const GpuSharedMemoryContext& gpu_smem_context) {
  // ...
  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();

  llvm::Value* loop_done{nullptr};
  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
  if (cgen_state_->filter_func_) {
    if (cgen_state_->row_func_bb_->getName() == "loop_body") {
      auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
      cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
                                              row_func_entry_bb->begin());
      loop_done = cgen_state_->ir_builder_.CreateAlloca(
          get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
      cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
      cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
    }
    cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
    cgen_state_->current_func_ = cgen_state_->filter_func_;
    fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
  }
  // Split the qualifiers into cheap "primary" conjuncts and deferred
  // short-circuited ones.
  std::vector<Analyzer::Expr*> primary_quals;
  std::vector<Analyzer::Expr*> deferred_quals;
  const bool short_circuited = CodeGenerator::prioritizeQuals(
      ra_exe_unit, primary_quals, deferred_quals, plan_state_->hoisted_filters_);
  if (short_circuited) {
    VLOG(1) << "short-circuited and deferred " << std::to_string(deferred_quals.size())
            << " quals";
  }

  llvm::Value* filter_lv = cgen_state_->llBool(true);
  CodeGenerator code_generator(this);
  for (auto expr : primary_quals) {
    // Generate the filter for the primary quals.
    auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
    filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
  }
  CHECK(filter_lv->getType()->isIntegerTy(1));
  llvm::BasicBlock* sc_false{nullptr};
  if (!deferred_quals.empty()) {
    auto sc_true = llvm::BasicBlock::Create(
        cgen_state_->context_, "sc_true", cgen_state_->current_func_);
    sc_false = llvm::BasicBlock::Create(
        cgen_state_->context_, "sc_false", cgen_state_->current_func_);
    cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
    cgen_state_->ir_builder_.SetInsertPoint(sc_false);
    if (ra_exe_unit.join_quals.empty()) {
      cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
    }
    cgen_state_->ir_builder_.SetInsertPoint(sc_true);
    filter_lv = cgen_state_->llBool(true);
  }
  for (auto expr : deferred_quals) {
    filter_lv = cgen_state_->ir_builder_.CreateAnd(
        filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
  }

  CHECK(filter_lv->getType()->isIntegerTy(1));
  auto ret = group_by_and_aggregate.codegen(
      filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);
  if (cgen_state_->filter_func_) {
    if (cgen_state_->row_func_bb_->getName() == "loop_body") {
      cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
      cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
    }

    cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
    cgen_state_->current_func_ = cgen_state_->row_func_;
    cgen_state_->filter_func_call_ =
        cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});
    // ...
    redeclareFilterFunction();

    if (cgen_state_->row_func_bb_->getName() == "loop_body") {
      auto loop_done_true = llvm::BasicBlock::Create(
          cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
      auto loop_done_false = llvm::BasicBlock::Create(
          cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
      auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(
          loop_done->getType()->getPointerElementType(), loop_done);
      cgen_state_->ir_builder_.CreateCondBr(
          loop_done_flag, loop_done_true, loop_done_false);
      cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
      cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
      cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
      cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
    } else {
      cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
    }
  }
  // ...
  return ret;
}
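// generate_column_heads_load: emits one load per input column from the
// byte_stream argument, so the row function can address each column's start
// directly instead of through double indirection on every row.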
std::vector<llvm::Value*> generate_column_heads_load(const int num_columns,
                                                     llvm::Value* byte_stream_arg,
                                                     llvm::IRBuilder<>& ir_builder,
                                                     llvm::LLVMContext& ctx) {
  CHECK(byte_stream_arg);
  const auto max_col_local_id = num_columns - 1;

  std::vector<llvm::Value*> col_heads;
  for (int col_id = 0; col_id <= max_col_local_id; ++col_id) {
    auto* gep = ir_builder.CreateGEP(
        byte_stream_arg->getType()->getScalarType()->getPointerElementType(),
        byte_stream_arg,
        llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), col_id));
    col_heads.emplace_back(
        ir_builder.CreateLoad(gep->getType()->getPointerElementType(), gep));
  }
  return col_heads;
}