#if LLVM_VERSION_MAJOR < 9
static_assert(false, "LLVM Version >= 9 is required.");
#endif
#include <llvm/Analysis/ScopedNoAliasAA.h>
#include <llvm/Analysis/TypeBasedAliasAnalysis.h>
#include <llvm/Bitcode/BitcodeReader.h>
#include <llvm/Bitcode/BitcodeWriter.h>
#include <llvm/ExecutionEngine/MCJIT.h>
#include <llvm/IR/Attributes.h>
#include <llvm/IR/GlobalValue.h>
#include <llvm/IR/InstIterator.h>
#include <llvm/IR/IntrinsicInst.h>
#include <llvm/IR/Intrinsics.h>
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/IR/Verifier.h>
#include <llvm/IRReader/IRReader.h>
#if 14 <= LLVM_VERSION_MAJOR
#include <llvm/MC/TargetRegistry.h>
#else
#include <llvm/Support/TargetRegistry.h>
#endif
#include <llvm/Support/Casting.h>
#include <llvm/Support/FileSystem.h>
#include <llvm/Support/FormattedStream.h>
#include <llvm/Support/MemoryBuffer.h>
#include <llvm/Support/SourceMgr.h>
#include <llvm/Support/TargetSelect.h>
#include <llvm/Support/raw_os_ostream.h>
#include <llvm/Support/raw_ostream.h>
#include <llvm/Transforms/IPO.h>
#include <llvm/Transforms/IPO/AlwaysInliner.h>
#include <llvm/Transforms/IPO/InferFunctionAttrs.h>
#include <llvm/Transforms/IPO/PassManagerBuilder.h>
#include <llvm/Transforms/InstCombine/InstCombine.h>
#include <llvm/Transforms/Instrumentation.h>
#include <llvm/Transforms/Scalar.h>
#include <llvm/Transforms/Scalar/GVN.h>
#include <llvm/Transforms/Scalar/InstSimplifyPass.h>
#include <llvm/Transforms/Utils.h>
#include <llvm/Transforms/Utils/BasicBlockUtils.h>
#include <llvm/Transforms/Utils/Cloning.h>
#if LLVM_VERSION_MAJOR >= 11
#include <llvm/Support/Host.h>
#endif
#include <llvm/Support/DynamicLibrary.h>
#ifndef GEOS_LIBRARY_FILENAME
#error Configuration should include GEOS library file name
#endif
std::unique_ptr<std::string> g_libgeos_so_filename(
    new std::string(GEOS_LIBRARY_FILENAME));
static llvm::sys::DynamicLibrary geos_dynamic_library;
static std::mutex geos_init_mutex;
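
// Resolve the GEOS shared library lazily at query time. The mutex makes
// concurrent first-time loads safe, and a misconfigured filename falls back to
// the conventional "libgeos_c.so" name before giving up.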
void load_geos_dynamic_library() {
  std::lock_guard<std::mutex> guard(geos_init_mutex);

  if (!geos_dynamic_library.isValid()) {
    if (!g_libgeos_so_filename || g_libgeos_so_filename->empty()) {
      LOG(WARNING) << "Misconfigured GEOS library file name, trying 'libgeos_c.so'";
      g_libgeos_so_filename.reset(new std::string("libgeos_c.so"));
    }
    auto filename = *g_libgeos_so_filename;
    std::string error_message;
    geos_dynamic_library =
        llvm::sys::DynamicLibrary::getPermanentLibrary(filename.c_str(), &error_message);
    if (!geos_dynamic_library.isValid()) {
      std::string exception_message = "Failed to load GEOS library: " + error_message;
      throw std::runtime_error(exception_message.c_str());
    }
  }
}
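
// Helper used by the IR readers below: formats an llvm::SMDiagnostic into an
// "LLVM IR ParseError"/"NVVM IR ParseError" message and raises it as an
// exception.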
void throw_parseIR_error(const llvm::SMDiagnostic& parse_error,
                         std::string src = "",
                         const bool is_gpu = false) {
  std::string excname = (is_gpu ? "NVVM IR ParseError: " : "LLVM IR ParseError: ");
  llvm::raw_string_ostream ss(excname);
  parse_error.print(src.c_str(), ss, false, false);
  throw ParseIRError(ss.str());
}
#define SHOW_DEFINED(MODULE)                                         \
  {                                                                  \
    std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
    ::show_defined(MODULE);                                          \
  }

#define SHOW_FUNCTIONS(MODULE)                                       \
  {                                                                  \
    std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
    ::show_functions(MODULE);                                        \
  }
template <typename T = void>
void show_defined(llvm::Module& llvm_module) {
  std::cout << "defines: ";
  for (auto& f : llvm_module.getFunctionList()) {
    if (!f.isDeclaration()) {
      std::cout << f.getName().str() << ", ";
    }
  }
  std::cout << std::endl;
}
template <typename T = void>
void show_defined(llvm::Module* llvm_module) {
  if (llvm_module == nullptr) {
    std::cout << "is null" << std::endl;
  } else {
    show_defined(*llvm_module);
  }
}
template <typename T = void>
void show_defined(std::unique_ptr<llvm::Module>& llvm_module) {
  // smart-pointer convenience overload forwarding to the raw-pointer version
  show_defined(llvm_module.get());
}
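
// The scan_function_calls family walks call instructions transitively from a
// function (or a whole module) and partitions callee names into "defined" and
// "undefined", skipping "__"-prefixed internals, "llvm." intrinsics, and an
// explicit ignore list.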
template <typename T = void>
void scan_function_calls(llvm::Function& F,
                         std::unordered_set<std::string>& defined,
                         std::unordered_set<std::string>& undefined,
                         const std::unordered_set<std::string>& ignored) {
  for (llvm::inst_iterator I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
    if (auto* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
      auto* F2 = CI->getCalledFunction();
      if (F2 == nullptr) {
        continue;
      }
      auto F2name = F2->getName().str();
      if (F2->isDeclaration()) {
        if (F2name.rfind("__", 0) != 0        // not an internal function
            && F2name.rfind("llvm.", 0) != 0  // not an LLVM intrinsic
            && ignored.find(F2name) == ignored.end()) {
          undefined.emplace(F2name);
        }
      } else {
        if (defined.find(F2name) == defined.end()) {
          defined.emplace(F2name);
          scan_function_calls<T>(*F2, defined, undefined, ignored);
        }
      }
    }
  }
}
template <typename T = void>
void scan_function_calls(llvm::Module& llvm_module,
                         std::unordered_set<std::string>& defined,
                         std::unordered_set<std::string>& undefined,
                         const std::unordered_set<std::string>& ignored) {
  for (auto& F : llvm_module) {
    if (!F.isDeclaration()) {
      scan_function_calls(F, defined, undefined, ignored);
    }
  }
}
template <typename T = void>
std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>>
scan_function_calls(llvm::Module& llvm_module,
                    const std::unordered_set<std::string>& ignored = {}) {
  std::unordered_set<std::string> defined, undefined;
  scan_function_calls(llvm_module, defined, undefined, ignored);
  return std::make_tuple(defined, undefined);
}
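
// Dead functions that only reference themselves (self-recursion) survive
// LLVM's usual dead-code elimination, so they are erased explicitly before
// JIT compilation.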
#if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
void eliminate_dead_self_recursive_funcs(
    llvm::Module& M,
    const std::unordered_set<llvm::Function*>& live_funcs) {
  std::vector<llvm::Function*> dead_funcs;
  for (auto& F : M) {
    bool bAlive = false;
    if (live_funcs.count(&F)) {
      continue;
    }
    for (auto U : F.users()) {
      auto* C = llvm::dyn_cast<const llvm::CallInst>(U);
      if (!C || C->getParent()->getParent() != &F) {
        bAlive = true;
        break;
      }
    }
    if (!bAlive) {
      dead_funcs.push_back(&F);
    }
  }
  for (auto pFn : dead_funcs) {
    pFn->eraseFromParent();
  }
}
#endif
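
// libdevice provides implementations for "__nv_"-prefixed math builtins; a
// module needs the libdevice link step only if it references such a symbol.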
bool check_module_requires_libdevice(llvm::Module* llvm_module) {
  for (llvm::Function& F : *llvm_module) {
    if (F.hasName() && F.getName().startswith("__nv_")) {
      LOG(INFO) << "Module requires linking with libdevice: "
                << std::string(F.getName());
      return true;
    }
  }
  LOG(DEBUG1) << "module does not require linking against libdevice";
  return false;
}
void add_intrinsics_to_module(llvm::Module* llvm_module) {
  for (llvm::Function& F : *llvm_module) {
    for (llvm::Instruction& I : instructions(F)) {
      if (llvm::IntrinsicInst* ii = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
        if (llvm::Intrinsic::isOverloaded(ii->getIntrinsicID())) {
          llvm::Type* Tys[] = {ii->getFunctionType()->getReturnType()};
          llvm::Function& decl_fn =
              *llvm::Intrinsic::getDeclaration(llvm_module, ii->getIntrinsicID(), Tys);
          ii->setCalledFunction(&decl_fn);
        } else {
          llvm::Intrinsic::getDeclaration(llvm_module, ii->getIntrinsicID());
        }
      }
    }
  }
}
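
// The optimization pipeline applied to generated query IR before JIT or PTX
// emission. The pipeline is deliberately conservative when GPU shared memory
// is in use (see the is_gpu_smem_used guard below).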
void optimize_ir(llvm::Function* query_func,
                 llvm::Module* llvm_module,
                 llvm::legacy::PassManager& pass_manager,
                 const std::unordered_set<llvm::Function*>& live_funcs,
                 const bool is_gpu_smem_used,
                 const CompilationOptions& co) {
  pass_manager.add(llvm::createVerifierPass());
  pass_manager.add(llvm::createAlwaysInlinerLegacyPass());

  pass_manager.add(llvm::createSROAPass());
  pass_manager.add(llvm::createEarlyCSEPass(/*UseMemorySSA=*/true));

  if (!is_gpu_smem_used) {
    // thread jumps can change the execution order around shared-memory
    // sections guarded by __syncthreads(), which results in race conditions;
    // skip these passes for shared-memory queries
    pass_manager.add(llvm::createJumpThreadingPass());
    pass_manager.add(llvm::createCFGSimplificationPass());
  }
  pass_manager.add(llvm::createNewGVNPass());

  pass_manager.add(llvm::createDeadStoreEliminationPass());
  pass_manager.add(llvm::createLICMPass());

  pass_manager.add(llvm::createInstructionCombiningPass());

  pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
  pass_manager.add(llvm::createGlobalOptimizerPass());

  pass_manager.add(llvm::createCFGSimplificationPass());  // cleanup after everything

  pass_manager.run(*llvm_module);

  eliminate_dead_self_recursive_funcs(*llvm_module, live_funcs);
}
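
// RAII wrapper around llvm::ExecutionEngine; optionally registers the Intel
// JIT event listener so profilers such as VTune can resolve JITed frames.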
ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine)
    : execution_engine_(execution_engine) {}

ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine,
                                               const CompilationOptions& co)
    : execution_engine_(execution_engine) {
  if (execution_engine_) {
    if (co.register_intel_jit_listener) {
#ifdef ENABLE_INTEL_JIT_LISTENER
      intel_jit_listener_.reset(llvm::JITEventListener::createIntelJITEventListener());
      CHECK(intel_jit_listener_);
      execution_engine_->RegisterJITEventListener(intel_jit_listener_.get());
      LOG(INFO) << "Registered IntelJITEventListener";
#else
      LOG(WARNING) << "This build is not Intel JIT Listener enabled. Ignoring Intel JIT "
                      "listener configuration parameter.";
#endif  // ENABLE_INTEL_JIT_LISTENER
    }
  }
}
ExecutionEngineWrapper& ExecutionEngineWrapper::operator=(
    llvm::ExecutionEngine* execution_engine) {
  execution_engine_.reset(execution_engine);
  intel_jit_listener_ = nullptr;
  return *this;
}

void verify_function_ir(const llvm::Function* func) {
  std::stringstream err_ss;
  llvm::raw_os_ostream err_os(err_ss);
  err_os << "\n-----\n";
  if (llvm::verifyFunction(*func, &err_os)) {
    err_os << "\n-----\n";
    func->print(err_os, nullptr);
    err_os << "\n-----\n";
    LOG(FATAL) << err_ss.str();
  }
}
std::string assemblyForCPU(ExecutionEngineWrapper& execution_engine,
                           llvm::Module* llvm_module) {
  llvm::legacy::PassManager pass_manager;
  auto cpu_target_machine = execution_engine->getTargetMachine();
  CHECK(cpu_target_machine);
  llvm::SmallString<256> code_str;
  llvm::raw_svector_ostream os(code_str);
#if LLVM_VERSION_MAJOR >= 10
  cpu_target_machine->addPassesToEmitFile(
      pass_manager, os, nullptr, llvm::CGFT_AssemblyFile);
#else
  cpu_target_machine->addPassesToEmitFile(
      pass_manager, os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
#endif
  pass_manager.run(*llvm_module);
  return "Assembly for the CPU:\n" + std::string(code_str.str()) + "\nEnd of assembly";
}
ExecutionEngineWrapper create_execution_engine(llvm::Module* llvm_module,
                                               llvm::EngineBuilder& eb,
                                               const CompilationOptions& co) {
  ExecutionEngineWrapper execution_engine(eb.create(), co);
  CHECK(execution_engine.get());
  // the module's data layout must match the engine's target machine
  llvm_module->setDataLayout(execution_engine->getDataLayout());
  execution_engine->finalizeObject();
  return execution_engine;
}
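
// CPU path: optimize the module, initialize the native target, and hand the
// module to an MCJIT EngineBuilder with fast instruction selection enabled.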
ExecutionEngineWrapper CodeGenerator::generateNativeCPUCode(
    llvm::Function* func,
    const std::unordered_set<llvm::Function*>& live_funcs,
    const CompilationOptions& co) {
  llvm::Module* llvm_module = func->getParent();
  // run optimizations
#ifndef WITH_JIT_DEBUG
  llvm::legacy::PassManager pass_manager;
  optimize_ir(
      func, llvm_module, pass_manager, live_funcs, /*is_gpu_smem_used=*/false, co);
#endif  // WITH_JIT_DEBUG

  auto init_err = llvm::InitializeNativeTarget();
  CHECK(!init_err);

  llvm::InitializeAllTargetMCs();
  llvm::InitializeNativeTargetAsmPrinter();
  llvm::InitializeNativeTargetAsmParser();

  std::string err_str;
  std::unique_ptr<llvm::Module> owner(llvm_module);
  llvm::EngineBuilder eb(std::move(owner));
  eb.setErrorStr(&err_str);
  eb.setEngineKind(llvm::EngineKind::JIT);
  llvm::TargetOptions to;
  to.EnableFastISel = true;
  eb.setTargetOptions(to);

  return create_execution_engine(llvm_module, eb, co);
}
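
// Entry point for CPU compilation of a full query: links GEOS if needed, JITs
// the module, and wraps the result in a CpuCompilationContext.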
std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenCPU(
    llvm::Function* query_func,
    llvm::Function* multifrag_query_func,
    const std::unordered_set<llvm::Function*>& live_funcs,
    const CompilationOptions& co) {
  CodeCacheKey key{serialize_llvm_object(query_func),
                   serialize_llvm_object(cgen_state_->row_func_)};

  llvm::Module* M = query_func->getParent();
  auto* flag = llvm::mdconst::extract_or_null<llvm::ConstantInt>(
      M->getModuleFlag("manage_memory_buffer"));
  if (flag and flag->getZExtValue() == 1 and M->getFunction("allocate_varlen_buffer") and
      M->getFunction("register_buffer_with_executor_rsm")) {
    LOG(INFO) << "including executor addr to cache key\n";
    key.push_back(std::to_string(reinterpret_cast<int64_t>(this)));
  }
  if (cgen_state_->filter_func_) {
    key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
  }
  for (const auto helper : cgen_state_->helper_functions_) {
    key.push_back(serialize_llvm_object(helper));
  }

  if (cgen_state_->needs_geos_) {
#ifdef ENABLE_GEOS
    auto llvm_module = multifrag_query_func->getParent();
    load_geos_dynamic_library();

    // load the GEOS runtime module and link it with the working module
    auto rt_geos_module_copy = llvm::CloneModule(
        *get_geos_module(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
          auto func = llvm::dyn_cast<llvm::Function>(gv);
          if (!func) {
            return true;
          }
          return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
                  func->getLinkage() ==
                      llvm::GlobalValue::LinkageTypes::InternalLinkage ||
                  func->getLinkage() == llvm::GlobalValue::LinkageTypes::ExternalLinkage);
        });
    CodeGenerator::link_udf_module(rt_geos_module_copy,
                                   *llvm_module,
                                   cgen_state_.get(),
                                   llvm::Linker::Flags::LinkOnlyNeeded);
#else
    throw std::runtime_error("GEOS is disabled in this build");
#endif
  }

  auto execution_engine =
      CodeGenerator::generateNativeCPUCode(query_func, live_funcs, co);
  auto cpu_compilation_context =
      std::make_shared<CpuCompilationContext>(std::move(execution_engine));
  cpu_compilation_context->setFunctionPointer(multifrag_query_func);
  // the compiled context is also cached under `key` before returning
  return cpu_compilation_context;
}
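
// Clones a UDF module, rebases it on the target module's data layout and
// triple, and links it in; overwriting an existing runtime function is an
// error unless explicitly allowed by the linker flags.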
void CodeGenerator::link_udf_module(const std::unique_ptr<llvm::Module>& udf_module,
                                    llvm::Module& llvm_module,
                                    CgenState* cgen_state,
                                    llvm::Linker::Flags flags) {
  // throw a runtime error if the target module contains functions
  // with the same name as in the module of UDF functions
  for (auto& f : *udf_module) {
    auto func = llvm_module.getFunction(f.getName());
    if (func && !f.isDeclaration() && flags == llvm::Linker::Flags::None) {
      LOG(ERROR) << " Attempt to overwrite " << f.getName().str() << " in "
                 << llvm_module.getModuleIdentifier() << " from `"
                 << udf_module->getModuleIdentifier() << "`" << std::endl;
      throw std::runtime_error(
          "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
          "function ***");
    } else {
      VLOG(1) << " Adding " << f.getName().str() << " to "
              << llvm_module.getModuleIdentifier() << " from `"
              << udf_module->getModuleIdentifier() << "`" << std::endl;
    }
  }

  auto udf_module_copy = llvm::CloneModule(*udf_module, cgen_state->vmap_);

  udf_module_copy->setDataLayout(llvm_module.getDataLayout());
  udf_module_copy->setTargetTriple(llvm_module.getTargetTriple());

  // link the cloned UDF module into the target module
  llvm::Linker ld(llvm_module);
  bool link_error = false;

  link_error = ld.linkInModule(std::move(udf_module_copy), flags);

  if (link_error) {
    throw std::runtime_error("link_udf_module: *** error linking module ***");
  }
}
std::string cpp_to_llvm_name(const std::string& s) {
  if (s == "int8_t") {
    return "i8";
  }
  if (s == "int16_t") {
    return "i16";
  }
  if (s == "int32_t") {
    return "i32";
  }
  if (s == "int64_t") {
    return "i64";
  }
  CHECK(s == "float" || s == "double");
  return s;
}
std::string gen_array_any_all_sigs() {
  std::string result;
  for (const std::string any_or_all : {"any", "all"}) {
    for (const std::string elem_type :
         {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
      for (const std::string needle_type :
           {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
        for (const std::string op_name : {"eq", "ne", "lt", "le", "gt", "ge"}) {
          result += ("declare i1 @array_" + any_or_all + "_" + op_name + "_" + elem_type +
                     "_" + needle_type + "(i8*, i64, " + cpp_to_llvm_name(needle_type) +
                     ", " + cpp_to_llvm_name(elem_type) + ");\n");
        }
      }
    }
  }
  return result;
}
std::string gen_translate_null_key_sigs() {
  std::string result;
  for (const std::string key_type : {"int8_t", "int16_t", "int32_t", "int64_t"}) {
    const auto key_llvm_type = cpp_to_llvm_name(key_type);
    result += "declare i64 @translate_null_key_" + key_type + "(" + key_llvm_type +
              ", " + key_llvm_type + ", i64);\n";
  }
  return result;
}
// Runtime function declarations appended to the NVVM IR so the generated GPU
// module can call into the pre-compiled runtime (see generateNativeGPUCode).
const std::string cuda_rt_decls = R"(
declare void @llvm.dbg.declare(metadata, metadata, metadata)
declare void @llvm.dbg.value(metadata, metadata, metadata)
declare double @llvm.fmuladd.f64(double, double, double)
declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
declare i64 @get_thread_index();
declare i64 @get_block_index();
declare i32 @pos_start_impl(i32*);
declare i32 @group_buff_idx_impl();
declare i32 @pos_step_impl();
declare i8 @thread_warp_idx(i8);
declare i64* @init_shared_mem(i64*, i32);
declare i64* @init_shared_mem_nop(i64*, i32);
declare i64* @declare_dynamic_shared_memory();
declare void @write_back_nop(i64*, i64*, i32);
declare void @write_back_non_grouped_agg(i64*, i64*, i32);
declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8);
declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32);
declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32);
declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32);
declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32);
declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32);
declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32);
declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64);
declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64);
declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64);
declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64);
declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64);
declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double);
declare i64 @get_bucket_key_for_range_double(i8*, i64, double);
declare i32 @get_num_buckets_for_bounds(i8*, i32, double, double);
declare i64 @get_candidate_rows(i32*, i32, i8*, i32, double, double, i32, i64, i64*, i64, i64, i64);
declare i64 @agg_count_shared(i64*, i64);
declare i64 @agg_count_skip_val_shared(i64*, i64, i64);
declare i32 @agg_count_int32_shared(i32*, i32);
declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32);
declare i64 @agg_count_double_shared(i64*, double);
declare i64 @agg_count_double_skip_val_shared(i64*, double, double);
declare i32 @agg_count_float_shared(i32*, float);
declare i32 @agg_count_float_skip_val_shared(i32*, float, float);
declare i64 @agg_sum_shared(i64*, i64);
declare i64 @agg_sum_skip_val_shared(i64*, i64, i64);
declare i32 @agg_sum_int32_shared(i32*, i32);
declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_sum_double_shared(i64*, double);
declare void @agg_sum_double_skip_val_shared(i64*, double, double);
declare void @agg_sum_float_shared(i32*, float);
declare void @agg_sum_float_skip_val_shared(i32*, float, float);
declare void @agg_max_shared(i64*, i64);
declare void @agg_max_skip_val_shared(i64*, i64, i64);
declare void @agg_max_int32_shared(i32*, i32);
declare void @agg_max_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_max_int16_shared(i16*, i16);
declare void @agg_max_int16_skip_val_shared(i16*, i16, i16);
declare void @agg_max_int8_shared(i8*, i8);
declare void @agg_max_int8_skip_val_shared(i8*, i8, i8);
declare void @agg_max_double_shared(i64*, double);
declare void @agg_max_double_skip_val_shared(i64*, double, double);
declare void @agg_max_float_shared(i32*, float);
declare void @agg_max_float_skip_val_shared(i32*, float, float);
declare void @agg_min_shared(i64*, i64);
declare void @agg_min_skip_val_shared(i64*, i64, i64);
declare void @agg_min_int32_shared(i32*, i32);
declare void @agg_min_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_min_int16_shared(i16*, i16);
declare void @agg_min_int16_skip_val_shared(i16*, i16, i16);
declare void @agg_min_int8_shared(i8*, i8);
declare void @agg_min_int8_skip_val_shared(i8*, i8, i8);
declare void @agg_min_double_shared(i64*, double);
declare void @agg_min_double_skip_val_shared(i64*, double, double);
declare void @agg_min_float_shared(i32*, float);
declare void @agg_min_float_skip_val_shared(i32*, float, float);
declare void @agg_id_shared(i64*, i64);
declare i8* @agg_id_varlen_shared(i8*, i64, i8*, i64);
declare void @agg_id_int32_shared(i32*, i32);
declare void @agg_id_int16_shared(i16*, i16);
declare void @agg_id_int8_shared(i8*, i8);
declare void @agg_id_double_shared(i64*, double);
declare void @agg_id_double_shared_slow(i64*, double*);
declare void @agg_id_float_shared(i32*, float);
declare i32 @checked_single_agg_id_shared(i64*, i64, i64);
declare i32 @checked_single_agg_id_double_shared(i64*, double, double);
declare i32 @checked_single_agg_id_double_shared_slow(i64*, double*, double);
declare i32 @checked_single_agg_id_float_shared(i32*, float, float);
declare i1 @slotEmptyKeyCAS(i64*, i64, i64);
declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32);
declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16);
declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8);
declare i64 @datetrunc_century(i64);
declare i64 @datetrunc_day(i64);
declare i64 @datetrunc_decade(i64);
declare i64 @datetrunc_hour(i64);
declare i64 @datetrunc_millennium(i64);
declare i64 @datetrunc_minute(i64);
declare i64 @datetrunc_month(i64);
declare i64 @datetrunc_quarter(i64);
declare i64 @datetrunc_quarterday(i64);
declare i64 @datetrunc_week_monday(i64);
declare i64 @datetrunc_week_sunday(i64);
declare i64 @datetrunc_week_saturday(i64);
declare i64 @datetrunc_year(i64);
declare i64 @extract_epoch(i64);
declare i64 @extract_dateepoch(i64);
declare i64 @extract_quarterday(i64);
declare i64 @extract_hour(i64);
declare i64 @extract_minute(i64);
declare i64 @extract_second(i64);
declare i64 @extract_millisecond(i64);
declare i64 @extract_microsecond(i64);
declare i64 @extract_nanosecond(i64);
declare i64 @extract_dow(i64);
declare i64 @extract_isodow(i64);
declare i64 @extract_day(i64);
declare i64 @extract_week_monday(i64);
declare i64 @extract_week_sunday(i64);
declare i64 @extract_week_saturday(i64);
declare i64 @extract_day_of_year(i64);
declare i64 @extract_month(i64);
declare i64 @extract_quarter(i64);
declare i64 @extract_year(i64);
declare i64 @ExtractTimeFromHPTimestamp(i64,i64);
declare i64 @ExtractTimeFromHPTimestampNullable(i64,i64,i64);
declare i64 @ExtractTimeFromLPTimestamp(i64);
declare i64 @ExtractTimeFromLPTimestampNullable(i64,i64);
declare i64 @DateTruncateHighPrecisionToDate(i64, i64);
declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64);
declare i64 @DateDiff(i32, i64, i64);
declare i64 @DateDiffNullable(i32, i64, i64, i64);
declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i32);
declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i32, i64);
declare i64 @DateAdd(i32, i64, i64);
declare i64 @DateAddNullable(i32, i64, i64, i64);
declare i64 @DateAddHighPrecision(i32, i64, i64, i32);
declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i32, i64);
declare i64 @string_decode(i8*, i64);
declare i32 @array_size(i8*, i64, i32);
declare i32 @array_size_nullable(i8*, i64, i32, i32);
declare i32 @fast_fixlen_array_size(i8*, i32);
declare i1 @array_is_null(i8*, i64);
declare i1 @point_coord_array_is_null(i8*, i64);
declare i8* @array_buff(i8*, i64);
declare i8* @fast_fixlen_array_buff(i8*, i64);
declare i8 @array_at_int8_t(i8*, i64, i32);
declare i16 @array_at_int16_t(i8*, i64, i32);
declare i32 @array_at_int32_t(i8*, i64, i32);
declare i64 @array_at_int64_t(i8*, i64, i32);
declare float @array_at_float(i8*, i64, i32);
declare double @array_at_double(i8*, i64, i32);
declare i8 @varlen_array_at_int8_t(i8*, i64, i32);
declare i16 @varlen_array_at_int16_t(i8*, i64, i32);
declare i32 @varlen_array_at_int32_t(i8*, i64, i32);
declare i64 @varlen_array_at_int64_t(i8*, i64, i32);
declare float @varlen_array_at_float(i8*, i64, i32);
declare double @varlen_array_at_double(i8*, i64, i32);
declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32);
declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32);
declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32);
declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32);
declare float @varlen_notnull_array_at_float(i8*, i64, i32);
declare double @varlen_notnull_array_at_double(i8*, i64, i32);
declare i8 @array_at_int8_t_checked(i8*, i64, i64, i8);
declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16);
declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32);
declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64);
declare float @array_at_float_checked(i8*, i64, i64, float);
declare double @array_at_double_checked(i8*, i64, i64, double);
declare i32 @char_length(i8*, i32);
declare i32 @char_length_nullable(i8*, i32, i32);
declare i32 @char_length_encoded(i8*, i32);
declare i32 @char_length_encoded_nullable(i8*, i32, i32);
declare i32 @key_for_string_encoded(i32);
declare i1 @sample_ratio(double, i64);
declare double @width_bucket(double, double, double, double, i32);
declare double @width_bucket_reverse(double, double, double, double, i32);
declare double @width_bucket_nullable(double, double, double, double, i32, double);
declare double @width_bucket_reversed_nullable(double, double, double, double, i32, double);
declare double @width_bucket_no_oob_check(double, double, double);
declare double @width_bucket_reverse_no_oob_check(double, double, double);
declare double @width_bucket_expr(double, i1, double, double, i32);
declare double @width_bucket_expr_nullable(double, i1, double, double, i32, double);
declare double @width_bucket_expr_no_oob_check(double, i1, double, double, i32);
declare i1 @string_like(i8*, i32, i8*, i32, i8);
declare i1 @string_ilike(i8*, i32, i8*, i32, i8);
declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8);
declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8);
declare i1 @string_like_simple(i8*, i32, i8*, i32);
declare i1 @string_ilike_simple(i8*, i32, i8*, i32);
declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8);
declare i1 @string_lt(i8*, i32, i8*, i32);
declare i1 @string_le(i8*, i32, i8*, i32);
declare i1 @string_gt(i8*, i32, i8*, i32);
declare i1 @string_ge(i8*, i32, i8*, i32);
declare i1 @string_eq(i8*, i32, i8*, i32);
declare i1 @string_ne(i8*, i32, i8*, i32);
declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8);
declare i1 @regexp_like(i8*, i32, i8*, i32, i8);
declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8);
declare void @linear_probabilistic_count(i8*, i32, i8*, i32);
declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, i64);
declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64);
declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64);
declare void @record_error_code(i32, i32*);
declare i32 @get_error_code(i32*);
declare i1 @dynamic_watchdog();
declare i1 @check_interrupt();
declare void @force_sync();
declare void @sync_warp();
declare void @sync_warp_protected(i64, i64);
declare void @sync_threadblock();
declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32);
declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64);
declare i64* @get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, float, float);
declare i64* @get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double);
declare double @decompress_x_coord_geoint(i32);
declare double @decompress_y_coord_geoint(i32);
declare i32 @compress_x_coord_geoint(double);
declare i32 @compress_y_coord_geoint(double);
)" + gen_array_any_all_sigs() +
    gen_translate_null_key_sigs();
std::string extension_function_decls(const std::unordered_set<std::string>& udf_decls) {
  const auto decls = ExtensionFunctionsWhitelist::getLLVMDeclarations(udf_decls);
  return boost::algorithm::join(decls, "\n");
}
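
// NVVM does not accept everything optimized LLVM IR may contain; strip the
// stack save/restore and lifetime markers the optimizer may have introduced.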
void legalize_nvvm_ir(llvm::Function* query_func) {
  std::vector<llvm::Instruction*> stackrestore_intrinsics;
  std::vector<llvm::Instruction*> stacksave_intrinsics;
  std::vector<llvm::Instruction*> lifetime;
  for (auto& BB : *query_func) {
    for (llvm::Instruction& I : BB) {
      if (const llvm::IntrinsicInst* II = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
        if (II->getIntrinsicID() == llvm::Intrinsic::stacksave) {
          stacksave_intrinsics.push_back(&I);
        } else if (II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
          stackrestore_intrinsics.push_back(&I);
        } else if (II->getIntrinsicID() == llvm::Intrinsic::lifetime_start ||
                   II->getIntrinsicID() == llvm::Intrinsic::lifetime_end) {
          lifetime.push_back(&I);
        }
      }
    }
  }

  // stackrestore uses the stacksave result as its argument,
  // so erase the stackrestore intrinsics first
  for (auto& II : stackrestore_intrinsics) {
    II->eraseFromParent();
  }
  for (auto& II : stacksave_intrinsics) {
    II->eraseFromParent();
  }
  for (auto& II : lifetime) {
    II->eraseFromParent();
  }
}
llvm::StringRef get_gpu_target_triple_string() {
  return llvm::StringRef("nvptx64-nvidia-cuda");
}

llvm::StringRef get_gpu_data_layout() {
  return llvm::StringRef(
      "e-p:64:64:64-i1:8:8-i8:8:8-"
      "i16:16:16-i32:32:32-i64:64:64-"
      "f32:32:32-f64:64:64-v16:16:16-"
      "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
}
std::map<std::string, std::string> get_device_parameters(bool cpu_only) {
  std::map<std::string, std::string> result;

  result.insert(std::make_pair("cpu_name", llvm::sys::getHostCPUName()));
  result.insert(std::make_pair("cpu_triple", llvm::sys::getProcessTriple()));
  result.insert(
      std::make_pair("cpu_cores", std::to_string(llvm::sys::getHostNumPhysicalCores())));
  std::string sizeof_types;
  sizeof_types += "ssize_t:" + std::to_string(sizeof(ssize_t)) + ";";
  sizeof_types += "uchar:" + std::to_string(sizeof(unsigned char)) + ";";
  sizeof_types += "ushort:" + std::to_string(sizeof(unsigned short int)) + ";";
  sizeof_types += "uint:" + std::to_string(sizeof(unsigned int)) + ";";
  sizeof_types += "ulong:" + std::to_string(sizeof(unsigned long int)) + ";";
  sizeof_types += "longlong:" + std::to_string(sizeof(long long int)) + ";";
  sizeof_types += "ulonglong:" + std::to_string(sizeof(unsigned long long int)) + ";";
  sizeof_types += "longdouble:" + std::to_string(sizeof(long double)) + ";";

  result.insert(std::make_pair("type_sizeof", sizeof_types));
  std::string null_values;
  null_values += "boolean1:" + std::to_string(serialized_null_value<bool>()) + ";";
  null_values += "boolean8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
  null_values += "int8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
  null_values += "int16:" + std::to_string(serialized_null_value<int16_t>()) + ";";
  null_values += "int32:" + std::to_string(serialized_null_value<int32_t>()) + ";";
  null_values += "int64:" + std::to_string(serialized_null_value<int64_t>()) + ";";
  null_values += "uint8:" + std::to_string(serialized_null_value<uint8_t>()) + ";";
  null_values += "uint16:" + std::to_string(serialized_null_value<uint16_t>()) + ";";
  null_values += "uint32:" + std::to_string(serialized_null_value<uint32_t>()) + ";";
  null_values += "uint64:" + std::to_string(serialized_null_value<uint64_t>()) + ";";
  null_values += "float32:" + std::to_string(serialized_null_value<float>()) + ";";
  null_values += "float64:" + std::to_string(serialized_null_value<double>()) + ";";
  null_values +=
      "Array<boolean8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
  null_values +=
      "Array<int8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
  null_values +=
      "Array<int16>:" + std::to_string(serialized_null_value<int16_t, true>()) + ";";
  null_values +=
      "Array<int32>:" + std::to_string(serialized_null_value<int32_t, true>()) + ";";
  null_values +=
      "Array<int64>:" + std::to_string(serialized_null_value<int64_t, true>()) + ";";
  null_values +=
      "Array<float32>:" + std::to_string(serialized_null_value<float, true>()) + ";";
  null_values +=
      "Array<float64>:" + std::to_string(serialized_null_value<double, true>()) + ";";

  result.insert(std::make_pair("null_values", null_values));
  llvm::StringMap<bool> cpu_features;
  if (llvm::sys::getHostCPUFeatures(cpu_features)) {
    std::string features_str = "";
    for (auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
      features_str += (it->getValue() ? " +" : " -");
      features_str += it->getKey().str();
    }
    result.insert(std::make_pair("cpu_features", features_str));
  }

  result.insert(std::make_pair("llvm_version",
                               std::to_string(LLVM_VERSION_MAJOR) + "." +
                                   std::to_string(LLVM_VERSION_MINOR) + "." +
                                   std::to_string(LLVM_VERSION_PATCH)));
#ifdef HAVE_CUDA
  int device_count = 0;
  checkCudaErrors(cuDeviceGetCount(&device_count));
  if (device_count) {
    CUdevice device{};
    char device_name[256];
    int major = 0, minor = 0;
    int driver_version = 0;
    checkCudaErrors(cuDeviceGet(&device, 0));  // report the first device
    checkCudaErrors(cuDeviceGetName(device_name, 256, device));
    checkCudaErrors(cuDeviceGetAttribute(
        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
    checkCudaErrors(cuDeviceGetAttribute(
        &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
    checkCudaErrors(cuDriverGetVersion(&driver_version));

    result.insert(std::make_pair("gpu_name", device_name));
    result.insert(std::make_pair("gpu_count", std::to_string(device_count)));
    result.insert(std::make_pair("gpu_compute_capability",
                                 std::to_string(major) + "." + std::to_string(minor)));
    // CUDA encodes the driver version as 1000 * major + 10 * minor
    result.insert(std::make_pair("gpu_driver",
                                 "CUDA " + std::to_string(driver_version / 1000) + "." +
                                     std::to_string((driver_version % 1000) / 10)));
  }
#endif  // HAVE_CUDA

  return result;
}
std::unordered_set<llvm::Function*> findAliveRuntimeFuncs(
    llvm::Module& llvm_module,
    const std::vector<llvm::Function*>& roots) {
  std::queue<llvm::Function*> queue;
  std::unordered_set<llvm::Function*> visited;
  for (llvm::Function* F : roots) {
    queue.push(F);
  }

  // breadth-first walk over the call graph starting at the roots
  while (!queue.empty()) {
    llvm::Function* F = queue.front();
    queue.pop();
    if (visited.find(F) != visited.end()) {
      continue;
    }
    visited.insert(F);

    for (llvm::inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
      if (llvm::CallInst* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
        if (CI->isInlineAsm()) {  // libdevice calls inline assembly code
          continue;
        }
        llvm::Function* called = CI->getCalledFunction();
        if (!called || visited.find(called) != visited.end()) {
          continue;
        }
        queue.push(called);
      }
    }
  }
  return visited;
}
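
// Links libdevice into the module, keeps only functions reachable from the
// module's own definitions, and enables flush-to-zero via the nvvm-reflect
// module flag so the NVVMReflect pass can fold the reflection conditionals.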
void linkModuleWithLibdevice(Executor* executor,
                             llvm::Module& llvm_module,
                             llvm::PassManagerBuilder& pass_manager_builder,
                             const GPUTarget& gpu_target) {
  if (!executor->has_libdevice_module()) {
    // raise error
    throw std::runtime_error(
        "libdevice library is not available but required by the UDF module");
  }

  // saves functions \in module
  std::vector<llvm::Function*> roots;
  for (llvm::Function& fn : llvm_module) {
    if (!fn.isDeclaration()) {
      roots.emplace_back(&fn);
    }
  }

  // bind libdevice to the current module
  CodeGenerator::link_udf_module(executor->get_libdevice_module(),
                                 llvm_module,
                                 gpu_target.cgen_state,
                                 llvm::Linker::Flags::OverrideFromSrc);

  std::unordered_set<llvm::Function*> live_funcs =
      findAliveRuntimeFuncs(llvm_module, roots);

  std::vector<llvm::Function*> funcs_to_delete;
  for (llvm::Function& fn : llvm_module) {
    if (!live_funcs.count(&fn)) {
      funcs_to_delete.emplace_back(&fn);
    }
  }

  for (llvm::Function* f : funcs_to_delete) {
    f->eraseFromParent();
  }

  // activate nvvm-reflect-ftz flag on the module
#if LLVM_VERSION_MAJOR >= 11
  llvm::LLVMContext& ctx = llvm_module.getContext();
  llvm_module.setModuleFlag(llvm::Module::Override,
                            "nvvm-reflect-ftz",
                            llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
                                llvm::Type::getInt32Ty(ctx), uint32_t(1))));
#else
  llvm_module.addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", uint32_t(1));
#endif
  for (llvm::Function& fn : llvm_module) {
    fn.addFnAttr("nvptx-f32ftz", "true");
  }

  // add the nvvm-reflect pass, replacing any NVVM conditionals with constants
  gpu_target.nvptx_target_machine->adjustPassManager(pass_manager_builder);
  llvm::legacy::FunctionPassManager FPM(&llvm_module);
  pass_manager_builder.populateFunctionPassManager(FPM);

  FPM.doInitialization();
  for (auto& F : llvm_module) {
    FPM.run(F);
  }
  FPM.doFinalization();
}
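
// GPU path: retarget the module to NVPTX, legalize and optimize the IR, mark
// the kernel entry point in nvvm.annotations, print the module to NVVM IR
// text, append the runtime declarations, and compile the resulting PTX to a
// cubin per device.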
std::shared_ptr<GpuCompilationContext> CodeGenerator::generateNativeGPUCode(
    Executor* executor,
    llvm::Function* func,
    llvm::Function* wrapper_func,
    const std::unordered_set<llvm::Function*>& live_funcs,
    const bool is_gpu_smem_used,
    const CompilationOptions& co,
    const GPUTarget& gpu_target) {
  auto llvm_module = func->getParent();

  CHECK(gpu_target.cgen_state->module_ == llvm_module);
  CHECK(func->getParent() == wrapper_func->getParent());
  llvm_module->setDataLayout(
      "e-p:64:64:64-i1:8:8-i8:8:8-"
      "i16:16:16-i32:32:32-i64:64:64-"
      "f32:32:32-f64:64:64-v16:16:16-"
      "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
  llvm_module->setTargetTriple("nvptx64-nvidia-cuda");
  CHECK(gpu_target.nvptx_target_machine);
  llvm::PassManagerBuilder pass_manager_builder = llvm::PassManagerBuilder();

  pass_manager_builder.OptLevel = 0;
  llvm::legacy::PassManager module_pass_manager;
  pass_manager_builder.populateModulePassManager(module_pass_manager);

  bool requires_libdevice = check_module_requires_libdevice(llvm_module);

  if (requires_libdevice) {
    linkModuleWithLibdevice(executor, *llvm_module, pass_manager_builder, gpu_target);
  }

  // run optimizations
  optimize_ir(func, llvm_module, module_pass_manager, live_funcs, is_gpu_smem_used, co);
  legalize_nvvm_ir(func);

  std::stringstream ss;
  llvm::raw_os_ostream os(ss);

  llvm::LLVMContext& ctx = llvm_module->getContext();
  // metadata needed by NVPTX to recognize the kernel entry point
  llvm::NamedMDNode* md = llvm_module->getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
                               llvm::MDString::get(ctx, "kernel"),
                               llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
                                   llvm::Type::getInt32Ty(ctx), 1))};

  // append metadata to nvvm.annotations
  md->addOperand(llvm::MDNode::get(ctx, md_vals));

  std::unordered_set<llvm::Function*> roots{wrapper_func, func};
  if (gpu_target.row_func_not_inlined) {
    clear_function_attributes(gpu_target.cgen_state->row_func_);
    roots.insert(gpu_target.cgen_state->row_func_);
    if (gpu_target.cgen_state->filter_func_) {
      roots.insert(gpu_target.cgen_state->filter_func_);
    }
  }

  // prevent helper functions from being removed
  for (auto f : gpu_target.cgen_state->helper_functions_) {
    roots.insert(f);
  }

  if (requires_libdevice) {
    for (llvm::Function& F : *llvm_module) {
      // some libdevice functions call "__internal"-prefixed helpers that carry
      // a noinline attribute; keep them alive alongside the roots
      if (F.hasName() && F.getName().startswith("__internal") && !F.isDeclaration()) {
        roots.insert(&F);
      }
      legalize_nvvm_ir(&F);
    }
  }

  // prevent the UDF function(s) from being removed the way the runtime functions are
  std::unordered_set<std::string> udf_declarations;

  if (executor->has_udf_module(/*is_gpu=*/true)) {
    for (auto& f : executor->get_udf_module(/*is_gpu=*/true)->getFunctionList()) {
      llvm::Function* udf_function = llvm_module->getFunction(f.getName());
      if (udf_function) {
        legalize_nvvm_ir(udf_function);
        roots.insert(udf_function);

        // note external functions declared by UDFs so duplicate declarations
        // can be avoided later
        if (f.isDeclaration()) {
          udf_declarations.insert(f.getName().str());
        }
      }
    }
  }

  if (executor->has_rt_udf_module(/*is_gpu=*/true)) {
    for (auto& f : executor->get_rt_udf_module(/*is_gpu=*/true)->getFunctionList()) {
      llvm::Function* udf_function = llvm_module->getFunction(f.getName());
      if (udf_function) {
        legalize_nvvm_ir(udf_function);
        roots.insert(udf_function);

        if (f.isDeclaration()) {
          udf_declarations.insert(f.getName().str());
        }
      }
    }
  }

  std::vector<llvm::Function*> rt_funcs;
  for (auto& Fn : *llvm_module) {
    if (roots.count(&Fn)) {
      continue;
    }
    rt_funcs.push_back(&Fn);
  }
  for (auto& pFn : rt_funcs) {
    pFn->removeFromParent();
  }

  if (requires_libdevice) {
    add_intrinsics_to_module(llvm_module);
  }

  llvm_module->print(os, nullptr);
  os.flush();

  for (auto& pFn : rt_funcs) {
    llvm_module->getFunctionList().push_back(pFn);
  }
  llvm_module->eraseNamedMetadata(md);

  auto cuda_llir = ss.str() + cuda_rt_decls + extension_function_decls(udf_declarations);
  std::string ptx;
  try {
    ptx = generatePTX(
        cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state->context_);
  } catch (ParseIRError& e) {
    LOG(WARNING) << "Failed to generate PTX: " << e.what()
                 << ". Switching to CPU execution target.";
    throw QueryMustRunOnCpu();
  }
  LOG(PTX) << "PTX for the GPU:\n" << ptx << "\nEnd of PTX";

  auto cubin_result = ptx_to_cubin(ptx, gpu_target.block_size, gpu_target.cuda_mgr);
  auto& option_keys = cubin_result.option_keys;
  auto& option_values = cubin_result.option_values;
  auto cubin = cubin_result.cubin;
  auto link_state = cubin_result.link_state;
  const auto num_options = option_keys.size();

  auto func_name = wrapper_func->getName().str();
  auto gpu_compilation_context = std::make_shared<GpuCompilationContext>();
  for (int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
       ++device_id) {
    gpu_compilation_context->addDeviceCode(
        std::make_unique<GpuDeviceCompilationContext>(cubin,
                                                      func_name,
                                                      device_id,
                                                      gpu_target.cuda_mgr,
                                                      num_options,
                                                      &option_keys[0],
                                                      &option_values[0]));
  }

  checkCudaErrors(cuLinkDestroy(link_state));
  return gpu_compilation_context;
}
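
// Entry point for GPU compilation: detects row functions that must stay
// un-inlined, builds the GPUTarget descriptor, and retries once on
// CUDA_ERROR_OUT_OF_MEMORY after evicting part of the GPU code cache.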
std::shared_ptr<GpuCompilationContext> Executor::optimizeAndCodegenGPU(
    llvm::Function* query_func,
    llvm::Function* multifrag_query_func,
    std::unordered_set<llvm::Function*>& live_funcs,
    const bool no_inline,
    const CudaMgr_Namespace::CudaMgr* cuda_mgr,
    const bool is_gpu_smem_used,
    const CompilationOptions& co) {
  CHECK(cuda_mgr);
  // also used as the GPU code cache key
  CodeCacheKey key{serialize_llvm_object(query_func),
                   serialize_llvm_object(cgen_state_->row_func_)};
  if (cgen_state_->filter_func_) {
    key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
  }
  for (const auto helper : cgen_state_->helper_functions_) {
    key.push_back(serialize_llvm_object(helper));
  }

  bool row_func_not_inlined = false;
  if (no_inline) {
    for (auto it = llvm::inst_begin(cgen_state_->row_func_),
              e = llvm::inst_end(cgen_state_->row_func_);
         it != e;
         ++it) {
      if (llvm::isa<llvm::CallInst>(*it)) {
        auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
        if (get_gv_call.getCalledFunction()->getName() == "array_size" ||
            get_gv_call.getCalledFunction()->getName() == "linear_probabilistic_count") {
          mark_function_never_inline(cgen_state_->row_func_);
          row_func_not_inlined = true;
          break;
        }
      }
    }
  }

  initializeNVPTXBackend();
  CodeGenerator::GPUTarget gpu_target{nvptx_target_machine_.get(),
                                      cuda_mgr,
                                      blockSize(),
                                      cgen_state_.get(),
                                      row_func_not_inlined};
  std::shared_ptr<GpuCompilationContext> compilation_context;

  try {
    compilation_context = CodeGenerator::generateNativeGPUCode(this,
                                                               query_func,
                                                               multifrag_query_func,
                                                               live_funcs,
                                                               is_gpu_smem_used,
                                                               co,
                                                               gpu_target);
  } catch (CudaMgr_Namespace::CudaErrorException& cuda_error) {
    if (cuda_error.getStatus() == CUDA_ERROR_OUT_OF_MEMORY) {
      // thrown if memory was not able to be allocated on the GPU; evict a
      // fraction (g_fraction_code_cache_to_evict) of the code cache and retry
      LOG(WARNING) << "Failed to allocate GPU memory for generated code. Evicting "
                   << g_fraction_code_cache_to_evict * 100.
                   << "% of GPU code cache and re-trying.";
      compilation_context = CodeGenerator::generateNativeGPUCode(this,
                                                                 query_func,
                                                                 multifrag_query_func,
                                                                 live_funcs,
                                                                 is_gpu_smem_used,
                                                                 co,
                                                                 gpu_target);
    } else {
      throw;
    }
  }
  return compilation_context;
}
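
// Parses the NVVM IR produced above and runs the NVPTX backend over it to
// emit PTX text.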
std::string CodeGenerator::generatePTX(const std::string& cuda_llir,
                                       llvm::TargetMachine* nvptx_target_machine,
                                       llvm::LLVMContext& context) {
  auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir, "", false);

  llvm::SMDiagnostic parse_error;

  auto llvm_module = llvm::parseIR(mem_buff->getMemBufferRef(), parse_error, context);
  if (!llvm_module) {
    LOG(IR) << "CodeGenerator::generatePTX:NVVM IR:\n" << cuda_llir << "\nEnd of NNVM IR";
    throw_parseIR_error(parse_error, "generatePTX", /*is_gpu=*/true);
  }

  llvm::SmallString<256> code_str;
  llvm::raw_svector_ostream formatted_os(code_str);
  CHECK(nvptx_target_machine);

  llvm::legacy::PassManager ptxgen_pm;
  llvm_module->setDataLayout(nvptx_target_machine->createDataLayout());

#if LLVM_VERSION_MAJOR >= 10
  nvptx_target_machine->addPassesToEmitFile(
      ptxgen_pm, formatted_os, nullptr, llvm::CGFT_AssemblyFile);
#else
  nvptx_target_machine->addPassesToEmitFile(
      ptxgen_pm, formatted_os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
#endif
  ptxgen_pm.run(*llvm_module);

#if LLVM_VERSION_MAJOR >= 11
  return std::string(code_str);
#else
  return code_str.str();
#endif
}
std::unique_ptr<llvm::TargetMachine> CodeGenerator::initializeNVPTXBackend(
    const CudaMgr_Namespace::NvidiaDeviceArch arch) {
  llvm::InitializeAllTargets();
  llvm::InitializeAllTargetMCs();
  llvm::InitializeAllAsmPrinters();

  std::string err;
  auto target = llvm::TargetRegistry::lookupTarget("nvptx64", err);
  if (!target) {
    LOG(FATAL) << err;
  }
  return std::unique_ptr<llvm::TargetMachine>(
      target->createTargetMachine("nvptx64-nvidia-cuda",
                                  CudaMgr_Namespace::CudaMgr::deviceArchToSM(arch),
                                  "",
                                  llvm::TargetOptions(),
                                  llvm::Reloc::Static));
}
std::string Executor::generatePTX(const std::string& cuda_llir) const {
  return CodeGenerator::generatePTX(
      cuda_llir, nvptx_target_machine_.get(), cgen_state_->context_);
}

void Executor::initializeNVPTXBackend() const {
  if (nvptx_target_machine_) {
    return;
  }
  const auto arch = cudaMgr()->getDeviceArch();
  nvptx_target_machine_ = CodeGenerator::initializeNVPTXBackend(arch);
}
// a small set of runtime functions is always cloned from the runtime module
// rather than resolved by name
bool always_clone_runtime_function(const llvm::Function* func) {
  return func->getName() == "query_stub_hoisted_literals" ||
         func->getName() == "multifrag_query_hoisted_literals" ||
         func->getName() == "query_stub" || func->getName() == "multifrag_query" ||
         func->getName() == "fixed_width_int_decode" ||
         func->getName() == "fixed_width_unsigned_decode" ||
         func->getName() == "diff_fixed_width_int_decode" ||
         func->getName() == "fixed_width_double_decode" ||
         func->getName() == "fixed_width_float_decode" ||
         func->getName() == "fixed_width_small_date_decode" ||
         func->getName() == "fixed_width_date_encode" ||
         func->getName() == "record_error_code" || func->getName() == "get_error_code" ||
         func->getName() == "pos_start_impl" || func->getName() == "pos_step_impl" ||
         func->getName() == "group_buff_idx_impl" ||
         func->getName() == "init_shared_mem" ||
         func->getName() == "init_shared_mem_nop" || func->getName() == "write_back_nop";
}
std::unique_ptr<llvm::Module> read_llvm_module_from_bc_filename(
    const std::string& bc_filename,
    llvm::LLVMContext& context) {
  llvm::SMDiagnostic err;

  auto buffer_or_error = llvm::MemoryBuffer::getFile(bc_filename);
  CHECK(!buffer_or_error.getError()) << "bc_filename=" << bc_filename;
  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();

  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
  CHECK(!owner.takeError());
  CHECK(owner->get());
  return std::move(owner.get());
}
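
// UDF modules may arrive either as IR files (load-time UDFs) or as in-memory
// IR strings (runtime UDF/UDTFs); for GPU modules the target triple must be
// nvptx64-nvidia-cuda or the module is rejected.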
std::unique_ptr<llvm::Module> read_llvm_module_from_ir_file(
    const std::string& udf_ir_filename,
    llvm::LLVMContext& ctx,
    bool is_gpu = false) {
  llvm::SMDiagnostic parse_error;

  llvm::StringRef file_name_arg(udf_ir_filename);

  auto owner = llvm::parseIRFile(file_name_arg, parse_error, ctx);
  if (!owner) {
    throw_parseIR_error(parse_error, udf_ir_filename, is_gpu);
  }

  if (is_gpu) {
    llvm::Triple gpu_triple(owner->getTargetTriple());
    if (!gpu_triple.isNVPTX()) {
      LOG(WARNING)
          << "Expected triple nvptx64-nvidia-cuda for NVVM IR of loadtime UDFs but got "
          << gpu_triple.str() << ". Disabling the NVVM IR module.";
      return std::unique_ptr<llvm::Module>();
    }
  }
  return owner;
}
std::unique_ptr<llvm::Module> read_llvm_module_from_ir_string(
    const std::string& udf_ir_string,
    llvm::LLVMContext& ctx,
    bool is_gpu = false) {
  llvm::SMDiagnostic parse_error;

  auto buf = std::make_unique<llvm::MemoryBufferRef>(udf_ir_string,
                                                     "Runtime UDF/UDTF LLVM/NVVM IR");

  auto owner = llvm::parseIR(*buf, parse_error, ctx);
  if (!owner) {
    LOG(IR) << "read_llvm_module_from_ir_string:\n"
            << udf_ir_string << "\nEnd of LLVM/NVVM IR";
    throw_parseIR_error(parse_error, "", is_gpu);
  }

  if (is_gpu) {
    llvm::Triple gpu_triple(owner->getTargetTriple());
    if (!gpu_triple.isNVPTX()) {
      LOG(IR) << "read_llvm_module_from_ir_string:\n"
              << udf_ir_string << "\nEnd of NNVM IR";
      LOG(WARNING) << "Expected triple nvptx64-nvidia-cuda for NVVM IR but got "
                   << gpu_triple.str()
                   << ". Executing runtime UDF/UDTFs on GPU will be disabled.";
      return std::unique_ptr<llvm::Module>();
    }
  }
  return owner;
}
void bind_pos_placeholders(const std::string& pos_fn_name,
                           const bool use_resume_param,
                           llvm::Function* query_func,
                           llvm::Module* llvm_module) {
  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
       ++it) {
    if (!llvm::isa<llvm::CallInst>(*it)) {
      continue;
    }
    auto& pos_call = llvm::cast<llvm::CallInst>(*it);
    if (std::string(pos_call.getCalledFunction()->getName()) == pos_fn_name) {
      if (use_resume_param) {
        const auto error_code_arg = get_arg_by_name(query_func, "error_code");
        llvm::ReplaceInstWithInst(
            &pos_call,
            llvm::CallInst::Create(llvm_module->getFunction(pos_fn_name + "_impl"),
                                   error_code_arg));
      } else {
        llvm::ReplaceInstWithInst(
            &pos_call,
            llvm::CallInst::Create(llvm_module->getFunction(pos_fn_name + "_impl")));
      }
      break;
    }
  }
}
void set_row_func_argnames(llvm::Function* row_func,
                           const size_t in_col_count,
                           const size_t agg_col_count,
                           const bool hoist_literals) {
  auto arg_it = row_func->arg_begin();

  if (agg_col_count) {
    for (size_t i = 0; i < agg_col_count; ++i) {
      arg_it->setName("out");
      ++arg_it;
    }
  } else {
    arg_it->setName("group_by_buff");
    ++arg_it;
    arg_it->setName("varlen_output_buff");
    ++arg_it;
    arg_it->setName("crt_matched");
    ++arg_it;
    arg_it->setName("total_matched");
    ++arg_it;
    arg_it->setName("old_total_matched");
    ++arg_it;
    arg_it->setName("max_matched");
    ++arg_it;
  }

  arg_it->setName("agg_init_val");
  ++arg_it;

  arg_it->setName("pos");
  ++arg_it;

  arg_it->setName("frag_row_off");
  ++arg_it;

  arg_it->setName("num_rows_per_scan");
  ++arg_it;

  if (hoist_literals) {
    arg_it->setName("literals");
    ++arg_it;
  }

  for (size_t i = 0; i < in_col_count; ++i) {
    arg_it->setName("col_buf" + std::to_string(i));
    ++arg_it;
  }

  arg_it->setName("join_hash_tables");
}
llvm::Function* create_row_function(const size_t in_col_count,
                                    const size_t agg_col_count,
                                    const bool hoist_literals,
                                    llvm::Module* llvm_module,
                                    llvm::LLVMContext& context) {
  std::vector<llvm::Type*> row_process_arg_types;

  if (agg_col_count) {
    // output (aggregate) arguments
    for (size_t i = 0; i < agg_col_count; ++i) {
      row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
    }
  } else {
    // group by buffer
    row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
    // varlen output buffer
    row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
    // current match count
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
    // total match count
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
    // old total match count
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
    // max matched (total number of slots in the output buffer)
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
  }

  // aggregate init values
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));

  // position argument
  row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));

  // fragment row offset argument
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));

  // number of rows for each scan
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));

  // literals buffer argument
  if (hoist_literals) {
    row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
  }

  // column buffer arguments
  for (size_t i = 0; i < in_col_count; ++i) {
    row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
  }

  // join hash table argument
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));

  // generate the function
  auto ft =
      llvm::FunctionType::get(get_int_type(32, context), row_process_arg_types, false);

  auto row_func = llvm::Function::Create(
      ft, llvm::Function::ExternalLinkage, "row_func", llvm_module);

  // set the row function argument names
  set_row_func_argnames(row_func, in_col_count, agg_col_count, hoist_literals);

  return row_func;
}
void bind_query(llvm::Function* query_func,
                const std::string& query_fname,
                llvm::Function* multifrag_query_func,
                llvm::Module* llvm_module) {
  std::vector<llvm::CallInst*> query_stubs;
  for (auto it = llvm::inst_begin(multifrag_query_func),
            e = llvm::inst_end(multifrag_query_func);
       it != e;
       ++it) {
    if (!llvm::isa<llvm::CallInst>(*it)) {
      continue;
    }
    auto& query_call = llvm::cast<llvm::CallInst>(*it);
    if (std::string(query_call.getCalledFunction()->getName()) == query_fname) {
      query_stubs.push_back(&query_call);
    }
  }
  for (auto& S : query_stubs) {
    std::vector<llvm::Value*> args;
    for (size_t i = 0; i < S->getNumOperands() - 1; ++i) {
      args.push_back(S->getArgOperand(i));
    }
    llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args, ""));
  }
}
std::vector<std::string> get_agg_fnames(const std::vector<Analyzer::Expr*>& target_exprs,
                                        const bool is_group_by) {
  std::vector<std::string> result;
  for (size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
       ++target_idx, ++agg_col_idx) {
    const auto target_expr = target_exprs[target_idx];
    CHECK(target_expr);
    const auto target_type_info = target_expr->get_type_info();
    const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
    const bool is_varlen =
        (target_type_info.is_string() &&
         target_type_info.get_compression() == kENCODING_NONE) ||
        target_type_info.is_array();
    if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
      result.emplace_back(target_type_info.is_fp() ? "agg_id_double" : "agg_id");
      if (is_varlen) {
        result.emplace_back("agg_id");
      }
      if (target_type_info.is_geometry()) {
        result.emplace_back("agg_id");
        for (auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
          result.emplace_back("agg_id");
        }
      }
      continue;
    }
    const auto agg_type = agg_expr->get_aggtype();
    const auto& agg_type_info =
        agg_type != kCOUNT ? agg_expr->get_arg()->get_type_info() : target_type_info;
    switch (agg_type) {
      case kAVG: {
        if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
            !agg_type_info.is_fp()) {
          throw std::runtime_error("AVG is only valid on integer and floating point");
        }
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_sum"
                                : "agg_sum_double");
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_count"
                                : "agg_count_double");
        break;
      }
      case kMIN: {
        if (agg_type_info.is_string() || agg_type_info.is_array() ||
            agg_type_info.is_geometry()) {
          throw std::runtime_error(
              "MIN on strings, arrays or geospatial types not supported yet");
        }
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_min"
                                : "agg_min_double");
        break;
      }
      case kMAX: {
        if (agg_type_info.is_string() || agg_type_info.is_array() ||
            agg_type_info.is_geometry()) {
          throw std::runtime_error(
              "MAX on strings, arrays or geospatial types not supported yet");
        }
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_max"
                                : "agg_max_double");
        break;
      }
      case kSUM: {
        if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
            !agg_type_info.is_fp()) {
          throw std::runtime_error("SUM is only valid on integer and floating point");
        }
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_sum"
                                : "agg_sum_double");
        break;
      }
      case kCOUNT:
        result.emplace_back(agg_expr->get_is_distinct() ? "agg_count_distinct"
                                                        : "agg_count");
        break;
      case kSINGLE_VALUE: {
        result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
        break;
      }
      case kSAMPLE: {
        result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
        break;
      }
      case kAPPROX_COUNT_DISTINCT:
        result.emplace_back("agg_approximate_count_distinct");
        break;
      case kAPPROX_QUANTILE:
        result.emplace_back("agg_approx_quantile");
        break;
      default:
        CHECK(false);
    }
  }
  return result;
}
                                const bool is_cuda_ir) {
void Executor::markDeadRuntimeFuncs(llvm::Module& llvm_module,
                                    const std::vector<llvm::Function*>& roots,
                                    const std::vector<llvm::Function*>& leaves) {
  std::unordered_set<llvm::Function*> live_funcs;
  live_funcs.insert(roots.begin(), roots.end());
  live_funcs.insert(leaves.begin(), leaves.end());

  // these nop stubs must stay available even if no root references them
  if (auto F = llvm_module.getFunction("init_shared_mem_nop")) {
    live_funcs.insert(F);
  }
  if (auto F = llvm_module.getFunction("write_back_nop")) {
    live_funcs.insert(F);
  }

  for (const llvm::Function* F : roots) {
    for (const llvm::BasicBlock& BB : *F) {
      for (const llvm::Instruction& I : BB) {
        if (const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
          live_funcs.insert(CI->getCalledFunction());
        }
      }
    }
  }

  for (llvm::Function& F : llvm_module) {
    if (!live_funcs.count(&F) && !F.isDeclaration()) {
      F.setLinkage(llvm::GlobalValue::InternalLinkage);
    }
  }
}
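
// Finds a named value of the given instruction type, optionally restricted to
// a named basic block; used below to locate e.g. the "row_count" load in the
// ".entry" block.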
template <typename InstType>
llvm::Value* find_variable_in_basic_block(llvm::Function* func,
                                          std::string bb_name,
                                          std::string variable_name) {
  llvm::Value* result = nullptr;
  if (func == nullptr || variable_name.empty()) {
    return result;
  }
  bool is_found = false;
  for (auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
    if (!bb_name.empty() && bb_it->getName() != bb_name) {
      continue;
    }
    for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
      if (llvm::isa<InstType>(*inst_it)) {
        if (inst_it->getName() == variable_name) {
          result = &*inst_it;
          is_found = true;
          break;
        }
      }
    }
  }
  return result;
}
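
// Splits the row-processing loop after the call to row_process and wires in
// the error / dynamic watchdog / runtime interrupt checks. On GPU the checks
// are throttled (watchdog per block, interrupt every 2^k-th outer iteration)
// to keep the instrumentation overhead low.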
1705 llvm::Function* query_func,
1706 bool run_with_dynamic_watchdog,
1707 bool run_with_allowing_runtime_interrupt,
1708 const std::vector<JoinLoop>& join_loops,
1710 const std::vector<InputTableInfo>& input_table_infos) {
1716 if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
1719 run_with_allowing_runtime_interrupt =
false;
1725 executor_session_mutex_);
1726 if (current_query_session_.empty()) {
1727 run_with_allowing_runtime_interrupt =
false;
1731 llvm::Value* row_count =
nullptr;
1732 if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1735 find_variable_in_basic_block<llvm::LoadInst>(query_func,
".entry",
"row_count");
1738 bool done_splitting =
false;
1739 for (
auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1741 llvm::Value* pos =
nullptr;
1742 for (
auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1743 if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1744 llvm::isa<llvm::PHINode>(*inst_it)) {
1745 if (inst_it->getName() ==
"pos") {
1750 if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1753 auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
1754 if (std::string(row_func_call.getCalledFunction()->getName()) ==
"row_process") {
1755 auto next_inst_it = inst_it;
1757 auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1758 auto& br_instr = bb_it->back();
1759 llvm::IRBuilder<> ir_builder(&br_instr);
1760 llvm::Value* err_lv = &*inst_it;
1761 llvm::Value* err_lv_returned_from_row_func =
nullptr;
1762 if (run_with_dynamic_watchdog) {
1764 llvm::Value* call_watchdog_lv =
nullptr;
1770 auto crit_edge_rem =
1771 (blockSize() & (blockSize() - 1))
1772 ? ir_builder.CreateSRem(
1774 cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1775 : ir_builder.CreateAnd(
1777 cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1778 auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1779 crit_edge_threshold->setName(
"crit_edge_threshold");
1784 ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
1787 auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1788 call_watchdog_lv = ir_builder.CreateICmp(
1789 llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1791 CHECK(call_watchdog_lv);
1792 auto error_check_bb = bb_it->splitBasicBlock(
1793 llvm::BasicBlock::iterator(br_instr),
".error_check");
1794 auto& watchdog_br_instr = bb_it->back();
1796 auto watchdog_check_bb = llvm::BasicBlock::Create(
1797 cgen_state_->context_,
".watchdog_check", query_func, error_check_bb);
1798 llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1799 auto detected_timeout = watchdog_ir_builder.CreateCall(
1800 cgen_state_->module_->getFunction(
"dynamic_watchdog"), {});
1801 auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1803 watchdog_ir_builder.CreateBr(error_check_bb);
1805 llvm::ReplaceInstWithInst(
1807 llvm::BranchInst::Create(
1808 watchdog_check_bb, error_check_bb, call_watchdog_lv));
1809 ir_builder.SetInsertPoint(&br_instr);
1810 auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1812 unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1813 unified_err_lv->addIncoming(err_lv, &*bb_it);
1814 err_lv = unified_err_lv;
1815 }
else if (run_with_allowing_runtime_interrupt) {
1817 llvm::Value* call_check_interrupt_lv{
nullptr};
1818 llvm::Value* interrupt_err_lv{
nullptr};
1819 llvm::BasicBlock* error_check_bb{
nullptr};
1820 llvm::BasicBlock* interrupt_check_bb{
nullptr};
1821 llvm::Instruction* check_interrupt_br_instr{
nullptr};
1823 auto has_loop_join = std::any_of(
1824 join_loops.begin(), join_loops.end(), [](
const JoinLoop& join_loop) {
1825 return join_loop.isNestedLoopJoin();
1827 auto codegen_interrupt_checker = [&]() {
1828 error_check_bb = bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr),
1830 check_interrupt_br_instr = &bb_it->back();
1832 interrupt_check_bb = llvm::BasicBlock::Create(
1833 cgen_state_->context_,
".interrupt_check", query_func, error_check_bb);
1834 llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
1835 auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
1836 cgen_state_->module_->getFunction(
"check_interrupt"), {});
1837 interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
1841 interrupt_checker_ir_builder.CreateBr(error_check_bb);
1843 if (has_loop_join) {
1844 codegen_interrupt_checker();
1845 CHECK(interrupt_check_bb);
1846 CHECK(check_interrupt_br_instr);
1847 llvm::ReplaceInstWithInst(check_interrupt_br_instr,
1848 llvm::BranchInst::Create(interrupt_check_bb));
1849 ir_builder.SetInsertPoint(&br_instr);
1850 err_lv = interrupt_err_lv;
1862 int64_t total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
1863 uint64_t interrupt_checking_freq = 32;
1867 if (!input_table_infos.empty()) {
1868 const auto& outer_table_info = *input_table_infos.begin();
1869 auto num_outer_table_tuples =
1870 outer_table_info.info.getFragmentNumTuplesUpperBound();
1871 if (num_outer_table_tuples > 0) {
1879 auto max_inc = uint64_t(
1880 floor(num_outer_table_tuples / (gridSize() * blockSize() * 2)));
1886 auto calibrated_inc =
1887 uint64_t(floor(max_inc * (1 - freq_control_knob)));
1888 interrupt_checking_freq =
1893 if (interrupt_checking_freq > max_inc) {
1894 interrupt_checking_freq = max_inc / 2;
1896 if (interrupt_checking_freq < 8) {
1899 interrupt_checking_freq = 8;
1903 VLOG(1) <<
"Set the running query interrupt checking frequency: "
1904 << interrupt_checking_freq;
1906 llvm::Value* pos_shifted_per_iteration =
1907 ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
1908 auto interrupt_predicate = ir_builder.CreateAnd(pos_shifted_per_iteration,
1909 interrupt_checking_freq);
1910 call_check_interrupt_lv =
1911 ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1912 interrupt_predicate,
1913 cgen_state_->llInt(int64_t(0LL)));
1916 auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1917 call_check_interrupt_lv =
1918 ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1919 interrupt_predicate,
1920 cgen_state_->llInt(int64_t(0LL)));
1922 codegen_interrupt_checker();
1923 CHECK(call_check_interrupt_lv);
1924 CHECK(interrupt_err_lv);
1925 CHECK(interrupt_check_bb);
1926 CHECK(error_check_bb);
1927 CHECK(check_interrupt_br_instr);
1928 llvm::ReplaceInstWithInst(
1929 check_interrupt_br_instr,
1930 llvm::BranchInst::Create(
1931 interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
1932 ir_builder.SetInsertPoint(&br_instr);
1933 auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1935 unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
1936 unified_err_lv->addIncoming(err_lv, &*bb_it);
1937 err_lv = unified_err_lv;
    if (!err_lv_returned_from_row_func) {
      err_lv_returned_from_row_func = err_lv;
    }
    if (device_type == ExecutorDeviceType::GPU && run_with_dynamic_watchdog) {
      // Under the dynamic watchdog all threads within a block return together,
      // so only the out-of-time error is treated as fatal here.
      err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
                                     err_lv,
                                     cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
    } else {
      err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
                                     err_lv,
                                     cgen_state_->llInt(static_cast<int32_t>(0)));
    }
    auto error_bb = llvm::BasicBlock::Create(
        cgen_state_->context_, ".error_exit", query_func, new_bb);
    const auto error_code_arg = get_arg_by_name(query_func, "error_code");
    llvm::CallInst::Create(
        cgen_state_->module_->getFunction("record_error_code"),
        std::vector<llvm::Value*>{err_lv_returned_from_row_func, error_code_arg},
        "",
        error_bb);
    llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
    llvm::ReplaceInstWithInst(&br_instr,
                              llvm::BranchInst::Create(error_bb, new_bb, err_lv));
    done_splitting = true;
    break;
    }
  }
  CHECK(done_splitting);
}
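
// AutoTrackBuffersInRuntimeIR() below scans the runtime module for calls to
// allocate_varlen_buffer and, when the module opts in through the
// 'manage_memory_buffer' flag, pairs each allocation with a call to
// register_buffer_with_executor_rsm so the executor's row set memory owner
// can track and release the buffer.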
void Executor::AutoTrackBuffersInRuntimeIR() {
  llvm::Module* M = cgen_state_->module_;
  if (M->getFunction("allocate_varlen_buffer") == nullptr) {
    return;
  }

  // Track varlen buffers only when the module opts in via metadata.
  bool should_track = false;
  auto* flag = M->getModuleFlag("manage_memory_buffer");
  if (auto* cnt = llvm::mdconst::extract_or_null<llvm::ConstantInt>(flag)) {
    if (cnt->getZExtValue() == 1) {
      should_track = true;
    }
  }
  if (!should_track) {
    return;
  }

  LOG(INFO) << "Found 'manage_memory_buffer' metadata.";
  llvm::SmallVector<llvm::CallInst*, 4> calls_to_analyze;
  for (llvm::Function& F : *M) {
    for (llvm::BasicBlock& BB : F) {
      for (llvm::Instruction& I : BB) {
        if (llvm::CallInst* CI = llvm::dyn_cast<llvm::CallInst>(&I)) {
          llvm::Function* called = CI->getCalledFunction();
          if (called && called->getName() == "allocate_varlen_buffer") {
            calls_to_analyze.push_back(CI);
          }
        }
      }
    }
  }

  // For each allocation, register the buffer with the executor's row set
  // memory owner unless a registration call is already present.
  llvm::IRBuilder<> Builder(cgen_state_->context_);
  auto i64 = get_int_type(64, cgen_state_->context_);
  auto i8p = get_int_ptr_type(8, cgen_state_->context_);
  auto void_ = llvm::Type::getVoidTy(cgen_state_->context_);
  llvm::FunctionType* fnty = llvm::FunctionType::get(void_, {i64, i8p}, false);
  llvm::FunctionCallee register_buffer_fn =
      M->getOrInsertFunction("register_buffer_with_executor_rsm", fnty, {});

  int64_t executor_addr = reinterpret_cast<int64_t>(this);
  for (llvm::CallInst* CI : calls_to_analyze) {
    bool already_registered = false;
    for (llvm::User* U : CI->users()) {
      if (llvm::CallInst* call = llvm::dyn_cast<llvm::CallInst>(U)) {
        if (call->getCalledFunction() &&
            call->getCalledFunction()->getName() ==
                "register_buffer_with_executor_rsm") {
          already_registered = true;
          break;
        }
      }
    }
    if (!already_registered) {
      Builder.SetInsertPoint(CI->getNextNode());
      Builder.CreateCall(register_buffer_fn,
                         {ll_int(executor_addr, cgen_state_->context_), CI});
    }
  }
}
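
// inlineHoistedLiterals() rewrites row_func_ (and filter_func_, when present)
// so that literal values hoisted into the query function are passed down as
// extra function arguments instead of being re-decoded from the literal
// buffer on every row.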
std::vector<llvm::Value*> Executor::inlineHoistedLiterals() {
  std::vector<llvm::Value*> hoisted_literals;

  // row_func_ uses literals whose loads were hoisted up to the query function;
  // extend the row_func_ signature so those values can be passed as arguments.
  std::vector<llvm::Type*> row_process_arg_types;

  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
                                    E = cgen_state_->row_func_->arg_end();
       I != E;
       ++I) {
    row_process_arg_types.push_back(I->getType());
  }

  for (auto& element : cgen_state_->query_func_literal_loads_) {
    for (auto value : element.second) {
      row_process_arg_types.push_back(value->getType());
    }
  }

  auto ft = llvm::FunctionType::get(
      get_int_type(32, cgen_state_->context_), row_process_arg_types, false);
  auto row_func_with_hoisted_literals =
      llvm::Function::Create(ft,
                             llvm::Function::ExternalLinkage,
                             "row_func_hoisted_literals",
                             cgen_state_->row_func_->getParent());

  auto row_func_arg_it = row_func_with_hoisted_literals->arg_begin();
  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
                                    E = cgen_state_->row_func_->arg_end();
       I != E;
       ++I) {
    row_func_arg_it->setName(I->getName());
    ++row_func_arg_it;
  }

  decltype(row_func_with_hoisted_literals) filter_func_with_hoisted_literals{nullptr};
  decltype(row_func_arg_it) filter_func_arg_it{nullptr};
  if (cgen_state_->filter_func_) {
    // Apply the same treatment to filter_func_, which uses the same literals.
    std::vector<llvm::Type*> filter_func_arg_types;

    for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
                                      E = cgen_state_->filter_func_->arg_end();
         I != E;
         ++I) {
      filter_func_arg_types.push_back(I->getType());
    }

    for (auto& element : cgen_state_->query_func_literal_loads_) {
      for (auto value : element.second) {
        filter_func_arg_types.push_back(value->getType());
      }
    }

    auto ft2 = llvm::FunctionType::get(
        get_int_type(32, cgen_state_->context_), filter_func_arg_types, false);
    filter_func_with_hoisted_literals =
        llvm::Function::Create(ft2,
                               llvm::Function::ExternalLinkage,
                               "filter_func_hoisted_literals",
                               cgen_state_->filter_func_->getParent());

    filter_func_arg_it = filter_func_with_hoisted_literals->arg_begin();
    for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
                                      E = cgen_state_->filter_func_->arg_end();
         I != E;
         ++I) {
      filter_func_arg_it->setName(I->getName());
      ++filter_func_arg_it;
    }
  }

  // Map each hoisted literal load to the function argument carrying its value.
  std::unordered_map<int, std::vector<llvm::Value*>>
      query_func_literal_loads_function_arguments,
      query_func_literal_loads_function_arguments2;

  for (auto& element : cgen_state_->query_func_literal_loads_) {
    std::vector<llvm::Value*> argument_values, argument_values2;

    for (auto value : element.second) {
      hoisted_literals.push_back(value);
      argument_values.push_back(&*row_func_arg_it);
      if (cgen_state_->filter_func_) {
        argument_values2.push_back(&*filter_func_arg_it);
        cgen_state_->filter_func_args_[&*row_func_arg_it] = &*filter_func_arg_it;
      }
      if (value->hasName()) {
        row_func_arg_it->setName("arg_" + value->getName());
        if (cgen_state_->filter_func_) {
          filter_func_arg_it->getContext();
          filter_func_arg_it->setName("arg_" + value->getName());
        }
      }
      ++row_func_arg_it;
      if (cgen_state_->filter_func_) {
        ++filter_func_arg_it;
      }
    }

    query_func_literal_loads_function_arguments[element.first] = argument_values;
    query_func_literal_loads_function_arguments2[element.first] = argument_values2;
  }
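
  // Splicing the basic block list moves the whole function body into the new
  // function without cloning a single instruction; only the argument uses
  // need to be rewired afterwards with replaceAllUsesWith().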
  row_func_with_hoisted_literals->getBasicBlockList().splice(
      row_func_with_hoisted_literals->begin(),
      cgen_state_->row_func_->getBasicBlockList());

  // Also replace row_func_ arguments with the new function's arguments.
  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
                                    E = cgen_state_->row_func_->arg_end(),
                                    I2 = row_func_with_hoisted_literals->arg_begin();
       I != E;
       ++I) {
    I->replaceAllUsesWith(&*I2);
    I2->takeName(&*I);
    cgen_state_->filter_func_args_.replace(&*I, &*I2);
    ++I2;
  }

  cgen_state_->row_func_ = row_func_with_hoisted_literals;

  // And finally replace the literal placeholders with the argument values.
  std::vector<llvm::Instruction*> placeholders;
  std::string prefix("__placeholder__literal_");
  for (auto it = llvm::inst_begin(row_func_with_hoisted_literals),
            e = llvm::inst_end(row_func_with_hoisted_literals);
       it != e;
       ++it) {
    if (it->hasName() && it->getName().startswith(prefix)) {
      auto offset_and_index_entry =
          cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
      CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());

      int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
      int lit_idx = offset_and_index_entry->second.index_of_literal_load;

      it->replaceAllUsesWith(
          query_func_literal_loads_function_arguments[lit_off][lit_idx]);
      placeholders.push_back(&*it);
    }
  }
  for (auto placeholder : placeholders) {
    placeholder->removeFromParent();
  }

  if (cgen_state_->filter_func_) {
    // Move the body of filter_func_ into its new function the same way.
    filter_func_with_hoisted_literals->getBasicBlockList().splice(
        filter_func_with_hoisted_literals->begin(),
        cgen_state_->filter_func_->getBasicBlockList());

    // Also replace filter_func_ arguments with the new function's arguments.
    for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
                                      E = cgen_state_->filter_func_->arg_end(),
                                      I2 = filter_func_with_hoisted_literals->arg_begin();
         I != E;
         ++I) {
      I->replaceAllUsesWith(&*I2);
      I2->takeName(&*I);
      ++I2;
    }

    cgen_state_->filter_func_ = filter_func_with_hoisted_literals;

    // And replace the literal placeholders in the filter function as well.
    std::vector<llvm::Instruction*> placeholders;
    std::string prefix("__placeholder__literal_");
    for (auto it = llvm::inst_begin(filter_func_with_hoisted_literals),
              e = llvm::inst_end(filter_func_with_hoisted_literals);
         it != e;
         ++it) {
      if (it->hasName() && it->getName().startswith(prefix)) {
        auto offset_and_index_entry = cgen_state_->row_func_hoisted_literals_.find(
            llvm::dyn_cast<llvm::Value>(&*it));
        CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());

        int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
        int lit_idx = offset_and_index_entry->second.index_of_literal_load;

        it->replaceAllUsesWith(
            query_func_literal_loads_function_arguments2[lit_off][lit_idx]);
        placeholders.push_back(&*it);
      }
    }
    for (auto placeholder : placeholders) {
      placeholder->removeFromParent();
    }
  }

  return hoisted_literals;
}
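
// The helpers below gate the GPU shared memory optimization:
// get_shared_memory_size() computes the output buffer footprint and
// is_gpu_shared_mem_supported() checks the query shape, the aggregate kinds
// and the per-block shared memory budget.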
size_t get_shared_memory_size(const bool shared_mem_used,
                              const QueryMemoryDescriptor* query_mem_desc_ptr) {
  return shared_mem_used
             ? (query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount())
             : 0;
}

bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor* query_mem_desc_ptr,
                                 const RelAlgExecutionUnit& ra_exe_unit,
                                 const CudaMgr_Namespace::CudaMgr* cuda_mgr,
                                 const ExecutorDeviceType device_type,
                                 const unsigned gpu_blocksize,
                                 const unsigned num_blocks_per_mp) {
  // ...
  if (query_mem_desc_ptr->getQueryDescriptionType() ==
          QueryDescriptionType::NonGroupedAggregate &&
      g_enable_smem_non_grouped_agg /* && ... */) {
    if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
      return false;
    }
    // Skip if any target is varlen or an aggregate other than COUNT.
    const auto target_infos =
        target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
    std::unordered_set<SQLAgg> supported_aggs{kCOUNT};
    if (std::find_if(target_infos.begin(),
                     target_infos.end(),
                     [&supported_aggs](const TargetInfo& ti) {
                       if (ti.sql_type.is_varlen() ||
                           !supported_aggs.count(ti.agg_kind)) {
                         return true;
                       }
                       return false;
                     }) == target_infos.end()) {
      return true;
    }
  }
  if (query_mem_desc_ptr->getQueryDescriptionType() ==
          QueryDescriptionType::GroupByPerfectHash &&
      g_enable_smem_group_by /* && ... */) {
    if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
      return false;
    }
    // ...
    const size_t shared_memory_threshold_bytes = std::min(
        g_gpu_smem_threshold,
        cuda_mgr->getMinSharedMemoryPerBlockForAllDevices() / num_blocks_per_mp);
    const auto output_buffer_size =
        query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount();
    if (output_buffer_size > shared_memory_threshold_bytes) {
      return false;
    }
    // ...
    const auto target_infos =
        target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
    std::unordered_set<SQLAgg> supported_aggs{kCOUNT};
    if (std::find_if(target_infos.begin(),
                     target_infos.end(),
                     [&supported_aggs](const TargetInfo& ti) {
                       if (ti.sql_type.is_varlen() ||
                           !supported_aggs.count(ti.agg_kind)) {
                         return true;
                       }
                       return false;
                     }) == target_infos.end()) {
      return true;
    }
  }
  return false;
}
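
// serialize_llvm_metadata_footnotes() gathers the distinct metadata nodes
// referenced by the query, row and filter functions and renders them ordered
// by id, giving EXPLAIN output a stable metadata footnote section.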
std::string serialize_llvm_metadata_footnotes(llvm::Function* query_func,
                                              CgenState* cgen_state) {
  std::string llvm_ir;
  std::unordered_set<llvm::MDNode*> md;

  // Loop over all instructions in the query function.
  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
    for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
      llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
      instr_it->getAllMetadata(imd);
      for (auto [kind, node] : imd) {
        md.insert(node);
      }
    }
  }

  // Loop over all instructions in the row function.
  for (auto bb_it = cgen_state->row_func_->begin();
       bb_it != cgen_state->row_func_->end();
       ++bb_it) {
    for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
      llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
      instr_it->getAllMetadata(imd);
      for (auto [kind, node] : imd) {
        md.insert(node);
      }
    }
  }

  // Loop over all instructions in the filter function, if it exists.
  if (cgen_state->filter_func_) {
    for (auto bb_it = cgen_state->filter_func_->begin();
         bb_it != cgen_state->filter_func_->end();
         ++bb_it) {
      for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
        llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
        instr_it->getAllMetadata(imd);
        for (auto [kind, node] : imd) {
          md.insert(node);
        }
      }
    }
  }

  // Serialize the collected metadata; each node becomes a footnote keyed by id.
  std::map<size_t, std::string> sorted_strings;
  for (auto p : md) {
    std::string str;
    llvm::raw_string_ostream os(str);
    p->print(os, cgen_state->module_, true);
    os.flush();
    auto fields = split(str, {}, 1);
    if (fields.empty() || fields[0].empty()) {
      continue;
    }
    sorted_strings.emplace(std::stoul(fields[0].substr(1)), str);
  }
  for (auto [id, text] : sorted_strings) {
    llvm_ir += text;
    llvm_ir += "\n";
  }

  return llvm_ir;
}
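
// compileWorkUnit() is the top-level compilation entry point: it builds the
// query memory descriptor, instantiates the runtime module, generates the
// query/row/filter functions, wires up error checking and finally lowers the
// module to CPU or GPU code.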
std::tuple<CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
Executor::compileWorkUnit(const std::vector<InputTableInfo>& query_infos,
                          const PlanState::DeletedColumnsMap& deleted_cols_map,
                          const RelAlgExecutionUnit& ra_exe_unit,
                          const CompilationOptions& co,
                          const ExecutionOptions& eo,
                          const bool allow_lazy_fetch,
                          std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
                          const size_t max_groups_buffer_entry_guess,
                          const int8_t crt_min_byte_width,
                          const bool has_cardinality_estimation,
                          ColumnCacheMap& column_cache,
                          RenderInfo* render_info) {
  auto timer = DEBUG_TIMER(__func__);
  const auto cuda_mgr = data_mgr_->getCudaMgr();
  // ...
  static std::uint64_t counter = 0;
  ++counter;
  VLOG(1) << "CODEGEN #" << counter << ":";
  LOG(IR) << "CODEGEN #" << counter << ":";
  LOG(PTX) << "CODEGEN #" << counter << ":";
  LOG(ASM) << "CODEGEN #" << counter << ":";
  // ...
  addTransientStringLiterals(ra_exe_unit, row_set_mem_owner);

  GroupByAndAggregate group_by_and_aggregate(
      this,
      co.device_type,
      ra_exe_unit,
      query_infos,
      row_set_mem_owner,
      has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
                                 : std::nullopt);
  auto query_mem_desc =
      group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
                                                       max_groups_buffer_entry_guess,
                                                       crt_min_byte_width,
                                                       render_info,
                                                       eo.output_columnar_hint);

  if (query_mem_desc->getQueryDescriptionType() ==
          QueryDescriptionType::GroupByBaselineHash &&
      !has_cardinality_estimation && (!render_info || !render_info->isInSitu()) &&
      !eo.just_explain) {
    const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
    throw CardinalityEstimationRequired(col_range_info.max - col_range_info.min);
  }

  const bool output_columnar = query_mem_desc->didOutputColumnar();
  const bool gpu_shared_mem_optimization =
      is_gpu_shared_mem_supported(query_mem_desc.get(),
                                  ra_exe_unit,
                                  cuda_mgr,
                                  co.device_type,
                                  cuda_mgr ? this->blockSize() : 1,
                                  cuda_mgr ? this->numBlocksPerMP() : 1);
  GpuSharedMemoryContext gpu_smem_context(
      get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));
  if (gpu_shared_mem_optimization) {
    // ...
    LOG(DEBUG1) << "GPU shared memory is used for the " /* ... */;
  }
  const size_t num_count_distinct_descs =
      query_mem_desc->getCountDistinctDescriptorsSize();
  for (size_t i = 0; i < num_count_distinct_descs; i++) {
    const auto& count_distinct_descriptor =
        query_mem_desc->getCountDistinctDescriptor(i);
    // ...
  }

  // A multi-fragment kSAMPLE over a varlen column cannot run across multiple
  // GPUs; fall back to CPU in that case.
  for (const auto expr : ra_exe_unit.target_exprs) {
    if (auto gby_expr = dynamic_cast<Analyzer::AggExpr*>(expr)) {
      bool has_multiple_gpus = cuda_mgr ? cuda_mgr->getDeviceCount() > 1 : false;
      if (gby_expr->get_aggtype() == SQLAgg::kSAMPLE && has_multiple_gpus /* && ... */) {
        std::set<const Analyzer::ColumnVar*,
                 bool (*)(const Analyzer::ColumnVar*, const Analyzer::ColumnVar*)>
            colvar_set(Analyzer::ColumnVar::colvar_comp);
        gby_expr->collect_column_var(colvar_set, true);
        for (const auto cv : colvar_set) {
          if (cv->get_type_info().is_varlen()) {
            const auto tbl_id = cv->get_table_id();
            std::for_each(query_infos.begin(),
                          query_infos.end(),
                          [tbl_id](const InputTableInfo& input_table_info) {
                            if (input_table_info.table_id == tbl_id &&
                                input_table_info.info.fragments.size() > 1) {
                              throw QueryMustRunOnCpu();
                            }
                          });
          }
        }
      }
    }
  }
  // Read the module template and target either CPU or GPU.
  CHECK(cgen_state_->module_ == nullptr);
  cgen_state_->set_module_shallow_copy(get_rt_module(), /*always_clone=*/true);

  auto is_gpu = co.device_type == ExecutorDeviceType::GPU;
  // ...
  if (has_udf_module(is_gpu)) {
    CodeGenerator::link_udf_module(
        get_udf_module(is_gpu), *cgen_state_->module_, cgen_state_.get());
  }
  if (has_rt_udf_module(is_gpu)) {
    CodeGenerator::link_udf_module(
        get_rt_udf_module(is_gpu), *cgen_state_->module_, cgen_state_.get());
  }

  auto agg_fnames =
      get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
  // Estimator queries use a single aggregate slot.
  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();

  const bool is_group_by{query_mem_desc->isGroupBy()};
  auto [query_func, row_func_call] = is_group_by
      ? query_group_by_template(cgen_state_->module_,
                                co.hoist_literals,
                                *query_mem_desc,
                                co.device_type,
                                ra_exe_unit.scan_limit,
                                gpu_smem_context)
      : query_template(cgen_state_->module_,
                       agg_slot_count,
                       co.hoist_literals,
                       !!ra_exe_unit.estimator,
                       gpu_smem_context);
  // ...
  cgen_state_->query_func_ = query_func;
  cgen_state_->row_func_call_ = row_func_call;
  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
      &query_func->getEntryBlock().front());

  // Generate the function signature and the column head fetches.
  auto& fetch_bb = query_func->front();
  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
                                              query_func->args().begin(),
                                              fetch_ir_builder,
                                              cgen_state_->context_);

  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
                                               is_group_by ? 0 : agg_slot_count,
                                               co.hoist_literals,
                                               cgen_state_->module_,
                                               cgen_state_->context_);
  CHECK(cgen_state_->row_func_);
  cgen_state_->row_func_bb_ =
      llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
  if (g_enable_filter_function) {
    auto filter_func_ft =
        llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
    cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
                                                       llvm::Function::ExternalLinkage,
                                                       "filter_func",
                                                       cgen_state_->module_);
    CHECK(cgen_state_->filter_func_);
    cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
        cgen_state_->context_, "entry", cgen_state_->filter_func_);
  }

  cgen_state_->current_func_ = cgen_state_->row_func_;
  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);

  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
  const auto join_loops =
      buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);

  // ...
  for (auto& simple_qual : ra_exe_unit.simple_quals) {
    plan_state_->addSimpleQual(simple_qual);
  }
  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
  if (is_not_deleted_bb) {
    cgen_state_->row_func_bb_ = is_not_deleted_bb;
  }
  if (!join_loops.empty()) {
    codegenJoinLoops(join_loops,
                     body_execution_unit,
                     group_by_and_aggregate,
                     query_func,
                     cgen_state_->row_func_bb_,
                     *query_mem_desc,
                     co,
                     eo);
  } else {
    const bool can_return_error = compileBody(
        ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
    if (can_return_error || cgen_state_->needs_error_check_ ||
        eo.with_dynamic_watchdog || eo.allow_runtime_query_interrupt) {
      createErrorCheckControlFlow(query_func,
                                  eo.with_dynamic_watchdog,
                                  eo.allow_runtime_query_interrupt,
                                  join_loops,
                                  co.device_type,
                                  group_by_and_aggregate.query_infos_);
    }
  }
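
  // With hoisted literals enabled, the literal loads emitted into the query
  // function are forwarded to the row function as arguments; the rewrite is
  // performed by inlineHoistedLiterals() below.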
  std::vector<llvm::Value*> hoisted_literals;

  VLOG(1) << "number of hoisted literals: "
          << cgen_state_->query_func_literal_loads_.size()
          << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
          << " bytes";

  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
    // We have hoisted literals; inline them as extra row function arguments.
    hoisted_literals = inlineHoistedLiterals();
  }

  // Replace the row function placeholder call with a call to the actual row function.
  std::vector<llvm::Value*> row_func_args;
  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumOperands() - 1; ++i) {
    row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
  }
  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
  row_func_args.insert(
      row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
  llvm::ReplaceInstWithInst(
      cgen_state_->row_func_call_,
      llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));

  // Replace the filter function placeholder call with the actual call.
  if (cgen_state_->filter_func_) {
    std::vector<llvm::Value*> filter_func_args;
    for (auto arg_it = cgen_state_->filter_func_args_.begin();
         arg_it != cgen_state_->filter_func_args_.end();
         ++arg_it) {
      filter_func_args.push_back(arg_it->first);
    }
    llvm::ReplaceInstWithInst(
        cgen_state_->filter_func_call_,
        llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
  }
  // Aggregate
  plan_state_->init_agg_vals_ =
      init_agg_val_vec(target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc),
                       *query_mem_desc);

  if (gpu_smem_context.isSharedMemoryUsed()) {
    // ...
    GpuSharedMemCodeBuilder gpu_smem_code(
        cgen_state_->module_,
        cgen_state_->context_,
        *query_mem_desc,
        target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc),
        plan_state_->init_agg_vals_,
        executor_id_);
    gpu_smem_code.codegen();
    gpu_smem_code.injectFunctionsInto(query_func);

    // Helper functions are only useful in the GPU backend:
    cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
    cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
    LOG(IR) << gpu_smem_code.toString();
  }

  auto multifrag_query_func = cgen_state_->module_->getFunction(
      "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
  CHECK(multifrag_query_func);

  if (eo.allow_runtime_query_interrupt) {
    insertErrorCodeChecker(
        multifrag_query_func, co.hoist_literals, eo.allow_runtime_query_interrupt);
  }

  bind_query(query_func,
             "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
             multifrag_query_func,
             cgen_state_->module_);

  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
  if (cgen_state_->filter_func_) {
    root_funcs.push_back(cgen_state_->filter_func_);
  }
  auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
      *cgen_state_->module_, root_funcs, {multifrag_query_func});
  // Always inline the row function and the filter function.
  // We don't want register spills in the inner loops.
  mark_function_always_inline(cgen_state_->row_func_);
  if (cgen_state_->filter_func_) {
    mark_function_always_inline(cgen_state_->filter_func_);
  }

  // ...
  std::string llvm_ir;
  if (eo.just_explain) {
    if (co.explain_type == ExecutorExplainType::Optimized) {
#ifdef WITH_JIT_DEBUG
      throw std::runtime_error(
          "Explain optimized not available when JIT runtime debug symbols are enabled");
#else
      // Note that we don't run the NVVM reflect pass here; use LOG(IR) to get
      // the optimized IR after NVVM reflect.
      llvm::legacy::PassManager pass_manager;
      optimize_ir(query_func,
                  cgen_state_->module_,
                  pass_manager,
                  live_funcs,
                  gpu_smem_context.isSharedMemoryUsed(),
                  co);
#endif  // WITH_JIT_DEBUG
    }
    // ...
  }

  LOG(IR) << "IR for the "
          << (co.device_type == ExecutorDeviceType::CPU ? "CPU:\n" : "GPU:\n");
  // ...

  AutoTrackBuffersInRuntimeIR();

  // Run some basic validation checks on the LLVM IR before code is generated.
  verify_function_ir(cgen_state_->row_func_);
  if (cgen_state_->filter_func_) {
    verify_function_ir(cgen_state_->filter_func_);
  }

  return std::make_tuple(
      CompilationResult{
          co.device_type == ExecutorDeviceType::CPU
              ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
              : optimizeAndCodegenGPU(query_func,
                                      multifrag_query_func,
                                      live_funcs,
                                      is_group_by || ra_exe_unit.estimator,
                                      cuda_mgr,
                                      gpu_smem_context.isSharedMemoryUsed(),
                                      co),
          cgen_state_->getLiterals(),
          output_columnar,
          llvm_ir,
          std::move(gpu_smem_context)},
      std::move(query_mem_desc));
}
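
// insertErrorCodeChecker() splits the block around the query stub call and
// branches to an .error_exit block whenever the per-kernel error code (or,
// optionally, the runtime interrupt flag) signals a problem.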
void Executor::insertErrorCodeChecker(llvm::Function* query_func,
                                      bool hoist_literals,
                                      bool allow_runtime_query_interrupt) {
  auto query_stub_func_name =
      "query_stub" + std::string(hoist_literals ? "_hoisted_literals" : "");
  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
    for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
      if (!llvm::isa<llvm::CallInst>(*inst_it)) {
        continue;
      }
      auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
      if (std::string(row_func_call.getCalledFunction()->getName()) ==
          query_stub_func_name) {
        auto next_inst_it = inst_it;
        ++next_inst_it;
        auto new_bb = bb_it->splitBasicBlock(next_inst_it);
        auto& br_instr = bb_it->back();
        llvm::IRBuilder<> ir_builder(&br_instr);
        llvm::Value* err_lv = &*inst_it;
        auto error_check_bb =
            bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr), ".error_check");
        // Locate the error code argument; its position differs between the
        // hoisted-literals and plain query signatures.
        llvm::Value* error_code_arg = nullptr;
        auto arg_cnt = 0;
        for (auto arg_it = query_func->arg_begin(); arg_it != query_func->arg_end();
             arg_it++, ++arg_cnt) {
          if (hoist_literals) {
            if (arg_cnt == 9) {
              error_code_arg = &*arg_it;
              break;
            }
          } else {
            if (arg_cnt == 8) {
              error_code_arg = &*arg_it;
              break;
            }
          }
        }
        CHECK(error_code_arg);
        llvm::Value* err_code = nullptr;
        if (allow_runtime_query_interrupt) {
          // Decide the final error code together with the runtime interrupt flag.
          auto& check_interrupt_br_instr = bb_it->back();
          auto interrupt_check_bb = llvm::BasicBlock::Create(
              cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
          llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
          auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
              cgen_state_->module_->getFunction("check_interrupt"), {});
          auto detected_error = interrupt_checker_ir_builder.CreateCall(
              cgen_state_->module_->getFunction("get_error_code"),
              std::vector<llvm::Value*>{error_code_arg});
          err_code = interrupt_checker_ir_builder.CreateSelect(
              detected_interrupt,
              cgen_state_->llInt(Executor::ERR_INTERRUPTED),
              detected_error);
          interrupt_checker_ir_builder.CreateBr(error_check_bb);
          llvm::ReplaceInstWithInst(&check_interrupt_br_instr,
                                    llvm::BranchInst::Create(interrupt_check_bb));
          ir_builder.SetInsertPoint(&br_instr);
        } else {
          // Use the error code stored by the row function.
          ir_builder.SetInsertPoint(&br_instr);
          err_code =
              ir_builder.CreateCall(cgen_state_->module_->getFunction("get_error_code"),
                                    std::vector<llvm::Value*>{error_code_arg});
        }
        err_lv = ir_builder.CreateICmp(
            llvm::ICmpInst::ICMP_NE, err_code, cgen_state_->llInt(0));
        auto error_bb = llvm::BasicBlock::Create(
            cgen_state_->context_, ".error_exit", query_func, new_bb);
        llvm::CallInst::Create(cgen_state_->module_->getFunction("record_error_code"),
                               std::vector<llvm::Value*>{err_code, error_code_arg},
                               "",
                               error_bb);
        llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
        llvm::ReplaceInstWithInst(&br_instr,
                                  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
        break;
      }
    }
  }
}
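
// codegenSkipDeletedOuterTableRow() guards the row function with a test of
// the outer table's "deleted" column: deleted rows return early with 0 and
// live rows continue in the is_not_deleted block.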
llvm::BasicBlock* Executor::codegenSkipDeletedOuterTableRow(
    const RelAlgExecutionUnit& ra_exe_unit,
    const CompilationOptions& co) {
  if (!co.filter_on_deleted_column) {
    return nullptr;
  }
  CHECK(!ra_exe_unit.input_descs.empty());
  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
    return nullptr;
  }
  const auto deleted_cd =
      plan_state_->getDeletedColForTable(outer_input_desc.getTableId());
  if (!deleted_cd) {
    return nullptr;
  }
  CHECK(deleted_cd->columnType.is_boolean());
  const auto deleted_expr =
      makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
                                    outer_input_desc.getTableId(),
                                    deleted_cd->columnId,
                                    outer_input_desc.getNestLevel());
  CodeGenerator code_generator(this);
  const auto is_deleted =
      code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
  const auto is_deleted_bb = llvm::BasicBlock::Create(
      cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
      cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
  cgen_state_->ir_builder_.SetInsertPoint(bb);
  return bb;
}
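
// compileBody() generates the per-row filter and aggregation code. Quals are
// split into cheap primary quals and expensive deferred quals
// (CodeGenerator::prioritizeQuals) so short-circuiting can skip the expensive
// ones once a cheap qual rejects the row.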
bool Executor::compileBody(const RelAlgExecutionUnit& ra_exe_unit,
                           GroupByAndAggregate& group_by_and_aggregate,
                           QueryMemoryDescriptor& query_mem_desc,
                           const CompilationOptions& co,
                           const GpuSharedMemoryContext& gpu_smem_context) {
  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
  llvm::Value* loop_done{nullptr};
  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
  if (cgen_state_->filter_func_) {
    if (cgen_state_->row_func_bb_->getName() == "loop_body") {
      auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
      cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
                                              row_func_entry_bb->begin());
      loop_done = cgen_state_->ir_builder_.CreateAlloca(
          get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
      cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
      cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
    }
    cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
    cgen_state_->current_func_ = cgen_state_->filter_func_;
    fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
  }

  // Generate the code for the filter.
  std::vector<Analyzer::Expr*> primary_quals;
  std::vector<Analyzer::Expr*> deferred_quals;
  bool short_circuited = CodeGenerator::prioritizeQuals(
      ra_exe_unit, primary_quals, deferred_quals, plan_state_->hoisted_filters_);
  if (short_circuited) {
    VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
            << "short-circuited and deferred " << std::to_string(deferred_quals.size())
            << " quals";
  }
  llvm::Value* filter_lv = cgen_state_->llBool(true);
  CodeGenerator code_generator(this);
  for (auto expr : primary_quals) {
    // Generate the filter for primary quals.
    auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
    filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
  }
  CHECK(filter_lv->getType()->isIntegerTy(1));
  llvm::BasicBlock* sc_false{nullptr};
  if (!deferred_quals.empty()) {
    auto sc_true = llvm::BasicBlock::Create(
        cgen_state_->context_, "sc_true", cgen_state_->current_func_);
    sc_false = llvm::BasicBlock::Create(
        cgen_state_->context_, "sc_false", cgen_state_->current_func_);
    cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
    cgen_state_->ir_builder_.SetInsertPoint(sc_false);
    if (ra_exe_unit.join_quals.empty()) {
      cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
    }
    cgen_state_->ir_builder_.SetInsertPoint(sc_true);
    filter_lv = cgen_state_->llBool(true);
  }
  for (auto expr : deferred_quals) {
    filter_lv = cgen_state_->ir_builder_.CreateAnd(
        filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
  }

  CHECK(filter_lv->getType()->isIntegerTy(1));
  auto ret = group_by_and_aggregate.codegen(
      filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);

  // Switch the code generation back from the filter function, if enabled.
  if (cgen_state_->filter_func_) {
    if (cgen_state_->row_func_bb_->getName() == "loop_body") {
      cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
      cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
    }

    cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
    cgen_state_->current_func_ = cgen_state_->row_func_;
    cgen_state_->filter_func_call_ =
        cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});

    // Create the real filter function declaration after the placeholder call
    // is emitted.
    redeclareFilterFunction();

    if (cgen_state_->row_func_bb_->getName() == "loop_body") {
      auto loop_done_true = llvm::BasicBlock::Create(
          cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
      auto loop_done_false = llvm::BasicBlock::Create(
          cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
      auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(
          loop_done->getType()->getPointerElementType(), loop_done);
      cgen_state_->ir_builder_.CreateCondBr(
          loop_done_flag, loop_done_true, loop_done_false);
      cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
      cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
      cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
      cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
    } else {
      cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
    }
  }

  return ret;
}
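
// generate_column_heads_load() emits one GEP plus load per input column from
// the byte_stream argument; the resulting col_heads are handed to the row
// function as per-column base pointers.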
std::vector<llvm::Value*> generate_column_heads_load(const int num_columns,
                                                     llvm::Value* byte_stream_arg,
                                                     llvm::IRBuilder<>& ir_builder,
                                                     llvm::LLVMContext& ctx) {
  CHECK(byte_stream_arg);
  const auto max_col_local_id = num_columns - 1;

  std::vector<llvm::Value*> col_heads;
  for (int col_id = 0; col_id <= max_col_local_id; ++col_id) {
    auto* gep = ir_builder.CreateGEP(
        byte_stream_arg->getType()->getScalarType()->getPointerElementType(),
        byte_stream_arg,
        llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), col_id));
    col_heads.emplace_back(
        ir_builder.CreateLoad(gep->getType()->getPointerElementType(), gep));
  }
  return col_heads;
}