31 #if LLVM_VERSION_MAJOR < 9
32 static_assert(
false,
"LLVM Version >= 9 is required.");
35 #include <llvm/Bitcode/BitcodeReader.h>
36 #include <llvm/Bitcode/BitcodeWriter.h>
37 #include <llvm/ExecutionEngine/MCJIT.h>
38 #include <llvm/IR/Attributes.h>
39 #include <llvm/IR/GlobalValue.h>
40 #include <llvm/IR/InstIterator.h>
41 #include <llvm/IR/IntrinsicInst.h>
42 #include <llvm/IR/Intrinsics.h>
43 #include <llvm/IR/LegacyPassManager.h>
44 #include <llvm/IR/Verifier.h>
45 #include <llvm/IRReader/IRReader.h>
46 #include <llvm/Linker/Linker.h>
47 #include <llvm/Support/Casting.h>
48 #include <llvm/Support/FileSystem.h>
49 #include <llvm/Support/FormattedStream.h>
50 #include <llvm/Support/MemoryBuffer.h>
51 #include <llvm/Support/SourceMgr.h>
52 #include <llvm/Support/TargetRegistry.h>
53 #include <llvm/Support/TargetSelect.h>
54 #include <llvm/Support/raw_os_ostream.h>
55 #include <llvm/Support/raw_ostream.h>
56 #include <llvm/Transforms/IPO.h>
57 #include <llvm/Transforms/IPO/AlwaysInliner.h>
58 #include <llvm/Transforms/IPO/PassManagerBuilder.h>
59 #include <llvm/Transforms/InstCombine/InstCombine.h>
60 #include <llvm/Transforms/Instrumentation.h>
61 #include <llvm/Transforms/Scalar.h>
62 #include <llvm/Transforms/Scalar/InstSimplifyPass.h>
63 #include <llvm/Transforms/Utils.h>
64 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
65 #include <llvm/Transforms/Utils/Cloning.h>
67 #if LLVM_VERSION_MAJOR >= 11
68 #include <llvm/Support/Host.h>
// Runtime helper modules (defined elsewhere) that get linked into
// generated code: NVIDIA libdevice bitcode and the GEOS runtime module.
81 extern std::unique_ptr<llvm::Module> g_rt_libdevice_module;
85 extern std::unique_ptr<llvm::Module> g_rt_geos_module;
87 #include <llvm/Support/DynamicLibrary.h>
// GEOS support requires the library file name to be baked in at build time.
89 #ifndef GEOS_LIBRARY_FILENAME
90 #error Configuration should include GEOS library file name
// Configured GEOS shared-library path; may be reset to a fallback at runtime
// (see load_geos_dynamic_library below).
92 std::unique_ptr<std::string> g_libgeos_so_filename(
93 new std::string(GEOS_LIBRARY_FILENAME));
// Handle to the loaded GEOS library plus the mutex that guards its
// one-time initialization.
94 static llvm::sys::DynamicLibrary geos_dynamic_library;
95 static std::mutex geos_init_mutex;
99 void load_geos_dynamic_library() {
100 std::lock_guard<std::mutex> guard(geos_init_mutex);
102 if (!geos_dynamic_library.isValid()) {
103 if (!g_libgeos_so_filename || g_libgeos_so_filename->empty()) {
104 LOG(
WARNING) <<
"Misconfigured GEOS library file name, trying 'libgeos_c.so'";
105 g_libgeos_so_filename.reset(
new std::string(
"libgeos_c.so"));
107 auto filename = *g_libgeos_so_filename;
108 std::string error_message;
109 geos_dynamic_library =
110 llvm::sys::DynamicLibrary::getPermanentLibrary(
filename.c_str(), &error_message);
111 if (!geos_dynamic_library.isValid()) {
113 std::string exception_message =
"Failed to load GEOS library: " + error_message;
114 throw std::runtime_error(exception_message.c_str());
// NOTE(review): the opening of this helper (fused line 126, the function name
// and the llvm::SMDiagnostic parameter) was dropped by the extraction; the
// visible tail takes an optional IR source string and a GPU flag, and renders
// the parse diagnostic into an exception-message string.
127 std::string src =
"",
128 const bool is_gpu =
false) {
// Prefix distinguishes NVVM (GPU) from plain LLVM IR parse failures.
129 std::string excname = (is_gpu ?
"NVVM IR ParseError: " :
"LLVM IR ParseError: ");
130 llvm::raw_string_ostream ss(excname);
// Render the diagnostic (without colors/ranges) into the message buffer.
131 parse_error.print(src.c_str(), ss,
false,
false);
// Debug-only helper macros: print the call site (function name + line) and
// then dump the module's defined functions (SHOW_DEFINED) or the full
// function list (SHOW_FUNCTIONS) via the ::show_* helpers below.
145 #define SHOW_DEFINED(MODULE) \
147 std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
148 ::show_defined(MODULE); \
151 #define SHOW_FUNCTIONS(MODULE) \
153 std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
154 ::show_functions(MODULE); \
157 template <
typename T =
void>
159 std::cout <<
"defines: ";
160 for (
auto& f : module.getFunctionList()) {
161 if (!f.isDeclaration()) {
162 std::cout << f.getName().str() <<
", ";
165 std::cout << std::endl;
168 template <
typename T =
void>
170 if (module ==
nullptr) {
171 std::cout <<
"is null" << std::endl;
177 template <
typename T =
void>
// Scans a function's call instructions and buckets callee names into
// `defined` / `undefined`, skipping names in `ignored`. NOTE(review): the
// extraction dropped several lines here (fused numbering jumps 203->205,
// 207->209, etc.); in particular the `std::string::npos` right-hand sides of
// the rfind comparisons are missing from the visible text.
196 template <
typename T =
void>
198 std::unordered_set<std::string>& defined,
199 std::unordered_set<std::string>& undefined,
200 const std::unordered_set<std::string>& ignored) {
201 for (llvm::inst_iterator I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
202 if (
auto* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
203 auto* F2 = CI->getCalledFunction();
205 auto F2name = F2->getName().str();
// Declarations with no body are candidates for "undefined" unless they are
// compiler-internal ("__"-prefixed), LLVM intrinsics ("llvm."), or ignored.
206 if (F2->isDeclaration()) {
207 if (F2name.rfind(
"__", 0) !=
209 && F2name.rfind(
"llvm.", 0) !=
211 && ignored.find(F2name) == ignored.end()
213 undefined.emplace(F2name);
// Callee has a body: record it as defined (deduplicated).
216 if (defined.find(F2name) == defined.end()) {
217 defined.emplace(F2name);
// Module-level overload: applies the per-function scan to every function
// with a body in the module.
226 template <
typename T =
void>
228 std::unordered_set<std::string>& defined,
229 std::unordered_set<std::string>& undefined,
230 const std::unordered_set<std::string>& ignored) {
231 for (
auto& F : module) {
232 if (!F.isDeclaration()) {
// Convenience overload returning (defined, undefined) as a tuple.
238 template <
typename T =
void>
239 std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>>
241 const std::unordered_set<std::string>& ignored = {}) {
242 std::unordered_set<std::string> defined, undefined;
244 return std::make_tuple(defined, undefined);
// Removes functions not in `live_funcs` whose only uses (if any) are
// self-recursive calls, i.e. code that can never be reached from a live
// root. Only compiled for CUDA or non-JIT-debug builds. NOTE(review):
// interior lines were dropped by the extraction (fused numbering jumps).
247 #if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
250 const std::unordered_set<llvm::Function*>& live_funcs) {
251 std::vector<llvm::Function*> dead_funcs;
// Live roots are never eliminated.
254 if (live_funcs.count(&F)) {
// A function is dead if every user is a call from within itself.
257 for (
auto U : F.users()) {
258 auto* C = llvm::dyn_cast<
const llvm::CallInst>(U);
259 if (!C || C->getParent()->getParent() != &F) {
265 dead_funcs.push_back(&F);
// Erase in a second pass so iteration above is not invalidated.
268 for (
auto pFn : dead_funcs) {
269 pFn->eraseFromParent();
277 bool check_module_requires_libdevice(llvm::Module* module) {
278 for (llvm::Function& F : *module) {
279 if (F.hasName() && F.getName().startswith(
"__nv_")) {
280 LOG(
INFO) <<
"Module requires linking with libdevice: " << std::string(F.getName());
284 LOG(
DEBUG1) <<
"module does not require linking against libdevice";
289 void add_intrinsics_to_module(llvm::Module* module) {
290 for (llvm::Function& F : *module) {
291 for (llvm::Instruction& I : instructions(F)) {
292 if (llvm::IntrinsicInst* ii = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
293 if (llvm::Intrinsic::isOverloaded(ii->getIntrinsicID())) {
294 llvm::Type* Tys[] = {ii->getFunctionType()->getReturnType()};
295 llvm::Function& decl_fn =
296 *llvm::Intrinsic::getDeclaration(module, ii->getIntrinsicID(), Tys);
297 ii->setCalledFunction(&decl_fn);
300 llvm::Intrinsic::getDeclaration(module, ii->getIntrinsicID());
// Populates and runs the legacy optimization pipeline over the module:
// inlining, mem2reg, instruction simplification/combining, global
// optimization, LICM and loop-strength reduction. NOTE(review): the
// function header and some interior lines (fused 313, 319, 321, 323) were
// dropped by the extraction.
310 llvm::Module* module,
311 llvm::legacy::PassManager& pass_manager,
312 const std::unordered_set<llvm::Function*>& live_funcs,
// Force-inline always_inline functions, then promote allocas to registers.
314 pass_manager.add(llvm::createAlwaysInlinerLegacyPass());
315 pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
316 pass_manager.add(llvm::createInstSimplifyLegacyPass());
317 pass_manager.add(llvm::createInstructionCombiningPass());
318 pass_manager.add(llvm::createGlobalOptimizerPass());
// Loop passes: hoist invariants, then strength-reduce induction variables.
320 pass_manager.add(llvm::createLICMPass());
322 pass_manager.add(llvm::createLoopStrengthReducePass());
324 pass_manager.run(*module);
335 : execution_engine_(execution_engine) {}
339 : execution_engine_(execution_engine) {
342 #ifdef ENABLE_INTEL_JIT_LISTENER
346 LOG(
INFO) <<
"Registered IntelJITEventListener";
348 LOG(
WARNING) <<
"This build is not Intel JIT Listener enabled. Ignoring Intel JIT "
349 "listener configuration parameter.";
350 #endif // ENABLE_INTEL_JIT_LISTENER
356 llvm::ExecutionEngine* execution_engine) {
363 std::stringstream err_ss;
364 llvm::raw_os_ostream err_os(err_ss);
365 err_os <<
"\n-----\n";
366 if (llvm::verifyFunction(*func, &err_os)) {
367 err_os <<
"\n-----\n";
368 func->print(err_os,
nullptr);
369 err_os <<
"\n-----\n";
376 auto it = cache.
find(key);
377 if (it != cache.
cend()) {
378 delete cgen_state_->module_;
379 cgen_state_->module_ = it->second.second;
380 return it->second.first;
386 std::shared_ptr<CompilationContext> compilation_context,
387 llvm::Module* module,
390 std::make_pair<std::shared_ptr<CompilationContext>, decltype(module)>(
391 std::move(compilation_context), std::move(module)));
397 llvm::Module* module) {
398 llvm::legacy::PassManager pass_manager;
399 auto cpu_target_machine = execution_engine->getTargetMachine();
400 CHECK(cpu_target_machine);
401 llvm::SmallString<256> code_str;
402 llvm::raw_svector_ostream os(code_str);
403 #if LLVM_VERSION_MAJOR >= 10
404 cpu_target_machine->addPassesToEmitFile(
405 pass_manager, os,
nullptr, llvm::CGFT_AssemblyFile);
407 cpu_target_machine->addPassesToEmitFile(
408 pass_manager, os,
nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
410 pass_manager.run(*module);
411 return "Assembly for the CPU:\n" + std::string(code_str.str()) +
"\nEnd of assembly";
417 llvm::Function* func,
418 const std::unordered_set<llvm::Function*>& live_funcs,
420 auto module = func->getParent();
422 #ifndef WITH_JIT_DEBUG
423 llvm::legacy::PassManager pass_manager;
424 optimize_ir(func, module, pass_manager, live_funcs, co);
425 #endif // WITH_JIT_DEBUG
427 auto init_err = llvm::InitializeNativeTarget();
430 llvm::InitializeAllTargetMCs();
431 llvm::InitializeNativeTargetAsmPrinter();
432 llvm::InitializeNativeTargetAsmParser();
435 std::unique_ptr<llvm::Module> owner(module);
436 llvm::EngineBuilder eb(std::move(owner));
437 eb.setErrorStr(&err_str);
438 eb.setEngineKind(llvm::EngineKind::JIT);
439 llvm::TargetOptions to;
440 to.EnableFastISel =
true;
441 eb.setTargetOptions(to);
443 eb.setOptLevel(llvm::CodeGenOpt::None);
447 CHECK(execution_engine.get());
450 execution_engine->finalizeObject();
451 return execution_engine;
455 llvm::Function* query_func,
456 llvm::Function* multifrag_query_func,
457 const std::unordered_set<llvm::Function*>& live_funcs,
459 auto module = multifrag_query_func->getParent();
462 if (cgen_state_->filter_func_) {
465 for (
const auto helper : cgen_state_->helper_functions_) {
473 if (cgen_state_->needs_geos_) {
475 load_geos_dynamic_library();
478 auto rt_geos_module_copy = llvm::CloneModule(
479 *g_rt_geos_module.get(), cgen_state_->vmap_, [](
const llvm::GlobalValue* gv) {
480 auto func = llvm::dyn_cast<llvm::Function>(gv);
484 return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
485 func->getLinkage() ==
486 llvm::GlobalValue::LinkageTypes::InternalLinkage ||
487 func->getLinkage() == llvm::GlobalValue::LinkageTypes::ExternalLinkage);
492 llvm::Linker::Flags::LinkOnlyNeeded);
494 throw std::runtime_error(
"GEOS is disabled in this build");
498 auto execution_engine =
500 auto cpu_compilation_context =
501 std::make_shared<CpuCompilationContext>(std::move(execution_engine));
502 cpu_compilation_context->setFunctionPointer(multifrag_query_func);
504 return cpu_compilation_context;
// Links a UDF module into the target module. With Flags::None, refuses to
// overwrite an already-defined runtime function (logs and throws); the UDF
// module is cloned, its data layout / target triple are aligned with the
// destination, and the clone is handed to llvm::Linker. NOTE(review): the
// function header and several interior lines were dropped by the extraction.
508 llvm::Module& module,
510 llvm::Linker::Flags flags) {
// Reject collisions: a UDF must not silently replace a runtime definition.
513 for (
auto& f : *udf_module.get()) {
514 auto func = module.getFunction(f.getName());
515 if (!(func ==
nullptr) && !f.isDeclaration() && flags == llvm::Linker::Flags::None) {
516 LOG(
ERROR) <<
" Attempt to overwrite " << f.getName().str() <<
" in "
517 << module.getModuleIdentifier() <<
" from `"
518 << udf_module->getModuleIdentifier() <<
"`" << std::endl;
519 throw std::runtime_error(
520 "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
523 VLOG(1) <<
" Adding " << f.getName().str() <<
" to "
524 << module.getModuleIdentifier() <<
" from `"
525 << udf_module->getModuleIdentifier() <<
"`" << std::endl;
// Clone so the caller's UDF module remains usable after linking consumes
// the copy (linkInModule takes ownership).
529 std::unique_ptr<llvm::Module> udf_module_copy;
531 udf_module_copy = llvm::CloneModule(*udf_module.get(), cgen_state->
vmap_);
// The clone must match the destination's layout/triple before linking.
533 udf_module_copy->setDataLayout(module.getDataLayout());
534 udf_module_copy->setTargetTriple(module.getTargetTriple());
537 llvm::Linker ld(module);
538 bool link_error =
false;
540 link_error = ld.linkInModule(std::move(udf_module_copy), flags);
543 throw std::runtime_error(
"link_udf_module: *** error linking module ***");
553 if (s ==
"int16_t") {
556 if (s ==
"int32_t") {
559 if (s ==
"int64_t") {
562 CHECK(s ==
"float" || s ==
"double");
568 for (
const std::string any_or_all : {
"any",
"all"}) {
569 for (
const std::string elem_type :
570 {
"int8_t",
"int16_t",
"int32_t",
"int64_t",
"float",
"double"}) {
571 for (
const std::string needle_type :
572 {
"int8_t",
"int16_t",
"int32_t",
"int64_t",
"float",
"double"}) {
573 for (
const std::string op_name : {
"eq",
"ne",
"lt",
"le",
"gt",
"ge"}) {
574 result += (
"declare i1 @array_" + any_or_all +
"_" + op_name +
"_" + elem_type +
586 for (
const std::string key_type : {
"int8_t",
"int16_t",
"int32_t",
"int64_t"}) {
588 result +=
"declare i64 @translate_null_key_" + key_type +
"(" + key_llvm_type +
", " +
589 key_llvm_type +
", i64);\n";
595 R
"(
declare void @llvm.dbg.declare(metadata, metadata, metadata)
declare void @llvm.dbg.value(metadata, metadata, metadata)
declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
declare i64 @get_thread_index();
declare i64 @get_block_index();
declare i32 @pos_start_impl(i32*);
declare i32 @group_buff_idx_impl();
declare i32 @pos_step_impl();
declare i8 @thread_warp_idx(i8);
declare i64* @init_shared_mem(i64*, i32);
declare i64* @init_shared_mem_nop(i64*, i32);
declare i64* @declare_dynamic_shared_memory();
declare void @write_back_nop(i64*, i64*, i32);
declare void @write_back_non_grouped_agg(i64*, i64*, i32);
declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8);
declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32);
declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32);
declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32);
declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32);
declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32);
declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32);
declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64);
declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64);
declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64);
declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64);
declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64);
declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double);
declare i64 @get_bucket_key_for_range_double(i8*, i64, double);
declare i32 @get_num_buckets_for_bounds(i8*, i32, double, double);
declare i64 @get_candidate_rows(i32*, i32, i8*, i32, double, double, i32, i64, i64*, i64, i64, i64);
declare i64 @agg_count_shared(i64*, i64);
declare i64 @agg_count_skip_val_shared(i64*, i64, i64);
declare i32 @agg_count_int32_shared(i32*, i32);
declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32);
declare i64 @agg_count_double_shared(i64*, double);
declare i64 @agg_count_double_skip_val_shared(i64*, double, double);
declare i32 @agg_count_float_shared(i32*, float);
declare i32 @agg_count_float_skip_val_shared(i32*, float, float);
declare i64 @agg_sum_shared(i64*, i64);
declare i64 @agg_sum_skip_val_shared(i64*, i64, i64);
declare i32 @agg_sum_int32_shared(i32*, i32);
declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_sum_double_shared(i64*, double);
declare void @agg_sum_double_skip_val_shared(i64*, double, double);
declare void @agg_sum_float_shared(i32*, float);
declare void @agg_sum_float_skip_val_shared(i32*, float, float);
declare void @agg_max_shared(i64*, i64);
declare void @agg_max_skip_val_shared(i64*, i64, i64);
declare void @agg_max_int32_shared(i32*, i32);
declare void @agg_max_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_max_int16_shared(i16*, i16);
declare void @agg_max_int16_skip_val_shared(i16*, i16, i16);
declare void @agg_max_int8_shared(i8*, i8);
declare void @agg_max_int8_skip_val_shared(i8*, i8, i8);
declare void @agg_max_double_shared(i64*, double);
declare void @agg_max_double_skip_val_shared(i64*, double, double);
declare void @agg_max_float_shared(i32*, float);
declare void @agg_max_float_skip_val_shared(i32*, float, float);
declare void @agg_min_shared(i64*, i64);
declare void @agg_min_skip_val_shared(i64*, i64, i64);
declare void @agg_min_int32_shared(i32*, i32);
declare void @agg_min_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_min_int16_shared(i16*, i16);
declare void @agg_min_int16_skip_val_shared(i16*, i16, i16);
declare void @agg_min_int8_shared(i8*, i8);
declare void @agg_min_int8_skip_val_shared(i8*, i8, i8);
declare void @agg_min_double_shared(i64*, double);
declare void @agg_min_double_skip_val_shared(i64*, double, double);
declare void @agg_min_float_shared(i32*, float);
declare void @agg_min_float_skip_val_shared(i32*, float, float);
declare void @agg_id_shared(i64*, i64);
declare void @agg_id_int32_shared(i32*, i32);
declare void @agg_id_int16_shared(i16*, i16);
declare void @agg_id_int8_shared(i8*, i8);
declare void @agg_id_double_shared(i64*, double);
declare void @agg_id_double_shared_slow(i64*, double*);
declare void @agg_id_float_shared(i32*, float);
declare i32 @checked_single_agg_id_shared(i64*, i64, i64);
declare i32 @checked_single_agg_id_double_shared(i64*, double, double);
declare i32 @checked_single_agg_id_double_shared_slow(i64*, double*, double);
declare i32 @checked_single_agg_id_float_shared(i32*, float, float);
declare i1 @slotEmptyKeyCAS(i64*, i64, i64);
declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32);
declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16);
declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8);
declare i64 @datetrunc_century(i64);
declare i64 @datetrunc_day(i64);
declare i64 @datetrunc_decade(i64);
declare i64 @datetrunc_hour(i64);
declare i64 @datetrunc_millennium(i64);
declare i64 @datetrunc_minute(i64);
declare i64 @datetrunc_month(i64);
declare i64 @datetrunc_quarter(i64);
declare i64 @datetrunc_quarterday(i64);
declare i64 @datetrunc_week_monday(i64);
declare i64 @datetrunc_week_sunday(i64);
declare i64 @datetrunc_week_saturday(i64);
declare i64 @datetrunc_year(i64);
declare i64 @extract_epoch(i64);
declare i64 @extract_dateepoch(i64);
declare i64 @extract_quarterday(i64);
declare i64 @extract_hour(i64);
declare i64 @extract_minute(i64);
declare i64 @extract_second(i64);
declare i64 @extract_millisecond(i64);
declare i64 @extract_microsecond(i64);
declare i64 @extract_nanosecond(i64);
declare i64 @extract_dow(i64);
declare i64 @extract_isodow(i64);
declare i64 @extract_day(i64);
declare i64 @extract_week_monday(i64);
declare i64 @extract_week_sunday(i64);
declare i64 @extract_week_saturday(i64);
declare i64 @extract_day_of_year(i64);
declare i64 @extract_month(i64);
declare i64 @extract_quarter(i64);
declare i64 @extract_year(i64);
declare i64 @DateTruncateHighPrecisionToDate(i64, i64);
declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64);
declare i64 @DateDiff(i32, i64, i64);
declare i64 @DateDiffNullable(i32, i64, i64, i64);
declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i32);
declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i32, i64);
declare i64 @DateAdd(i32, i64, i64);
declare i64 @DateAddNullable(i32, i64, i64, i64);
declare i64 @DateAddHighPrecision(i32, i64, i64, i32);
declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i32, i64);
declare i64 @string_decode(i8*, i64);
declare i32 @array_size(i8*, i64, i32);
declare i32 @array_size_nullable(i8*, i64, i32, i32);
declare i32 @fast_fixlen_array_size(i8*, i32);
declare i1 @array_is_null(i8*, i64);
declare i1 @point_coord_array_is_null(i8*, i64);
declare i8* @array_buff(i8*, i64);
declare i8* @fast_fixlen_array_buff(i8*, i64);
declare i8 @array_at_int8_t(i8*, i64, i32);
declare i16 @array_at_int16_t(i8*, i64, i32);
declare i32 @array_at_int32_t(i8*, i64, i32);
declare i64 @array_at_int64_t(i8*, i64, i32);
declare float @array_at_float(i8*, i64, i32);
declare double @array_at_double(i8*, i64, i32);
declare i8 @varlen_array_at_int8_t(i8*, i64, i32);
declare i16 @varlen_array_at_int16_t(i8*, i64, i32);
declare i32 @varlen_array_at_int32_t(i8*, i64, i32);
declare i64 @varlen_array_at_int64_t(i8*, i64, i32);
declare float @varlen_array_at_float(i8*, i64, i32);
declare double @varlen_array_at_double(i8*, i64, i32);
declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32);
declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32);
declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32);
declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32);
declare float @varlen_notnull_array_at_float(i8*, i64, i32);
declare double @varlen_notnull_array_at_double(i8*, i64, i32);
declare i8 @array_at_int8_t_checked(i8*, i64, i64, i8);
declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16);
declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32);
declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64);
declare float @array_at_float_checked(i8*, i64, i64, float);
declare double @array_at_double_checked(i8*, i64, i64, double);
declare i32 @char_length(i8*, i32);
declare i32 @char_length_nullable(i8*, i32, i32);
declare i32 @char_length_encoded(i8*, i32);
declare i32 @char_length_encoded_nullable(i8*, i32, i32);
declare i32 @key_for_string_encoded(i32);
declare i1 @sample_ratio(double, i64);
declare i1 @string_like(i8*, i32, i8*, i32, i8);
declare i1 @string_ilike(i8*, i32, i8*, i32, i8);
declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8);
declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8);
declare i1 @string_like_simple(i8*, i32, i8*, i32);
declare i1 @string_ilike_simple(i8*, i32, i8*, i32);
declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8);
declare i1 @string_lt(i8*, i32, i8*, i32);
declare i1 @string_le(i8*, i32, i8*, i32);
declare i1 @string_gt(i8*, i32, i8*, i32);
declare i1 @string_ge(i8*, i32, i8*, i32);
declare i1 @string_eq(i8*, i32, i8*, i32);
declare i1 @string_ne(i8*, i32, i8*, i32);
declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8);
declare i1 @regexp_like(i8*, i32, i8*, i32, i8);
declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8);
declare void @linear_probabilistic_count(i8*, i32, i8*, i32);
declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, i64);
declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64);
declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64);
declare void @agg_approx_median(i64*, double);
declare void @agg_approx_median_skip_val(i64*, double, i64);
declare void @record_error_code(i32, i32*);
declare i32 @get_error_code(i32*);
declare i1 @dynamic_watchdog();
declare i1 @check_interrupt();
declare void @force_sync();
declare void @sync_warp();
declare void @sync_warp_protected(i64, i64);
declare void @sync_threadblock();
declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32);
declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64);
declare i64* @get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, float, float);
declare i64* @get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double);
)" + gen_array_any_all_sigs() +
599 std::string extension_function_decls(
const std::unordered_set<std::string>& udf_decls) {
// Makes a function's IR acceptable to the NVVM/NVPTX backend by removing
// stacksave/stackrestore intrinsics, which NVPTX does not support.
// NOTE(review): fused lines 605-610 (and the trailing braces) were dropped
// by the extraction, so any attribute clearing / IR verification the
// original performed before this point is not visible here.
604 void legalize_nvvm_ir(llvm::Function* query_func) {
611 std::vector<llvm::Instruction*> stackrestore_intrinsics;
612 std::vector<llvm::Instruction*> stacksave_intrinsics;
613 for (
auto& BB : *query_func) {
614 for (llvm::Instruction& I : BB) {
615 if (
const llvm::IntrinsicInst* II = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
616 if (II->getIntrinsicID() == llvm::Intrinsic::stacksave) {
617 stacksave_intrinsics.push_back(&I);
618 }
else if (II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
619 stackrestore_intrinsics.push_back(&I);
// Erase stackrestore first: it consumes the stacksave result, so removing
// the saves first would leave dangling uses.
628 for (
auto& II : stackrestore_intrinsics) {
629 II->eraseFromParent();
631 for (
auto& II : stacksave_intrinsics) {
632 II->eraseFromParent();
// Target triple for CUDA PTX generation (64-bit NVPTX). NOTE(review): the
// enclosing function signatures (fused lines ~638-643) were dropped by the
// extraction; only the return statements remain visible.
640 return llvm::StringRef(
"nvptx64-nvidia-cuda");
// NVPTX data-layout string: 64-bit pointers plus natural alignment for
// integer, float and vector types.
644 return llvm::StringRef(
645 "e-p:64:64:64-i1:8:8-i8:8:8-"
646 "i16:16:16-i32:32:32-i64:64:64-"
647 "f32:32:32-f64:64:64-v16:16:16-"
648 "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
652 std::map<std::string, std::string>
result;
654 result.insert(std::make_pair(
"cpu_name", llvm::sys::getHostCPUName()));
655 result.insert(std::make_pair(
"cpu_triple", llvm::sys::getProcessTriple()));
657 std::make_pair(
"cpu_cores",
std::to_string(llvm::sys::getHostNumPhysicalCores())));
660 std::string null_values;
661 null_values +=
"boolean1:" +
std::to_string(serialized_null_value<bool>()) +
";";
662 null_values +=
"boolean8:" +
std::to_string(serialized_null_value<int8_t>()) +
";";
663 null_values +=
"int8:" +
std::to_string(serialized_null_value<int8_t>()) +
";";
664 null_values +=
"int16:" +
std::to_string(serialized_null_value<int16_t>()) +
";";
665 null_values +=
"int32:" +
std::to_string(serialized_null_value<int32_t>()) +
";";
666 null_values +=
"int64:" +
std::to_string(serialized_null_value<int64_t>()) +
";";
667 null_values +=
"uint8:" +
std::to_string(serialized_null_value<uint8_t>()) +
";";
668 null_values +=
"uint16:" +
std::to_string(serialized_null_value<uint16_t>()) +
";";
669 null_values +=
"uint32:" +
std::to_string(serialized_null_value<uint32_t>()) +
";";
670 null_values +=
"uint64:" +
std::to_string(serialized_null_value<uint64_t>()) +
";";
671 null_values +=
"float32:" +
std::to_string(serialized_null_value<float>()) +
";";
672 null_values +=
"float64:" +
std::to_string(serialized_null_value<double>()) +
";";
674 "Array<boolean8>:" +
std::to_string(serialized_null_value<int8_t, true>()) +
";";
676 "Array<int8>:" +
std::to_string(serialized_null_value<int8_t, true>()) +
";";
678 "Array<int16>:" +
std::to_string(serialized_null_value<int16_t, true>()) +
";";
680 "Array<int32>:" +
std::to_string(serialized_null_value<int32_t, true>()) +
";";
682 "Array<int64>:" +
std::to_string(serialized_null_value<int64_t, true>()) +
";";
684 "Array<float32>:" +
std::to_string(serialized_null_value<float, true>()) +
";";
686 "Array<float64>:" +
std::to_string(serialized_null_value<double, true>()) +
";";
688 result.insert(std::make_pair(
"null_values", null_values));
690 llvm::StringMap<bool> cpu_features;
691 if (llvm::sys::getHostCPUFeatures(cpu_features)) {
692 std::string features_str =
"";
693 for (
auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
694 features_str += (it->getValue() ?
" +" :
" -");
695 features_str += it->getKey().str();
697 result.insert(std::make_pair(
"cpu_features", features_str));
700 result.insert(std::make_pair(
"llvm_version",
707 int device_count = 0;
711 char device_name[256];
712 int major = 0, minor = 0;
717 &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
719 &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
722 result.insert(std::make_pair(
"gpu_name", device_name));
723 result.insert(std::make_pair(
"gpu_count",
std::to_string(device_count)));
724 result.insert(std::make_pair(
"gpu_compute_capability",
728 result.insert(std::make_pair(
"gpu_driver",
739 llvm::Function* func,
740 llvm::Function* wrapper_func,
741 const std::unordered_set<llvm::Function*>& live_funcs,
743 const GPUTarget& gpu_target) {
745 auto module = func->getParent();
766 CHECK(gpu_target.cgen_state->module_ == module);
767 module->setDataLayout(
768 "e-p:64:64:64-i1:8:8-i8:8:8-"
769 "i16:16:16-i32:32:32-i64:64:64-"
770 "f32:32:32-f64:64:64-v16:16:16-"
771 "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
772 module->setTargetTriple(
"nvptx64-nvidia-cuda");
773 CHECK(gpu_target.nvptx_target_machine);
774 auto pass_manager_builder = llvm::PassManagerBuilder();
776 pass_manager_builder.OptLevel = 0;
777 llvm::legacy::PassManager module_pass_manager;
778 pass_manager_builder.populateModulePassManager(module_pass_manager);
780 bool requires_libdevice = check_module_requires_libdevice(module);
782 if (requires_libdevice) {
784 gpu_target.nvptx_target_machine->adjustPassManager(pass_manager_builder);
785 llvm::legacy::FunctionPassManager FPM(module);
786 pass_manager_builder.populateFunctionPassManager(FPM);
789 FPM.doInitialization();
790 for (
auto& F : *module) {
793 FPM.doFinalization();
797 optimize_ir(func, module, module_pass_manager, live_funcs, co);
798 legalize_nvvm_ir(func);
800 std::stringstream ss;
801 llvm::raw_os_ostream os(ss);
803 llvm::LLVMContext& ctx = module->getContext();
805 llvm::NamedMDNode* md = module->getOrInsertNamedMetadata(
"nvvm.annotations");
807 llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
808 llvm::MDString::get(ctx,
"kernel"),
809 llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
810 llvm::Type::getInt32Ty(ctx), 1))};
813 md->addOperand(llvm::MDNode::get(ctx, md_vals));
815 std::unordered_set<llvm::Function*> roots{wrapper_func, func};
816 if (gpu_target.row_func_not_inlined) {
818 roots.insert(gpu_target.cgen_state->row_func_);
819 if (gpu_target.cgen_state->filter_func_) {
820 roots.insert(gpu_target.cgen_state->filter_func_);
825 for (
auto f : gpu_target.cgen_state->helper_functions_) {
830 std::unordered_set<std::string> udf_declarations;
833 llvm::Function* udf_function = module->getFunction(f.getName());
836 legalize_nvvm_ir(udf_function);
837 roots.insert(udf_function);
841 if (f.isDeclaration()) {
842 udf_declarations.insert(f.getName().str());
850 llvm::Function* udf_function = module->getFunction(f.getName());
852 legalize_nvvm_ir(udf_function);
853 roots.insert(udf_function);
857 if (f.isDeclaration()) {
858 udf_declarations.insert(f.getName().str());
864 std::vector<llvm::Function*> rt_funcs;
865 for (
auto& Fn : *module) {
866 if (roots.count(&Fn)) {
869 rt_funcs.push_back(&Fn);
871 for (
auto& pFn : rt_funcs) {
872 pFn->removeFromParent();
875 if (requires_libdevice) {
876 add_intrinsics_to_module(module);
879 module->print(os,
nullptr);
882 for (
auto& pFn : rt_funcs) {
883 module->getFunctionList().push_back(pFn);
885 module->eraseNamedMetadata(md);
887 auto cuda_llir = ss.str() + cuda_rt_decls + extension_function_decls(udf_declarations);
891 cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state->context_);
893 LOG(
WARNING) <<
"Failed to generate PTX: " << e.what()
894 <<
". Switching to CPU execution target.";
897 LOG(
PTX) <<
"PTX for the GPU:\n" << ptx <<
"\nEnd of PTX";
899 auto cubin_result =
ptx_to_cubin(ptx, gpu_target.block_size, gpu_target.cuda_mgr);
900 auto& option_keys = cubin_result.option_keys;
901 auto& option_values = cubin_result.option_values;
902 auto cubin = cubin_result.cubin;
903 auto link_state = cubin_result.link_state;
904 const auto num_options = option_keys.size();
906 auto func_name = wrapper_func->getName().str();
907 auto gpu_compilation_context = std::make_shared<GpuCompilationContext>();
908 for (
int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
910 gpu_compilation_context->addDeviceCode(
911 std::make_unique<GpuDeviceCompilationContext>(cubin,
921 return gpu_compilation_context;
// NOTE(review): fragment of a GPU code-generation entry point. The function's
// signature line and many interior statements are absent from this chunk
// (original line numbers jump); code kept byte-identical, comments only.
928 llvm::Function* query_func,
929 llvm::Function* multifrag_query_func,
930 std::unordered_set<llvm::Function*>& live_funcs,
931 const bool no_inline,
935 auto module = multifrag_query_func->getParent();
940 if (cgen_state_->filter_func_) {
943 for (
const auto helper : cgen_state_->helper_functions_) {
// Scan row_func_ for calls that force it to remain non-inlined
// ("array_size" / "linear_probabilistic_count").
951 bool row_func_not_inlined =
false;
953 for (
auto it = llvm::inst_begin(cgen_state_->row_func_),
954 e = llvm::inst_end(cgen_state_->row_func_);
957 if (llvm::isa<llvm::CallInst>(*it)) {
958 auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
959 if (get_gv_call.getCalledFunction()->getName() ==
"array_size" ||
960 get_gv_call.getCalledFunction()->getName() ==
"linear_probabilistic_count") {
962 row_func_not_inlined =
true;
969 initializeNVPTXBackend();
974 row_func_not_inlined};
975 std::shared_ptr<GpuCompilationContext> compilation_context;
// If the module needs libdevice, it must have been loaded at startup
// (g_rt_libdevice_module); the linker call below is partially elided here.
977 if (check_module_requires_libdevice(module)) {
978 if (g_rt_libdevice_module ==
nullptr) {
980 throw std::runtime_error(
981 "libdevice library is not available but required by the UDF module");
988 llvm::Linker::Flags::OverrideFromSrc);
// Per NVVM docs, enable flush-to-zero for f32 via module flag + per-function attr.
991 module->addModuleFlag(llvm::Module::Override,
"nvvm-reflect-ftz", (
int)1);
992 for (llvm::Function& fn : *module) {
993 fn.addFnAttr(
"nvptx-f32ftz",
"true");
999 query_func, multifrag_query_func, live_funcs, co, gpu_target);
// On CUDA OOM, evict part of the GPU code cache and retry once
// (retry call partially elided from this chunk).
1001 }
catch (CudaMgr_Namespace::CudaErrorException& cuda_error) {
1002 if (cuda_error.getStatus() == CUDA_ERROR_OUT_OF_MEMORY) {
1005 LOG(
WARNING) <<
"Failed to allocate GPU memory for generated code. Evicting "
1007 <<
"% of GPU code cache and re-trying.";
1010 query_func, multifrag_query_func, live_funcs, co, gpu_target);
1016 CHECK(compilation_context);
1017 return compilation_context;
// NOTE(review): fragments of PTX generation and NVPTX backend initialization;
// signature lines and several interior lines are missing from this chunk.
// Code kept byte-identical, comments only.
1024 llvm::TargetMachine* nvptx_target_machine,
1025 llvm::LLVMContext& context) {
1026 auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir,
"",
false);
1028 llvm::SMDiagnostic parse_error;
1030 auto module = llvm::parseIR(mem_buff->getMemBufferRef(), parse_error, context);
1032 LOG(
IR) <<
"CodeGenerator::generatePTX:NVVM IR:\n" << cuda_llir <<
"\nEnd of NNVM IR";
// Emit PTX assembly into an in-memory buffer via the legacy pass manager.
1036 llvm::SmallString<256> code_str;
1037 llvm::raw_svector_ostream formatted_os(code_str);
1038 CHECK(nvptx_target_machine);
1040 llvm::legacy::PassManager ptxgen_pm;
1041 module->setDataLayout(nvptx_target_machine->createDataLayout());
// LLVM 10 moved CGFT_AssemblyFile out of TargetMachine into the llvm namespace.
1043 #if LLVM_VERSION_MAJOR >= 10
1044 nvptx_target_machine->addPassesToEmitFile(
1045 ptxgen_pm, formatted_os,
nullptr, llvm::CGFT_AssemblyFile);
1047 nvptx_target_machine->addPassesToEmitFile(
1048 ptxgen_pm, formatted_os,
nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
1050 ptxgen_pm.run(*module);
1053 #if LLVM_VERSION_MAJOR >= 11
1054 return std::string(code_str);
1056 return code_str.str();
// --- NVPTX target machine construction (separate function; name line elided). ---
1062 llvm::InitializeAllTargets();
1063 llvm::InitializeAllTargetMCs();
1064 llvm::InitializeAllAsmPrinters();
1066 auto target = llvm::TargetRegistry::lookupTarget(
"nvptx64", err);
1070 return std::unique_ptr<llvm::TargetMachine>(
1071 target->createTargetMachine(
"nvptx64-nvidia-cuda",
1074 llvm::TargetOptions(),
1075 llvm::Reloc::Static));
// --- Executor-side wrappers (fragments); cuda_mgr must exist for GPU codegen. ---
1080 cuda_llir, nvptx_target_machine_.get(), cgen_state_->context_);
1084 if (nvptx_target_machine_) {
1087 const auto cuda_mgr =
catalog_->getDataMgr().getCudaMgr();
1088 LOG_IF(
FATAL, cuda_mgr ==
nullptr) <<
"No CudaMgr instantiated, unable to check device "
1089 "architecture or generate code for nvidia GPUs.";
// NOTE(review): predicate returning true for well-known generated/runtime
// function names (query stubs, fixed-width decoders, error-code helpers).
// Enclosing signature is elided from this chunk; code byte-identical.
1097 return func->getName() ==
"query_stub_hoisted_literals" ||
1098 func->getName() ==
"multifrag_query_hoisted_literals" ||
1099 func->getName() ==
"query_stub" || func->getName() ==
"multifrag_query" ||
1100 func->getName() ==
"fixed_width_int_decode" ||
1101 func->getName() ==
"fixed_width_unsigned_decode" ||
1102 func->getName() ==
"diff_fixed_width_int_decode" ||
1103 func->getName() ==
"fixed_width_double_decode" ||
1104 func->getName() ==
"fixed_width_float_decode" ||
1105 func->getName() ==
"fixed_width_small_date_decode" ||
1106 func->getName() ==
"record_error_code" || func->getName() ==
"get_error_code";
// --- Runtime template module loader (fragment): parse RuntimeFunctions.bc
// into an llvm::Module; CHECKs assume the bundled bitcode is always valid.
1110 llvm::SMDiagnostic err;
1113 "/QueryEngine/RuntimeFunctions.bc");
1114 CHECK(!buffer_or_error.getError());
1115 llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1117 auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1118 CHECK(!owner.takeError());
1119 auto module = owner.get().release();
// Loads NVIDIA's libdevice bitcode (libdevice.10.bc) from the CUDA toolkit
// tree into an llvm::Module. Missing libdevice only degrades UDF support
// (warning), it is not fatal. NOTE(review): the tail of this function
// (return path) is elided from this chunk; code kept byte-identical.
1126 llvm::Module* read_libdevice_module(llvm::LLVMContext& context) {
1127 llvm::SMDiagnostic err;
// `env` presumably holds the CUDA installation root — set earlier (elided line).
1130 boost::filesystem::path cuda_path{env};
1131 cuda_path /=
"nvvm";
1132 cuda_path /=
"libdevice";
1133 cuda_path /=
"libdevice.10.bc";
1135 if (!boost::filesystem::exists(cuda_path)) {
1136 LOG(
WARNING) <<
"Could not find CUDA libdevice; support for some UDF "
1137 "functions might not be available.";
1141 auto buffer_or_error = llvm::MemoryBuffer::getFile(cuda_path.c_str());
1142 CHECK(!buffer_or_error.getError());
1143 llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1145 auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1146 CHECK(!owner.takeError());
1147 auto module = owner.get().release();
// Loads the bundled GEOS runtime bitcode (GeosRuntime.bc) into an llvm::Module.
// NOTE(review): the return path is elided from this chunk; code byte-identical.
1155 llvm::Module* read_geos_module(llvm::LLVMContext& context) {
1156 llvm::SMDiagnostic err;
1159 "/QueryEngine/GeosRuntime.bc");
1160 CHECK(!buffer_or_error.getError());
1161 llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1163 auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1164 CHECK(!owner.takeError());
1165 auto module = owner.get().release();
// NOTE(review): fragment — rewrites calls to a placeholder position function
// (pos_fn_name, declared on an elided line) inside query_func into calls to
// the "<pos_fn_name>_impl" implementation, optionally threading the
// "error_code" argument for resumable execution. Code byte-identical.
1175 const bool use_resume_param,
1176 llvm::Function* query_func,
1177 llvm::Module* module) {
1178 for (
auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
1180 if (!llvm::isa<llvm::CallInst>(*it)) {
1183 auto& pos_call = llvm::cast<llvm::CallInst>(*it);
1184 if (std::string(pos_call.getCalledFunction()->getName()) == pos_fn_name) {
1185 if (use_resume_param) {
1186 const auto error_code_arg =
get_arg_by_name(query_func,
"error_code");
1187 llvm::ReplaceInstWithInst(
1189 llvm::CallInst::Create(module->getFunction(pos_fn_name +
"_impl"),
1192 llvm::ReplaceInstWithInst(
1194 llvm::CallInst::Create(module->getFunction(pos_fn_name +
"_impl")));
// NOTE(review): fragment — assigns human-readable names to the row function's
// arguments, in positional order (out buffers / group-by bookkeeping, then
// agg_init_val, pos, fragment offsets, optional literals, input columns,
// join_hash_tables). Argument-iterator increments between setName calls are
// on elided lines. Code kept byte-identical.
1202 const size_t in_col_count,
1203 const size_t agg_col_count,
1204 const bool hoist_literals) {
1205 auto arg_it = row_func->arg_begin();
1207 if (agg_col_count) {
1208 for (
size_t i = 0; i < agg_col_count; ++i) {
1209 arg_it->setName(
"out");
// Group-by path: a single buffer plus matched-row counters instead of
// per-aggregate out pointers.
1213 arg_it->setName(
"group_by_buff");
1215 arg_it->setName(
"crt_matched");
1217 arg_it->setName(
"total_matched");
1219 arg_it->setName(
"old_total_matched");
1221 arg_it->setName(
"max_matched");
1225 arg_it->setName(
"agg_init_val");
1228 arg_it->setName(
"pos");
1231 arg_it->setName(
"frag_row_off");
1234 arg_it->setName(
"num_rows_per_scan");
1237 if (hoist_literals) {
1238 arg_it->setName(
"literals");
1242 for (
size_t i = 0; i < in_col_count; ++i) {
1247 arg_it->setName(
"join_hash_tables");
// NOTE(review): fragment — builds the llvm::FunctionType for "row_func"
// (i32 return) whose parameter list mirrors the naming order in the
// argument-naming helper above, then creates the function with external
// linkage. Several lines elided; code kept byte-identical.
1251 const size_t agg_col_count,
1252 const bool hoist_literals,
1253 llvm::Module* module,
1254 llvm::LLVMContext& context) {
1255 std::vector<llvm::Type*> row_process_arg_types;
1257 if (agg_col_count) {
1259 for (
size_t i = 0; i < agg_col_count; ++i) {
1260 row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
// Group-by path: i64* buffer plus four i32* matched-row counters.
1264 row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1266 row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1268 row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1270 row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1272 row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1276 row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1279 row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));
1282 row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1285 row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1288 if (hoist_literals) {
1289 row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
1293 for (
size_t i = 0; i < in_col_count; ++i) {
1294 row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
1298 row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1302 llvm::FunctionType::get(
get_int_type(32, context), row_process_arg_types,
false);
1305 llvm::Function::Create(ft, llvm::Function::ExternalLinkage,
"row_func", module);
// NOTE(review): fragment — collects every call to the stub named query_fname
// inside multifrag_query_func, then replaces each stub call with a direct
// call to query_func forwarding the same operands. Two-pass structure avoids
// mutating the instruction list while iterating it. Code byte-identical.
1315 const std::string& query_fname,
1316 llvm::Function* multifrag_query_func,
1317 llvm::Module* module) {
1318 std::vector<llvm::CallInst*> query_stubs;
1319 for (
auto it = llvm::inst_begin(multifrag_query_func),
1320 e = llvm::inst_end(multifrag_query_func);
1323 if (!llvm::isa<llvm::CallInst>(*it)) {
1326 auto& query_call = llvm::cast<llvm::CallInst>(*it);
1327 if (std::string(query_call.getCalledFunction()->getName()) == query_fname) {
1328 query_stubs.push_back(&query_call);
1331 for (
auto& S : query_stubs) {
1332 std::vector<llvm::Value*>
args;
1333 for (
size_t i = 0; i < S->getNumArgOperands(); ++i) {
1334 args.push_back(S->getArgOperand(i));
1336 llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args,
""));
// Maps each target expression to the runtime aggregate-function name(s)
// ("agg_id", "agg_sum", "agg_count_distinct", ...) used by generated code.
// NOTE(review): the switch/case labels over the aggregate kind and the final
// return are on elided lines in this chunk; code kept byte-identical.
std::vector<std::string>
get_agg_fnames(
const std::vector<Analyzer::Expr*>& target_exprs,
1341 const bool is_group_by) {
1342 std::vector<std::string>
result;
1343 for (
size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
1344 ++target_idx, ++agg_col_idx) {
1345 const auto target_expr = target_exprs[target_idx];
1347 const auto target_type_info = target_expr->get_type_info();
1349 const bool is_varlen =
1350 (target_type_info.is_string() &&
1352 target_type_info.is_array();
// Non-aggregate targets (and SAMPLE) are materialized via "agg_id";
// geometry targets need one "agg_id" per physical coordinate column.
1353 if (!agg_expr || agg_expr->get_aggtype() ==
kSAMPLE) {
1354 result.emplace_back(target_type_info.is_fp() ?
"agg_id_double" :
"agg_id");
1356 result.emplace_back(
"agg_id");
1358 if (target_type_info.is_geometry()) {
1359 result.emplace_back(
"agg_id");
1360 for (
auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
1361 result.emplace_back(
"agg_id");
// For COUNT the target type itself drives the choice; otherwise the
// aggregate's argument type does.
1367 const auto& agg_type_info =
1368 agg_type !=
kCOUNT ? agg_expr->get_arg()->get_type_info() : target_type_info;
// AVG is lowered as a sum + count pair.
1371 if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1372 !agg_type_info.is_fp()) {
1373 throw std::runtime_error(
"AVG is only valid on integer and floating point");
1375 result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1377 :
"agg_sum_double");
1378 result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1380 :
"agg_count_double");
1384 if (agg_type_info.is_string() || agg_type_info.is_array() ||
1385 agg_type_info.is_geometry()) {
1386 throw std::runtime_error(
1387 "MIN on strings, arrays or geospatial types not supported yet");
1389 result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1391 :
"agg_min_double");
1395 if (agg_type_info.is_string() || agg_type_info.is_array() ||
1396 agg_type_info.is_geometry()) {
1397 throw std::runtime_error(
1398 "MAX on strings, arrays or geospatial types not supported yet");
1400 result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1402 :
"agg_max_double");
1406 if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1407 !agg_type_info.is_fp()) {
1408 throw std::runtime_error(
"SUM is only valid on integer and floating point");
1410 result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1412 :
"agg_sum_double");
1416 result.emplace_back(agg_expr->get_is_distinct() ?
"agg_count_distinct"
1420 result.emplace_back(agg_type_info.is_fp() ?
"agg_id_double" :
"agg_id");
1425 result.emplace_back(agg_type_info.is_fp() ?
"agg_id_double" :
"agg_id");
1429 result.emplace_back(
"agg_approximate_count_distinct");
1432 result.emplace_back(
"agg_approx_median");
// NOTE(review): fragments of the global libdevice module definition and the
// load-time / runtime UDF IR loaders (CPU and GPU). GPU variants validate the
// module's target triple is NVPTX and disable the module otherwise. Many
// lines elided; code kept byte-identical, comments only.
1450 std::unique_ptr<llvm::Module> g_rt_libdevice_module(
1463 llvm::SMDiagnostic parse_error;
1465 llvm::StringRef file_name_arg(udf_ir_filename);
1473 if (!gpu_triple.isNVPTX()) {
1475 <<
"Expected triple nvptx64-nvidia-cuda for NVVM IR of loadtime UDFs but got "
1476 << gpu_triple.str() <<
". Disabling the NVVM IR module.";
// --- load-time CPU UDF IR (fragment) ---
1482 llvm::SMDiagnostic parse_error;
1484 llvm::StringRef file_name_arg(udf_ir_filename);
// --- runtime GPU UDF IR from an in-memory string (fragment) ---
1493 llvm::SMDiagnostic parse_error;
1496 std::make_unique<llvm::MemoryBufferRef>(udf_ir_string,
"Runtime UDF for GPU");
1499 if (!rt_udf_gpu_module) {
1500 LOG(
IR) <<
"read_rt_udf_gpu_module:NVVM IR:\n" << udf_ir_string <<
"\nEnd of NNVM IR";
1504 llvm::Triple gpu_triple(rt_udf_gpu_module->getTargetTriple());
1505 if (!gpu_triple.isNVPTX()) {
1506 LOG(
IR) <<
"read_rt_udf_gpu_module:NVVM IR:\n" << udf_ir_string <<
"\nEnd of NNVM IR";
1507 LOG(
WARNING) <<
"Expected triple nvptx64-nvidia-cuda for NVVM IR but got "
1509 <<
". Executing runtime UDFs on GPU will be disabled.";
1510 rt_udf_gpu_module =
nullptr;
// --- runtime CPU UDF IR from an in-memory string (fragment) ---
1516 llvm::SMDiagnostic parse_error;
1519 std::make_unique<llvm::MemoryBufferRef>(udf_ir_string,
"Runtime UDF for CPU");
1522 if (!rt_udf_cpu_module) {
1523 LOG(
IR) <<
"read_rt_udf_cpu_module:LLVM IR:\n" << udf_ir_string <<
"\nEnd of LLVM IR";
// NOTE(review): fragment — computes the live-function set (roots, leaves,
// the shared-memory nop stubs, and every function directly called from a
// root) and demotes all other defined functions to internal linkage so dead
// code can be stripped. Signature line elided; code byte-identical.
1529 llvm::Module& module,
1530 const std::vector<llvm::Function*>& roots,
1531 const std::vector<llvm::Function*>& leaves) {
1532 std::unordered_set<llvm::Function*> live_funcs;
1533 live_funcs.insert(roots.begin(), roots.end());
1534 live_funcs.insert(leaves.begin(), leaves.end());
1536 if (
auto F = module.getFunction(
"init_shared_mem_nop")) {
1537 live_funcs.insert(F);
1539 if (
auto F = module.getFunction(
"write_back_nop")) {
1540 live_funcs.insert(F);
// Only direct callees of roots are collected — one level deep, not transitive.
1543 for (
const llvm::Function* F : roots) {
1544 for (
const llvm::BasicBlock& BB : *F) {
1545 for (
const llvm::Instruction& I : BB) {
1546 if (
const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
1547 live_funcs.insert(CI->getCalledFunction());
1553 for (llvm::Function& F : module) {
1554 if (!live_funcs.count(&F) && !F.isDeclaration()) {
1555 F.setLinkage(llvm::GlobalValue::InternalLinkage);
// NOTE(review): template helper fragment — searches a function's basic blocks
// (optionally only the block named bb_name) for an instruction of type
// InstType whose SSA name equals variable_name. The signature line, the
// result assignment, and the return are elided; code byte-identical.
1565 template <
typename InstType>
1567 std::string bb_name,
1568 std::string variable_name) {
1569 llvm::Value* result =
nullptr;
1570 if (func ==
nullptr || variable_name.empty()) {
1573 bool is_found =
false;
1574 for (
auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
1575 if (!bb_name.empty() && bb_it->getName() != bb_name) {
1578 for (
auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
1579 if (llvm::isa<InstType>(*inst_it)) {
1580 if (inst_it->getName() == variable_name) {
// NOTE(review): fragment of the error-check control-flow instrumentation.
// It splits the basic block after the "row_process" call, optionally inserts
// a dynamic-watchdog or runtime-interrupt check block, merges error values
// through a PHI, and branches to an ".error_exit" block that records the
// error code. Many interior lines are elided; code kept byte-identical.
1593 llvm::Function* query_func,
1594 bool run_with_dynamic_watchdog,
1595 bool run_with_allowing_runtime_interrupt,
1597 const std::vector<InputTableInfo>& input_table_infos) {
// Watchdog and runtime interrupt are mutually exclusive; watchdog wins.
1603 if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
1606 run_with_allowing_runtime_interrupt =
false;
1611 mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
1612 if (current_query_session_.empty()) {
1613 run_with_allowing_runtime_interrupt =
false;
1617 llvm::Value* row_count =
nullptr;
1618 if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1621 find_variable_in_basic_block<llvm::LoadInst>(query_func,
".entry",
"row_count");
1624 bool done_splitting =
false;
1625 for (
auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1627 llvm::Value* pos =
nullptr;
1628 for (
auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1629 if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1630 llvm::isa<llvm::PHINode>(*inst_it)) {
1631 if (inst_it->getName() ==
"pos") {
1636 if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1639 auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
1640 if (std::string(row_func_call.getCalledFunction()->getName()) ==
"row_process") {
1641 auto next_inst_it = inst_it;
1643 auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1644 auto& br_instr = bb_it->back();
1645 llvm::IRBuilder<> ir_builder(&br_instr);
1646 llvm::Value* err_lv = &*inst_it;
1647 llvm::Value* err_lv_returned_from_row_func =
nullptr;
1648 if (run_with_dynamic_watchdog) {
1650 llvm::Value* call_watchdog_lv =
nullptr;
// Non-power-of-two block sizes need SRem; power-of-two uses the cheaper And.
1656 auto crit_edge_rem =
1657 (blockSize() & (blockSize() - 1))
1658 ? ir_builder.CreateSRem(
1660 cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1661 : ir_builder.CreateAnd(
1663 cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1664 auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1665 crit_edge_threshold->setName(
"crit_edge_threshold");
1670 ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
// CPU-style path: check the watchdog every 64 positions (pos & 0x3f == 0).
1673 auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1674 call_watchdog_lv = ir_builder.CreateICmp(
1675 llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1677 CHECK(call_watchdog_lv);
1678 auto error_check_bb = bb_it->splitBasicBlock(
1679 llvm::BasicBlock::iterator(br_instr),
".error_check");
1680 auto& watchdog_br_instr = bb_it->back();
1682 auto watchdog_check_bb = llvm::BasicBlock::Create(
1683 cgen_state_->context_,
".watchdog_check", query_func, error_check_bb);
1684 llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1685 auto detected_timeout = watchdog_ir_builder.CreateCall(
1686 cgen_state_->module_->getFunction(
"dynamic_watchdog"), {});
1687 auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1689 watchdog_ir_builder.CreateBr(error_check_bb);
1691 llvm::ReplaceInstWithInst(
1693 llvm::BranchInst::Create(
1694 watchdog_check_bb, error_check_bb, call_watchdog_lv));
// Merge the row_func error value with the watchdog outcome.
1695 ir_builder.SetInsertPoint(&br_instr);
1696 auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1698 unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1699 unified_err_lv->addIncoming(err_lv, &*bb_it);
1700 err_lv = unified_err_lv;
1701 }
else if (run_with_allowing_runtime_interrupt) {
1703 llvm::Value* call_check_interrupt_lv =
nullptr;
// Interrupt checking frequency is derived from the outer table's tuple
// count so the check cost stays roughly constant per query.
1714 int total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
1715 uint64_t interrupt_checking_freq = 32;
1719 if (!input_table_infos.empty()) {
1720 const auto& outer_table_info = *input_table_infos.begin();
1721 auto num_outer_table_tuples = outer_table_info.info.getNumTuples();
1722 if (outer_table_info.table_id < 0) {
1723 auto* rs = (*outer_table_info.info.fragments.begin()).resultSet;
1725 num_outer_table_tuples = rs->entryCount();
1727 auto num_frags = outer_table_info.info.fragments.size();
1728 if (num_frags > 0) {
1729 num_outer_table_tuples =
1730 outer_table_info.info.fragments.begin()->getNumTuples();
1733 if (num_outer_table_tuples > 0) {
1740 auto max_inc = uint64_t(
1741 floor(num_outer_table_tuples / (gridSize() * blockSize() * 2)));
1747 auto calibrated_inc = uint64_t(floor(max_inc * (1 - freq_control_knob)));
1748 interrupt_checking_freq =
// Clamp the frequency into [8, max_inc/2].
1753 if (interrupt_checking_freq > max_inc) {
1754 interrupt_checking_freq = max_inc / 2;
1756 if (interrupt_checking_freq < 8) {
1759 interrupt_checking_freq = 8;
1763 VLOG(1) <<
"Set the running query interrupt checking frequency: "
1764 << interrupt_checking_freq;
1766 llvm::Value* pos_shifted_per_iteration =
1767 ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
1768 auto interrupt_predicate =
1769 ir_builder.CreateAnd(pos_shifted_per_iteration, interrupt_checking_freq);
1770 call_check_interrupt_lv =
1771 ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1772 interrupt_predicate,
1773 cgen_state_->llInt(int64_t(0LL)));
1776 auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1777 call_check_interrupt_lv =
1778 ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1779 interrupt_predicate,
1780 cgen_state_->llInt(int64_t(0LL)));
1782 CHECK(call_check_interrupt_lv);
1783 auto error_check_bb = bb_it->splitBasicBlock(
1784 llvm::BasicBlock::iterator(br_instr),
".error_check");
1785 auto& check_interrupt_br_instr = bb_it->back();
1787 auto interrupt_check_bb = llvm::BasicBlock::Create(
1788 cgen_state_->context_,
".interrupt_check", query_func, error_check_bb);
1789 llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
1790 auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
1791 cgen_state_->module_->getFunction(
"check_interrupt"), {});
1792 auto interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
1794 interrupt_checker_ir_builder.CreateBr(error_check_bb);
1796 llvm::ReplaceInstWithInst(
1797 &check_interrupt_br_instr,
1798 llvm::BranchInst::Create(
1799 interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
1800 ir_builder.SetInsertPoint(&br_instr);
1801 auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1803 unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
1804 unified_err_lv->addIncoming(err_lv, &*bb_it);
1805 err_lv = unified_err_lv;
1807 if (!err_lv_returned_from_row_func) {
1808 err_lv_returned_from_row_func = err_lv;
1814 err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1818 err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
1820 cgen_state_->llInt(static_cast<int32_t>(0)));
// On error, branch to a block that records the error code and returns.
1822 auto error_bb = llvm::BasicBlock::Create(
1823 cgen_state_->context_,
".error_exit", query_func, new_bb);
1824 const auto error_code_arg =
get_arg_by_name(query_func,
"error_code");
1825 llvm::CallInst::Create(
1826 cgen_state_->module_->getFunction(
"record_error_code"),
1827 std::vector<llvm::Value*>{err_lv_returned_from_row_func, error_code_arg},
1830 llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
1831 llvm::ReplaceInstWithInst(&br_instr,
1832 llvm::BranchInst::Create(error_bb, new_bb, err_lv));
1833 done_splitting =
true;
1838 CHECK(done_splitting);
// NOTE(review): fragment of the hoisted-literals rewrite. It builds
// "row_func_hoisted_literals" (and, when a filter function exists,
// "filter_func_hoisted_literals") whose parameter lists append one argument
// per hoisted literal load, splices the old bodies in, redirects the
// "__placeholder__literal_" instructions to the new arguments, and returns
// the hoisted literal values. Interior lines elided; code byte-identical.
1844 std::vector<llvm::Value*> hoisted_literals;
1848 std::vector<llvm::Type*> row_process_arg_types;
1850 for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1851 E = cgen_state_->row_func_->arg_end();
1854 row_process_arg_types.push_back(I->getType());
// Append one parameter per literal load recorded for the query function.
1857 for (
auto& element : cgen_state_->query_func_literal_loads_) {
1858 for (
auto value : element.second) {
1859 row_process_arg_types.push_back(value->getType());
1863 auto ft = llvm::FunctionType::get(
1864 get_int_type(32, cgen_state_->context_), row_process_arg_types,
false);
1865 auto row_func_with_hoisted_literals =
1866 llvm::Function::Create(ft,
1867 llvm::Function::ExternalLinkage,
1868 "row_func_hoisted_literals",
1869 cgen_state_->row_func_->getParent());
1871 auto row_func_arg_it = row_func_with_hoisted_literals->arg_begin();
1872 for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1873 E = cgen_state_->row_func_->arg_end();
1877 row_func_arg_it->setName(I->getName());
1882 decltype(row_func_with_hoisted_literals) filter_func_with_hoisted_literals{
nullptr};
1883 decltype(row_func_arg_it) filter_func_arg_it{
nullptr};
1884 if (cgen_state_->filter_func_) {
1887 std::vector<llvm::Type*> filter_func_arg_types;
1889 for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
1890 E = cgen_state_->filter_func_->arg_end();
1893 filter_func_arg_types.push_back(I->getType());
1896 for (
auto& element : cgen_state_->query_func_literal_loads_) {
1897 for (
auto value : element.second) {
1898 filter_func_arg_types.push_back(value->getType());
1902 auto ft2 = llvm::FunctionType::get(
1903 get_int_type(32, cgen_state_->context_), filter_func_arg_types,
false);
1904 filter_func_with_hoisted_literals =
1905 llvm::Function::Create(ft2,
1906 llvm::Function::ExternalLinkage,
1907 "filter_func_hoisted_literals",
1908 cgen_state_->filter_func_->getParent());
1910 filter_func_arg_it = filter_func_with_hoisted_literals->arg_begin();
1911 for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
1912 E = cgen_state_->filter_func_->arg_end();
1916 filter_func_arg_it->setName(I->getName());
1918 ++filter_func_arg_it;
// Map each literal (keyed by buffer offset) to its new function argument(s).
1922 std::unordered_map<int, std::vector<llvm::Value*>>
1923 query_func_literal_loads_function_arguments,
1924 query_func_literal_loads_function_arguments2;
1926 for (
auto& element : cgen_state_->query_func_literal_loads_) {
1927 std::vector<llvm::Value*> argument_values, argument_values2;
1929 for (
auto value : element.second) {
1930 hoisted_literals.push_back(value);
1931 argument_values.push_back(&*row_func_arg_it);
1932 if (cgen_state_->filter_func_) {
1933 argument_values2.push_back(&*filter_func_arg_it);
1934 cgen_state_->filter_func_args_[&*row_func_arg_it] = &*filter_func_arg_it;
1936 if (value->hasName()) {
1937 row_func_arg_it->setName(
"arg_" + value->getName());
1938 if (cgen_state_->filter_func_) {
1939 filter_func_arg_it->getContext();
1940 filter_func_arg_it->setName(
"arg_" + value->getName());
1944 ++filter_func_arg_it;
1947 query_func_literal_loads_function_arguments[element.first] = argument_values;
1948 query_func_literal_loads_function_arguments2[element.first] = argument_values2;
// Move the old row_func body into the new function and rewire argument uses.
1954 row_func_with_hoisted_literals->getBasicBlockList().splice(
1955 row_func_with_hoisted_literals->begin(),
1956 cgen_state_->row_func_->getBasicBlockList());
1959 for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1960 E = cgen_state_->row_func_->arg_end(),
1961 I2 = row_func_with_hoisted_literals->arg_begin();
1964 I->replaceAllUsesWith(&*I2);
1966 cgen_state_->filter_func_args_.replace(&*I, &*I2);
1970 cgen_state_->row_func_ = row_func_with_hoisted_literals;
// Replace "__placeholder__literal_" instructions with the new arguments.
1973 std::vector<llvm::Instruction*> placeholders;
1974 std::string prefix(
"__placeholder__literal_");
1975 for (
auto it = llvm::inst_begin(row_func_with_hoisted_literals),
1976 e = llvm::inst_end(row_func_with_hoisted_literals);
1979 if (it->hasName() && it->getName().startswith(prefix)) {
1980 auto offset_and_index_entry =
1981 cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
1982 CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
1984 int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
1985 int lit_idx = offset_and_index_entry->second.index_of_literal_load;
1987 it->replaceAllUsesWith(
1988 query_func_literal_loads_function_arguments[lit_off][lit_idx]);
1989 placeholders.push_back(&*it);
1992 for (
auto placeholder : placeholders) {
1993 placeholder->removeFromParent();
// Same splice / rewire / placeholder pass for the filter function.
1996 if (cgen_state_->filter_func_) {
2000 filter_func_with_hoisted_literals->getBasicBlockList().splice(
2001 filter_func_with_hoisted_literals->begin(),
2002 cgen_state_->filter_func_->getBasicBlockList());
2006 for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2007 E = cgen_state_->filter_func_->arg_end(),
2008 I2 = filter_func_with_hoisted_literals->arg_begin();
2011 I->replaceAllUsesWith(&*I2);
2016 cgen_state_->filter_func_ = filter_func_with_hoisted_literals;
2019 std::vector<llvm::Instruction*> placeholders;
2020 std::string prefix(
"__placeholder__literal_");
2021 for (
auto it = llvm::inst_begin(filter_func_with_hoisted_literals),
2022 e = llvm::inst_end(filter_func_with_hoisted_literals);
2025 if (it->hasName() && it->getName().startswith(prefix)) {
2026 auto offset_and_index_entry = cgen_state_->row_func_hoisted_literals_.find(
2027 llvm::dyn_cast<llvm::Value>(&*it));
2028 CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2030 int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2031 int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2033 it->replaceAllUsesWith(
2034 query_func_literal_loads_function_arguments2[lit_off][lit_idx]);
2035 placeholders.push_back(&*it);
2038 for (
auto placeholder : placeholders) {
2039 placeholder->removeFromParent();
2043 return hoisted_literals;
// NOTE(review): fragments of the GPU shared-memory-optimization eligibility
// checks: the entry count must fit the block size, the output buffer must be
// under a size threshold, and every target must be non-varlen with an
// aggregate kind in the supported set (COUNT plus others added on elided
// lines). Many lines missing; code kept byte-identical, comments only.
2050 return shared_mem_used
2059 const unsigned gpu_blocksize,
2060 const unsigned num_blocks_per_mp) {
2067 CHECK(query_mem_desc_ptr);
2087 if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
2092 const auto target_infos =
2094 std::unordered_set<SQLAgg> supported_aggs{
kCOUNT};
2095 if (std::find_if(target_infos.begin(),
2098 if (ti.sql_type.is_varlen() ||
2099 !supported_aggs.count(ti.agg_kind)) {
2104 }) == target_infos.end()) {
2119 if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
2131 const size_t shared_memory_threshold_bytes = std::min(
2134 const auto output_buffer_size =
2136 if (output_buffer_size > shared_memory_threshold_bytes) {
2143 const auto target_infos =
2145 std::unordered_set<SQLAgg> supported_aggs{
kCOUNT};
2149 if (std::find_if(target_infos.begin(),
2152 if (ti.sql_type.is_varlen() ||
2153 !supported_aggs.count(ti.agg_kind)) {
2158 }) == target_infos.end()) {
// NOTE(review): fragment — collects the metadata nodes attached to the
// instructions of the query, row (and presumably filter — third loop's owner
// is on an elided line) functions, then prints each node, sorts by its "!N"
// id, and appends the footnotes to the serialized IR. Code byte-identical.
2169 std::string llvm_ir;
2170 std::unordered_set<llvm::MDNode*> md;
2173 for (
auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
2174 for (
auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2175 llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2176 instr_it->getAllMetadata(imd);
2177 for (
auto [kind, node] : imd) {
2184 for (
auto bb_it = cgen_state->
row_func_->begin(); bb_it != cgen_state->
row_func_->end();
2186 for (
auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2187 llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2188 instr_it->getAllMetadata(imd);
2189 for (
auto [kind, node] : imd) {
2200 for (
auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2201 llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2202 instr_it->getAllMetadata(imd);
2203 for (
auto [kind, node] : imd) {
// Print nodes, keyed by the numeric id parsed from the leading "!N" token.
2212 std::map<size_t, std::string> sorted_strings;
2215 llvm::raw_string_ostream os(str);
2216 p->print(os, cgen_state->
module_,
true);
2218 auto fields =
split(str, {}, 1);
2219 if (fields.empty() || fields[0].empty()) {
2222 sorted_strings.emplace(std::stoul(fields[0].substr(1)), str);
2225 for (
auto [
id, text] : sorted_strings) {
2237 std::tuple<CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
2244 const bool allow_lazy_fetch,
2245 std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
2246 const size_t max_groups_buffer_entry_guess,
2247 const int8_t crt_min_byte_width,
2248 const bool has_cardinality_estimation,
2254 static std::uint64_t counter = 0;
2256 VLOG(1) <<
"CODEGEN #" << counter <<
":";
2257 LOG(
IR) <<
"CODEGEN #" << counter <<
":";
2258 LOG(
PTX) <<
"CODEGEN #" << counter <<
":";
2259 LOG(
ASM) <<
"CODEGEN #" << counter <<
":";
2262 nukeOldState(allow_lazy_fetch, query_infos, deleted_cols_map, &ra_exe_unit);
2270 has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
2274 max_groups_buffer_entry_guess,
2281 !has_cardinality_estimation &&
2283 const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
2287 const bool output_columnar =
query_mem_desc->didOutputColumnar();
2288 const bool gpu_shared_mem_optimization =
2293 cuda_mgr ? this->blockSize() : 1,
2294 cuda_mgr ? this->numBlocksPerMP() : 1);
2295 if (gpu_shared_mem_optimization) {
2298 LOG(
DEBUG1) <<
"GPU shared memory is used for the " +
2309 const size_t num_count_distinct_descs =
2311 for (
size_t i = 0; i < num_count_distinct_descs; i++) {
2312 const auto& count_distinct_descriptor =
2325 auto rt_module_copy = llvm::CloneModule(
2326 *
g_rt_module.get(), cgen_state_->vmap_, [](
const llvm::GlobalValue* gv) {
2327 auto func = llvm::dyn_cast<llvm::Function>(gv);
2331 return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
2332 func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage ||
2341 rt_udf_cpu_module, *rt_module_copy, cgen_state_.get());
2351 rt_udf_gpu_module, *rt_module_copy, cgen_state_.get());
2355 cgen_state_->module_ = rt_module_copy.release();
2361 const auto agg_slot_count = ra_exe_unit.
estimator ? size_t(1) : agg_fnames.size();
2364 auto [query_func, row_func_call] = is_group_by
2374 !!ra_exe_unit.estimator,
2380 cgen_state_->query_func_ = query_func;
2381 cgen_state_->row_func_call_ = row_func_call;
2382 cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
2383 &query_func->getEntryBlock().front());
2387 auto& fetch_bb = query_func->front();
2388 llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
2389 fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
2391 query_func->args().begin(),
2393 cgen_state_->context_);
2397 is_group_by ? 0 : agg_slot_count,
2399 cgen_state_->module_,
2400 cgen_state_->context_);
2401 CHECK(cgen_state_->row_func_);
2402 cgen_state_->row_func_bb_ =
2403 llvm::BasicBlock::Create(cgen_state_->context_,
"entry", cgen_state_->row_func_);
2406 auto filter_func_ft =
2407 llvm::FunctionType::get(
get_int_type(32, cgen_state_->context_), {},
false);
2408 cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
2409 llvm::Function::ExternalLinkage,
2411 cgen_state_->module_);
2412 CHECK(cgen_state_->filter_func_);
2413 cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
2414 cgen_state_->context_,
"entry", cgen_state_->filter_func_);
2417 cgen_state_->current_func_ = cgen_state_->row_func_;
2418 cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2420 preloadFragOffsets(ra_exe_unit.
input_descs, query_infos);
2422 const auto join_loops =
2423 buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
2426 const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
2427 if (is_not_deleted_bb) {
2428 cgen_state_->row_func_bb_ = is_not_deleted_bb;
2430 if (!join_loops.empty()) {
2431 codegenJoinLoops(join_loops,
2432 body_execution_unit,
2433 group_by_and_aggregate,
2435 cgen_state_->row_func_bb_,
2440 const bool can_return_error = compileBody(
2441 ra_exe_unit, group_by_and_aggregate, *
query_mem_desc, co, gpu_smem_context);
2444 createErrorCheckControlFlow(query_func,
2448 group_by_and_aggregate.query_infos_);
2451 std::vector<llvm::Value*> hoisted_literals;
2454 VLOG(1) <<
"number of hoisted literals: "
2455 << cgen_state_->query_func_literal_loads_.size()
2456 <<
" / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
2460 if (co.
hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
2462 hoisted_literals = inlineHoistedLiterals();
2466 std::vector<llvm::Value*> row_func_args;
2467 for (
size_t i = 0; i < cgen_state_->row_func_call_->getNumArgOperands(); ++i) {
2468 row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
2470 row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
2471 row_func_args.push_back(
get_arg_by_name(query_func,
"join_hash_tables"));
2473 row_func_args.insert(
2474 row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
2475 llvm::ReplaceInstWithInst(
2476 cgen_state_->row_func_call_,
2477 llvm::CallInst::Create(cgen_state_->row_func_, row_func_args,
""));
2480 if (cgen_state_->filter_func_) {
2481 std::vector<llvm::Value*> filter_func_args;
2482 for (
auto arg_it = cgen_state_->filter_func_args_.begin();
2483 arg_it != cgen_state_->filter_func_args_.end();
2485 filter_func_args.push_back(arg_it->first);
2487 llvm::ReplaceInstWithInst(
2488 cgen_state_->filter_func_call_,
2489 llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args,
""));
2493 plan_state_->init_agg_vals_ =
2503 if (gpu_smem_context.isSharedMemoryUsed()) {
2507 cgen_state_->module_,
2508 cgen_state_->context_,
2511 plan_state_->init_agg_vals_);
2512 gpu_smem_code.codegen();
2513 gpu_smem_code.injectFunctionsInto(query_func);
2516 cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
2517 cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
2518 LOG(
IR) << gpu_smem_code.toString();
2522 auto multifrag_query_func = cgen_state_->module_->getFunction(
2523 "multifrag_query" + std::string(co.
hoist_literals ?
"_hoisted_literals" :
""));
2524 CHECK(multifrag_query_func);
2531 "query_stub" + std::string(co.
hoist_literals ?
"_hoisted_literals" :
""),
2532 multifrag_query_func,
2533 cgen_state_->module_);
2535 std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
2536 if (cgen_state_->filter_func_) {
2537 root_funcs.push_back(cgen_state_->filter_func_);
2540 *cgen_state_->module_, root_funcs, {multifrag_query_func});
2547 if (cgen_state_->filter_func_) {
2557 std::string llvm_ir;
2560 #ifdef WITH_JIT_DEBUG
2561 throw std::runtime_error(
2562 "Explain optimized not available when JIT runtime debug symbols are enabled");
2566 llvm::legacy::PassManager pass_manager;
2567 optimize_ir(query_func, cgen_state_->module_, pass_manager, live_funcs, co);
2568 #endif // WITH_JIT_DEBUG
2582 LOG(
IR) <<
"IR for the "
2596 if (cgen_state_->filter_func_) {
2601 return std::make_tuple(
2604 ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
2605 : optimizeAndCodegenGPU(query_func,
2606 multifrag_query_func,
2608 is_group_by || ra_exe_unit.estimator,
2611 cgen_state_->getLiterals(),
2614 std::move(gpu_smem_context)},
2619 auto query_stub_func_name =
2620 "query_stub" + std::string(hoist_literals ?
"_hoisted_literals" :
"");
2621 for (
auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
2622 for (
auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
2623 if (!llvm::isa<llvm::CallInst>(*inst_it)) {
2626 auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
2627 if (std::string(row_func_call.getCalledFunction()->getName()) ==
2628 query_stub_func_name) {
2629 auto next_inst_it = inst_it;
2631 auto new_bb = bb_it->splitBasicBlock(next_inst_it);
2632 auto& br_instr = bb_it->back();
2633 llvm::IRBuilder<> ir_builder(&br_instr);
2634 llvm::Value* err_lv = &*inst_it;
2635 auto error_check_bb =
2636 bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr),
".error_check");
2637 llvm::Value* error_code_arg =
nullptr;
2639 for (
auto arg_it = query_func->arg_begin(); arg_it != query_func->arg_end();
2640 arg_it++, ++arg_cnt) {
2643 if (hoist_literals) {
2645 error_code_arg = &*arg_it;
2650 error_code_arg = &*arg_it;
2655 CHECK(error_code_arg);
2656 llvm::Value* err_code =
nullptr;
2659 auto& check_interrupt_br_instr = bb_it->back();
2660 auto interrupt_check_bb = llvm::BasicBlock::Create(
2661 cgen_state_->context_,
".interrupt_check", query_func, error_check_bb);
2662 llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
2663 auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
2664 cgen_state_->module_->getFunction(
"check_interrupt"), {});
2665 auto detected_error = interrupt_checker_ir_builder.CreateCall(
2666 cgen_state_->module_->getFunction(
"get_error_code"),
2667 std::vector<llvm::Value*>{error_code_arg});
2668 err_code = interrupt_checker_ir_builder.CreateSelect(
2672 interrupt_checker_ir_builder.CreateBr(error_check_bb);
2673 llvm::ReplaceInstWithInst(&check_interrupt_br_instr,
2674 llvm::BranchInst::Create(interrupt_check_bb));
2675 ir_builder.SetInsertPoint(&br_instr);
2678 ir_builder.SetInsertPoint(&br_instr);
2680 ir_builder.CreateCall(cgen_state_->module_->getFunction(
"get_error_code"),
2681 std::vector<llvm::Value*>{error_code_arg});
2683 err_lv = ir_builder.CreateICmp(
2684 llvm::ICmpInst::ICMP_NE, err_code, cgen_state_->llInt(0));
2685 auto error_bb = llvm::BasicBlock::Create(
2686 cgen_state_->context_,
".error_exit", query_func, new_bb);
2687 llvm::CallInst::Create(cgen_state_->module_->getFunction(
"record_error_code"),
2688 std::vector<llvm::Value*>{err_code, error_code_arg},
2691 llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
2692 llvm::ReplaceInstWithInst(&br_instr,
2693 llvm::BranchInst::Create(error_bb, new_bb, err_lv));
2708 const auto& outer_input_desc = ra_exe_unit.
input_descs[0];
2712 const auto deleted_cd =
2713 plan_state_->getDeletedColForTable(outer_input_desc.getTableId());
2717 CHECK(deleted_cd->columnType.is_boolean());
2718 const auto deleted_expr =
2719 makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
2720 outer_input_desc.getTableId(),
2721 deleted_cd->columnId,
2722 outer_input_desc.getNestLevel());
2724 const auto is_deleted =
2725 code_generator.toBool(code_generator.codegen(deleted_expr.get(),
true, co).front());
2726 const auto is_deleted_bb = llvm::BasicBlock::Create(
2727 cgen_state_->context_,
"is_deleted", cgen_state_->row_func_);
2728 llvm::BasicBlock* bb = llvm::BasicBlock::Create(
2729 cgen_state_->context_,
"is_not_deleted", cgen_state_->row_func_);
2730 cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
2731 cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
2732 cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2733 cgen_state_->ir_builder_.SetInsertPoint(bb);
2748 cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
2749 llvm::Value* loop_done{
nullptr};
2750 std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
2751 if (cgen_state_->filter_func_) {
2752 if (cgen_state_->row_func_bb_->getName() ==
"loop_body") {
2753 auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
2754 cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
2755 row_func_entry_bb->begin());
2756 loop_done = cgen_state_->ir_builder_.CreateAlloca(
2757 get_int_type(1, cgen_state_->context_),
nullptr,
"loop_done");
2758 cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2759 cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(
true), loop_done);
2761 cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
2762 cgen_state_->current_func_ = cgen_state_->filter_func_;
2763 fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
2767 std::vector<Analyzer::Expr*> primary_quals;
2768 std::vector<Analyzer::Expr*> deferred_quals;
2769 bool short_circuited =
2771 if (short_circuited) {
2773 <<
"short-circuited and deferred " <<
std::to_string(deferred_quals.size())
2776 llvm::Value* filter_lv = cgen_state_->llBool(
true);
2778 for (
auto expr : primary_quals) {
2780 auto cond = code_generator.toBool(code_generator.codegen(expr,
true, co).front());
2781 filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
2783 CHECK(filter_lv->getType()->isIntegerTy(1));
2784 llvm::BasicBlock* sc_false{
nullptr};
2785 if (!deferred_quals.empty()) {
2786 auto sc_true = llvm::BasicBlock::Create(
2787 cgen_state_->context_,
"sc_true", cgen_state_->current_func_);
2788 sc_false = llvm::BasicBlock::Create(
2789 cgen_state_->context_,
"sc_false", cgen_state_->current_func_);
2790 cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
2791 cgen_state_->ir_builder_.SetInsertPoint(sc_false);
2793 cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
2795 cgen_state_->ir_builder_.SetInsertPoint(sc_true);
2796 filter_lv = cgen_state_->llBool(
true);
2798 for (
auto expr : deferred_quals) {
2799 filter_lv = cgen_state_->ir_builder_.CreateAnd(
2800 filter_lv, code_generator.toBool(code_generator.codegen(expr,
true, co).front()));
2803 CHECK(filter_lv->getType()->isIntegerTy(1));
2804 auto ret = group_by_and_aggregate.
codegen(
2805 filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);
2809 if (cgen_state_->filter_func_) {
2810 if (cgen_state_->row_func_bb_->getName() ==
"loop_body") {
2811 cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(
false), loop_done);
2812 cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2815 redeclareFilterFunction();
2817 cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2818 cgen_state_->current_func_ = cgen_state_->row_func_;
2819 cgen_state_->filter_func_call_ =
2820 cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});
2822 if (cgen_state_->row_func_bb_->getName() ==
"loop_body") {
2823 auto loop_done_true = llvm::BasicBlock::Create(
2824 cgen_state_->context_,
"loop_done_true", cgen_state_->row_func_);
2825 auto loop_done_false = llvm::BasicBlock::Create(
2826 cgen_state_->context_,
"loop_done_false", cgen_state_->row_func_);
2827 auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(loop_done);
2828 cgen_state_->ir_builder_.CreateCondBr(
2829 loop_done_flag, loop_done_true, loop_done_false);
2830 cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
2831 cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
2832 cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
2834 cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
2841 return llvm::CloneModule(
2843 auto func = llvm::dyn_cast<llvm::Function>(gv);
2847 return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
2848 func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage);
2853 llvm::Value* byte_stream_arg,
2854 llvm::IRBuilder<>& ir_builder,
2855 llvm::LLVMContext& ctx) {
2856 CHECK(byte_stream_arg);
2857 const auto max_col_local_id = num_columns - 1;
2859 std::vector<llvm::Value*> col_heads;
2860 for (
int col_id = 0; col_id <= max_col_local_id; ++col_id) {
2861 col_heads.emplace_back(ir_builder.CreateLoad(ir_builder.CreateGEP(
2862 byte_stream_arg, llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), col_id))));
2866
std::tuple< llvm::Function *, llvm::CallInst * > query_group_by_template(llvm::Module *module, const bool hoist_literals, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const bool check_scan_limit, const GpuSharedMemoryContext &gpu_smem_context)
void read_rt_udf_gpu_module(const std::string &udf_ir)
std::vector< Analyzer::Expr * > target_exprs
std::string filename(char const *path)
double g_running_query_interrupt_freq
llvm::Value * find_variable_in_basic_block(llvm::Function *func, std::string bb_name, std::string variable_name)
bool g_enable_smem_group_by
std::unique_ptr< llvm::Module > rt_udf_cpu_module
std::string gen_translate_null_key_sigs()
bool countDistinctDescriptorsLogicallyEmpty() const
size_t getEntryCount() const
std::unique_ptr< llvm::Module > runtime_module_shallow_copy(CgenState *cgen_state)
static const int32_t ERR_INTERRUPTED
void mark_function_never_inline(llvm::Function *func)
std::unique_ptr< llvm::Module > udf_gpu_module
void show_defined(llvm::Module &module)
void read_rt_udf_cpu_module(const std::string &udf_ir)
Streaming Top N algorithm.
std::unique_ptr< llvm::Module > rt_udf_gpu_module
void eliminate_dead_self_recursive_funcs(llvm::Module &M, const std::unordered_set< llvm::Function * > &live_funcs)
void checkCudaErrors(CUresult err)
void mark_function_always_inline(llvm::Function *func)
llvm::StringRef get_gpu_data_layout()
bool is_udf_module_present(bool cpu_only=false)
bool codegen(llvm::Value *filter_result, llvm::BasicBlock *sc_false, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context)
std::vector< InputDescriptor > input_descs
std::string serialize_llvm_metadata_footnotes(llvm::Function *query_func, CgenState *cgen_state)
bool hasKeylessHash() const
void read_udf_cpu_module(const std::string &udf_ir_filename)
void read_udf_gpu_module(const std::string &udf_ir_filename)
std::vector< std::string > CodeCacheKey
ExecutorOptLevel opt_level
bool g_enable_dynamic_watchdog
static ExecutionEngineWrapper generateNativeCPUCode(llvm::Function *func, const std::unordered_set< llvm::Function * > &live_funcs, const CompilationOptions &co)
const std::list< std::shared_ptr< Analyzer::Expr > > groupby_exprs
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static std::string generatePTX(const std::string &cuda_llir, llvm::TargetMachine *nvptx_target_machine, llvm::LLVMContext &context)
void optimize_ir(llvm::Function *query_func, llvm::Module *module, llvm::legacy::PassManager &pass_manager, const std::unordered_set< llvm::Function * > &live_funcs, const CompilationOptions &co)
ExecutionEngineWrapper & operator=(const ExecutionEngineWrapper &other)=delete
static bool prioritizeQuals(const RelAlgExecutionUnit &ra_exe_unit, std::vector< Analyzer::Expr * > &primary_quals, std::vector< Analyzer::Expr * > &deferred_quals)
std::vector< std::string > get_agg_fnames(const std::vector< Analyzer::Expr * > &target_exprs, const bool is_group_by)
bool filter_on_deleted_column
#define LOG_IF(severity, condition)
gpu_code_cache_(code_cache_size)
size_t getRowSize() const
void throw_parseIR_error(const llvm::SMDiagnostic &parse_error, std::string src="", const bool is_gpu=false)
std::string assemblyForCPU(ExecutionEngineWrapper &execution_engine, llvm::Module *module)
llvm::Function * row_func_
cpu_code_cache_(code_cache_size)
std::shared_ptr< CompilationContext > getCodeFromCache(const CodeCacheKey &, const CodeCache &)
bool g_enable_smem_non_grouped_agg
static std::shared_ptr< GpuCompilationContext > generateNativeGPUCode(llvm::Function *func, llvm::Function *wrapper_func, const std::unordered_set< llvm::Function * > &live_funcs, const CompilationOptions &co, const GPUTarget &gpu_target)
void insertErrorCodeChecker(llvm::Function *query_func, bool hoist_literals)
unsigned getExpOfTwo(unsigned n)
bool output_columnar_hint
bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor *query_mem_desc_ptr, const RelAlgExecutionUnit &ra_exe_unit, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const ExecutorDeviceType device_type, const unsigned gpu_blocksize, const unsigned num_blocks_per_mp)
std::string get_cuda_home(void)
llvm::StringRef get_gpu_target_triple_string()
bool compileBody(const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
std::tuple< CompilationResult, std::unique_ptr< QueryMemoryDescriptor > > compileWorkUnit(const std::vector< InputTableInfo > &query_infos, const PlanState::DeletedColumnsMap &deleted_cols_map, const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const bool allow_lazy_fetch, std::shared_ptr< RowSetMemoryOwner >, const size_t max_groups_buffer_entry_count, const int8_t crt_min_byte_width, const bool has_cardinality_estimation, ColumnCacheMap &column_cache, RenderInfo *render_info=nullptr)
void scan_function_calls(llvm::Function &F, std::unordered_set< std::string > &defined, std::unordered_set< std::string > &undefined, const std::unordered_set< std::string > &ignored)
void verify_function_ir(const llvm::Function *func)
const bool allow_multifrag
llvm::Value * get_arg_by_name(llvm::Function *func, const std::string &name)
std::shared_ptr< CompilationContext > optimizeAndCodegenGPU(llvm::Function *, llvm::Function *, std::unordered_set< llvm::Function * > &, const bool no_inline, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const CompilationOptions &)
bool useStreamingTopN() const
static std::unordered_set< llvm::Function * > markDeadRuntimeFuncs(llvm::Module &module, const std::vector< llvm::Function * > &roots, const std::vector< llvm::Function * > &leaves)
std::string generatePTX(const std::string &) const
std::unique_ptr< llvm::JITEventListener > intel_jit_listener_
const bool with_dynamic_watchdog
std::unordered_map< TableId, const ColumnDescriptor * > DeletedColumnsMap
const JoinQualsPerNestingLevel join_quals
std::unique_ptr< llvm::Module > g_rt_module
ExecutorExplainType explain_type
std::shared_ptr< CompilationContext > optimizeAndCodegenCPU(llvm::Function *, llvm::Function *, const std::unordered_set< llvm::Function * > &, const CompilationOptions &)
static const int32_t ERR_OUT_OF_TIME
void initializeNVPTXBackend() const
size_t getMinSharedMemoryPerBlockForAllDevices() const
const_list_iterator_t cend() const
const std::string cuda_rt_decls
static void link_udf_module(const std::unique_ptr< llvm::Module > &udf_module, llvm::Module &module, CgenState *cgen_state, llvm::Linker::Flags flags=llvm::Linker::Flags::None)
const std::shared_ptr< Analyzer::Estimator > estimator
QueryDescriptionType getQueryDescriptionType() const
std::map< std::string, std::string > get_device_parameters(bool cpu_only)
static std::string deviceArchToSM(const NvidiaDeviceArch arch)
static void addCodeToCache(const CodeCacheKey &, std::shared_ptr< CompilationContext >, llvm::Module *, CodeCache &)
ExecutorDeviceType device_type
std::unordered_map< int, std::unordered_map< int, std::shared_ptr< const ColumnarResults >>> ColumnCacheMap
llvm::Function * filter_func_
std::unique_ptr< llvm::ExecutionEngine > execution_engine_
bool isArchMaxwellOrLaterForAll() const
llvm::BasicBlock * codegenSkipDeletedOuterTableRow(const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
void set_row_func_argnames(llvm::Function *row_func, const size_t in_col_count, const size_t agg_col_count, const bool hoist_literals)
std::string cpp_to_llvm_name(const std::string &s)
std::string serialize_llvm_object(const T *llvm_obj)
void clear_function_attributes(llvm::Function *func)
llvm::Module * read_template_module(llvm::LLVMContext &context)
bool g_enable_smem_grouped_non_count_agg
static bool alwaysCloneRuntimeFunction(const llvm::Function *func)
std::vector< llvm::Value * > generate_column_heads_load(const int num_columns, llvm::Value *byte_stream_arg, llvm::IRBuilder<> &ir_builder, llvm::LLVMContext &ctx)
std::unique_ptr< llvm::Module > udf_cpu_module
bool g_enable_filter_function
void bind_pos_placeholders(const std::string &pos_fn_name, const bool use_resume_param, llvm::Function *query_func, llvm::Module *module)
llvm::LLVMContext & getGlobalLLVMContext()
float g_fraction_code_cache_to_evict
SQLAgg get_aggtype() const
static std::vector< std::string > getLLVMDeclarations(const std::unordered_set< std::string > &udf_decls)
std::tuple< llvm::Function *, llvm::CallInst * > query_template(llvm::Module *module, const size_t aggr_col_count, const bool hoist_literals, const bool is_estimate_query, const GpuSharedMemoryContext &gpu_smem_context)
std::list< std::shared_ptr< Analyzer::Expr > > quals
std::string gen_array_any_all_sigs()
bool didOutputColumnar() const
bool isPotentialInSituRender() const
#define DEBUG_TIMER(name)
llvm::ValueToValueMapTy vmap_
std::string get_root_abs_path()
std::vector< llvm::Value * > inlineHoistedLiterals()
std::vector< TargetInfo > target_exprs_to_infos(const std::vector< Analyzer::Expr * > &targets, const QueryMemoryDescriptor &query_mem_desc)
bool register_intel_jit_listener
llvm::Function * create_row_function(const size_t in_col_count, const size_t agg_col_count, const bool hoist_literals, llvm::Module *module, llvm::LLVMContext &context)
std::list< std::shared_ptr< const InputColDescriptor > > input_col_descs
const bool allow_runtime_query_interrupt
NvidiaDeviceArch getDeviceArch() const
std::vector< int64_t > init_agg_val_vec(const std::vector< TargetInfo > &targets, const QueryMemoryDescriptor &query_mem_desc)
CubinResult ptx_to_cubin(const std::string &ptx, const unsigned block_size, const CudaMgr_Namespace::CudaMgr *cuda_mgr)
bool is_rt_udf_module_present(bool cpu_only=false)
void put(const key_t &key, value_t &&value)
bool g_enable_runtime_query_interrupt
const_list_iterator_t find(const key_t &key) const
size_t get_shared_memory_size(const bool shared_mem_used, const QueryMemoryDescriptor *query_mem_desc_ptr)
void bind_query(llvm::Function *query_func, const std::string &query_fname, llvm::Function *multifrag_query_func, llvm::Module *module)
void createErrorCheckControlFlow(llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, ExecutorDeviceType device_type, const std::vector< InputTableInfo > &input_table_infos)
static std::unique_ptr< llvm::TargetMachine > initializeNVPTXBackend(const CudaMgr_Namespace::NvidiaDeviceArch arch)
size_t g_gpu_smem_threshold