#if LLVM_VERSION_MAJOR < 9
static_assert(false, "LLVM Version >= 9 is required.");
#endif
#include <llvm/Bitcode/BitcodeReader.h>
#include <llvm/Bitcode/BitcodeWriter.h>
#include <llvm/ExecutionEngine/MCJIT.h>
#include <llvm/IR/Attributes.h>
#include <llvm/IR/GlobalValue.h>
#include <llvm/IR/InstIterator.h>
#include <llvm/IR/IntrinsicInst.h>
#include <llvm/IR/Intrinsics.h>
#include <llvm/IR/LegacyPassManager.h>
#include <llvm/IR/Verifier.h>
#include <llvm/IRReader/IRReader.h>
#include <llvm/Linker/Linker.h>
#include <llvm/Support/Casting.h>
#include <llvm/Support/FileSystem.h>
#include <llvm/Support/FormattedStream.h>
#include <llvm/Support/MemoryBuffer.h>
#include <llvm/Support/SourceMgr.h>
#include <llvm/Support/TargetRegistry.h>
#include <llvm/Support/TargetSelect.h>
#include <llvm/Support/raw_os_ostream.h>
#include <llvm/Support/raw_ostream.h>
#include <llvm/Transforms/IPO.h>
#include <llvm/Transforms/IPO/AlwaysInliner.h>
#include <llvm/Transforms/IPO/PassManagerBuilder.h>
#include <llvm/Transforms/InstCombine/InstCombine.h>
#include <llvm/Transforms/Instrumentation.h>
#include <llvm/Transforms/Scalar.h>
#include <llvm/Transforms/Scalar/InstSimplifyPass.h>
#include <llvm/Transforms/Utils.h>
#include <llvm/Transforms/Utils/BasicBlockUtils.h>
#include <llvm/Transforms/Utils/Cloning.h>
#if LLVM_VERSION_MAJOR >= 11
#include <llvm/Support/Host.h>
#endif
extern std::unique_ptr<llvm::Module> g_rt_libdevice_module;

extern std::unique_ptr<llvm::Module> g_rt_geos_module;
#include <llvm/Support/DynamicLibrary.h>

#ifndef GEOS_LIBRARY_FILENAME
#error Configuration should include GEOS library file name
#endif
std::unique_ptr<std::string> g_libgeos_so_filename(
    new std::string(GEOS_LIBRARY_FILENAME));
static llvm::sys::DynamicLibrary geos_dynamic_library;
static std::mutex geos_init_mutex;
void load_geos_dynamic_library() {
  std::lock_guard<std::mutex> guard(geos_init_mutex);

  if (!geos_dynamic_library.isValid()) {
    if (!g_libgeos_so_filename || g_libgeos_so_filename->empty()) {
      LOG(WARNING) << "Misconfigured GEOS library file name, trying 'libgeos_c.so'";
      g_libgeos_so_filename.reset(new std::string("libgeos_c.so"));
    }
    auto filename = *g_libgeos_so_filename;
    std::string error_message;
    geos_dynamic_library =
        llvm::sys::DynamicLibrary::getPermanentLibrary(filename.c_str(), &error_message);
    if (!geos_dynamic_library.isValid()) {
      std::string exception_message = "Failed to load GEOS library: " + error_message;
      throw std::runtime_error(exception_message.c_str());
    }
  }
}
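// load_geos_dynamic_library() is deliberately idempotent: the mutex plus the
// isValid() check make concurrent calls safe, and only the first caller pays
// the dlopen cost. Typical call site (sketch, mirroring the
// cgen_state_->needs_geos_ guard used later in this file):
//   if (cgen_state_->needs_geos_) {
//     load_geos_dynamic_library();  // throws std::runtime_error on failure
//   }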
void throw_parseIR_error(const llvm::SMDiagnostic& parse_error,
                         std::string src = "",
                         const bool is_gpu = false) {
  std::string excname = (is_gpu ? "NVVM IR ParseError: " : "LLVM IR ParseError: ");
  llvm::raw_string_ostream ss(excname);
  parse_error.print(src.c_str(), ss, false, false);
  throw ParseIRError(ss.str());
}
#define SHOW_DEFINED(MODULE)                                         \
  {                                                                  \
    std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
    ::show_defined(MODULE);                                          \
  }

#define SHOW_FUNCTIONS(MODULE)                                       \
  {                                                                  \
    std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
    ::show_functions(MODULE);                                        \
  }
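// Debug helpers: sprinkle SHOW_DEFINED(module) / SHOW_FUNCTIONS(module) around
// linking steps to print which functions a module defines at that point; the
// __func__/__LINE__ prefix identifies the call site in the output.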
template <typename T = void>
void show_defined(llvm::Module& module) {
  std::cout << "defines: ";
  for (auto& f : module.getFunctionList()) {
    if (!f.isDeclaration()) {
      std::cout << f.getName().str() << ", ";
    }
  }
  std::cout << std::endl;
}
template <typename T = void>
void show_defined(llvm::Module* module) {
  if (module == nullptr) {
    std::cout << "is null" << std::endl;
  } else {
    show_defined(*module);
  }
}
template <typename T = void>
void show_defined(std::unique_ptr<llvm::Module>& module) {
  show_defined(module.get());
}
template <typename T = void>
void scan_function_calls(llvm::Function& F,
                         std::unordered_set<std::string>& defined,
                         std::unordered_set<std::string>& undefined,
                         const std::unordered_set<std::string>& ignored) {
  for (llvm::inst_iterator I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
    if (auto* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
      auto* F2 = CI->getCalledFunction();
      if (F2 != nullptr) {
        auto F2name = F2->getName().str();
        if (F2->isDeclaration()) {
          if (F2name.rfind("__", 0) != 0        // assume double-underscore symbols are defined
              && F2name.rfind("llvm.", 0) != 0  // LLVM intrinsics are always available
              && ignored.find(F2name) == ignored.end()) {
            undefined.emplace(F2name);
          }
        } else {
          if (defined.find(F2name) == defined.end()) {
            defined.emplace(F2name);
            scan_function_calls<T>(*F2, defined, undefined, ignored);
          }
        }
      }
    }
  }
}
template <typename T = void>
void scan_function_calls(llvm::Module& module,
                         std::unordered_set<std::string>& defined,
                         std::unordered_set<std::string>& undefined,
                         const std::unordered_set<std::string>& ignored) {
  for (auto& F : module) {
    if (!F.isDeclaration()) {
      scan_function_calls(F, defined, undefined, ignored);
    }
  }
}
template <typename T = void>
std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>>
scan_function_calls(llvm::Module& module,
                    const std::unordered_set<std::string>& ignored = {}) {
  std::unordered_set<std::string> defined, undefined;
  scan_function_calls(module, defined, undefined, ignored);
  return std::make_tuple(defined, undefined);
}
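// Usage sketch (illustrative): split a module's call graph into defined vs.
// still-undefined callees, skipping "__*" and "llvm.*" symbols:
//   std::unordered_set<std::string> defined, undefined;
//   std::tie(defined, undefined) = scan_function_calls(*module);
//   // `undefined` now names the callees that still have to be linked in.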
#if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
void eliminate_dead_self_recursive_funcs(
    llvm::Module& M,
    const std::unordered_set<llvm::Function*>& live_funcs) {
  std::vector<llvm::Function*> dead_funcs;
  for (auto& F : M) {
    bool bAlive = false;
    if (live_funcs.count(&F)) {
      continue;
    }
    for (auto U : F.users()) {
      auto* C = llvm::dyn_cast<const llvm::CallInst>(U);
      if (!C || C->getParent()->getParent() != &F) {
        bAlive = true;
        break;
      }
    }
    if (!bAlive) {
      dead_funcs.push_back(&F);
    }
  }
  for (auto pFn : dead_funcs) {
    pFn->eraseFromParent();
  }
}
#endif
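// A function survives this pass if it is listed in live_funcs or has at least
// one call site outside its own body; functions whose only users are
// themselves (dead self-recursion) are erased, a case plain DCE can miss.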
bool check_module_requires_libdevice(llvm::Module* module) {
  for (llvm::Function& F : *module) {
    if (F.hasName() && F.getName().startswith("__nv_")) {
      LOG(INFO) << "Module requires linking with libdevice: " << std::string(F.getName());
      return true;
    }
  }
  LOG(DEBUG1) << "module does not require linking against libdevice";
  return false;
}
void add_intrinsics_to_module(llvm::Module* module) {
  for (llvm::Function& F : *module) {
    for (llvm::Instruction& I : instructions(F)) {
      if (llvm::IntrinsicInst* ii = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
        if (llvm::Intrinsic::isOverloaded(ii->getIntrinsicID())) {
          llvm::Type* Tys[] = {ii->getFunctionType()->getReturnType()};
          llvm::Function& decl_fn =
              *llvm::Intrinsic::getDeclaration(module, ii->getIntrinsicID(), Tys);
          ii->setCalledFunction(&decl_fn);
        } else {
          llvm::Intrinsic::getDeclaration(module, ii->getIntrinsicID());
        }
      }
    }
  }
}
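// Overloaded intrinsics need a declaration specialized on concrete types;
// getDeclaration() with the Tys array materializes that specialization in the
// module and the call is re-pointed at it. Non-overloaded intrinsics only need
// the plain declaration to be present.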
void optimize_ir(llvm::Function* query_func,
                 llvm::Module* module,
                 llvm::legacy::PassManager& pass_manager,
                 const std::unordered_set<llvm::Function*>& live_funcs,
                 const CompilationOptions& co) {
  pass_manager.add(llvm::createAlwaysInlinerLegacyPass());
  pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
  pass_manager.add(llvm::createInstSimplifyLegacyPass());
  pass_manager.add(llvm::createInstructionCombiningPass());
  pass_manager.add(llvm::createGlobalOptimizerPass());

  pass_manager.add(llvm::createLICMPass());
  if (co.opt_level == ExecutorOptLevel::LoopStrengthReduction) {
    pass_manager.add(llvm::createLoopStrengthReducePass());
  }
  pass_manager.run(*module);

  eliminate_dead_self_recursive_funcs(*module, live_funcs);
}
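// The pass order above matters: always_inline helpers are inlined first, then
// mem2reg promotes allocas to SSA values so instcombine and LICM have
// registers (not memory) to reason about; everything runs in one
// PassManager::run() over the whole module.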
ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine)
    : execution_engine_(execution_engine) {}

ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine,
                                               const CompilationOptions& co)
    : execution_engine_(execution_engine) {
  if (execution_engine_) {
    if (co.register_intel_jit_listener) {
#ifdef ENABLE_INTEL_JIT_LISTENER
      intel_jit_listener_.reset(llvm::JITEventListener::createIntelJITEventListener());
      CHECK(intel_jit_listener_);
      execution_engine_->RegisterJITEventListener(intel_jit_listener_.get());
      LOG(INFO) << "Registered IntelJITEventListener";
#else
      LOG(WARNING) << "This build is not Intel JIT Listener enabled. Ignoring Intel JIT "
                      "listener configuration parameter.";
#endif  // ENABLE_INTEL_JIT_LISTENER
    }
  }
}
ExecutionEngineWrapper& ExecutionEngineWrapper::operator=(
    llvm::ExecutionEngine* execution_engine) {
  execution_engine_.reset(execution_engine);
  intel_jit_listener_ = nullptr;
  return *this;
}

void verify_function_ir(const llvm::Function* func) {
  std::stringstream err_ss;
  llvm::raw_os_ostream err_os(err_ss);
  err_os << "\n-----\n";
  if (llvm::verifyFunction(*func, &err_os)) {
    err_os << "\n-----\n";
    func->print(err_os, nullptr);
    err_os << "\n-----\n";
    LOG(FATAL) << err_ss.str();
  }
}
std::shared_ptr<CompilationContext> Executor::getCodeFromCache(const CodeCacheKey& key,
                                                               const CodeCache& cache) {
  auto it = cache.find(key);
  if (it != cache.cend()) {
    delete cgen_state_->module_;
    cgen_state_->module_ = it->second.second;
    return it->second.first;
  }
  return {};
}

void Executor::addCodeToCache(const CodeCacheKey& key,
                              std::shared_ptr<CompilationContext> compilation_context,
                              llvm::Module* module,
                              CodeCache& cache) {
  cache.put(key,
            std::make_pair<std::shared_ptr<CompilationContext>, decltype(module)>(
                std::move(compilation_context), std::move(module)));
}
std::string assemblyForCPU(ExecutionEngineWrapper& execution_engine,
                           llvm::Module* module) {
  llvm::legacy::PassManager pass_manager;
  auto cpu_target_machine = execution_engine->getTargetMachine();
  CHECK(cpu_target_machine);
  llvm::SmallString<256> code_str;
  llvm::raw_svector_ostream os(code_str);
#if LLVM_VERSION_MAJOR >= 10
  cpu_target_machine->addPassesToEmitFile(
      pass_manager, os, nullptr, llvm::CGFT_AssemblyFile);
#else
  cpu_target_machine->addPassesToEmitFile(
      pass_manager, os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
#endif
  pass_manager.run(*module);
  return "Assembly for the CPU:\n" + std::string(code_str.str()) + "\nEnd of assembly";
}
ExecutionEngineWrapper CodeGenerator::generateNativeCPUCode(
    llvm::Function* func,
    const std::unordered_set<llvm::Function*>& live_funcs,
    const CompilationOptions& co) {
  auto module = func->getParent();
  // run optimizations
#ifndef WITH_JIT_DEBUG
  llvm::legacy::PassManager pass_manager;
  optimize_ir(func, module, pass_manager, live_funcs, co);
#endif  // WITH_JIT_DEBUG

  auto init_err = llvm::InitializeNativeTarget();
  CHECK(!init_err);

  llvm::InitializeAllTargetMCs();
  llvm::InitializeNativeTargetAsmPrinter();
  llvm::InitializeNativeTargetAsmParser();

  std::string err_str;
  std::unique_ptr<llvm::Module> owner(module);
  llvm::EngineBuilder eb(std::move(owner));
  eb.setErrorStr(&err_str);
  eb.setEngineKind(llvm::EngineKind::JIT);
  llvm::TargetOptions to;
  to.EnableFastISel = true;
  eb.setTargetOptions(to);
  if (co.opt_level == ExecutorOptLevel::ReductionJIT) {
    eb.setOptLevel(llvm::CodeGenOpt::None);
  }

  auto target_machine = eb.selectTarget();
  CHECK(target_machine);
  module->setDataLayout(target_machine->createDataLayout());

  ExecutionEngineWrapper execution_engine(eb.create(), co);
  CHECK(execution_engine.get());

  execution_engine->finalizeObject();
  return execution_engine;
}
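// Ownership note: wrapping `module` in a unique_ptr hands it to the
// EngineBuilder, so the resulting ExecutionEngine owns the module; the raw
// pointer stays usable only because the engine keeps it alive.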
std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenCPU(
    llvm::Function* query_func,
    llvm::Function* multifrag_query_func,
    const std::unordered_set<llvm::Function*>& live_funcs,
    const CompilationOptions& co) {
  auto module = multifrag_query_func->getParent();
  CodeCacheKey key{serialize_llvm_object(query_func),
                   serialize_llvm_object(cgen_state_->row_func_)};
  if (cgen_state_->filter_func_) {
    key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
  }
  for (const auto helper : cgen_state_->helper_functions_) {
    key.push_back(serialize_llvm_object(helper));
  }
  auto cached_code = getCodeFromCache(key, cpu_code_cache_);
  if (cached_code) {
    return cached_code;
  }

  if (cgen_state_->needs_geos_) {
#ifdef ENABLE_GEOS
    load_geos_dynamic_library();

    // Load the GEOS runtime module and link it into the generated code
    auto rt_geos_module_copy = llvm::CloneModule(
        *g_rt_geos_module.get(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
          auto func = llvm::dyn_cast<llvm::Function>(gv);
          if (!func) {
            return true;
          }
          return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
                  func->getLinkage() ==
                      llvm::GlobalValue::LinkageTypes::InternalLinkage ||
                  func->getLinkage() == llvm::GlobalValue::LinkageTypes::ExternalLinkage);
        });
    CodeGenerator::link_udf_module(rt_geos_module_copy,
                                   *module,
                                   cgen_state_.get(),
                                   llvm::Linker::Flags::LinkOnlyNeeded);
#else
    throw std::runtime_error("GEOS is disabled in this build");
#endif
  }

  auto execution_engine =
      CodeGenerator::generateNativeCPUCode(query_func, live_funcs, co);
  auto cpu_compilation_context =
      std::make_shared<CpuCompilationContext>(std::move(execution_engine));
  cpu_compilation_context->setFunctionPointer(multifrag_query_func);
  addCodeToCache(key, cpu_compilation_context, module, cpu_code_cache_);
  return cpu_compilation_context;
}
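// CPU compilation is keyed on the serialized IR of the participating
// functions: a cache hit swaps in the cached module and context, so the
// expensive generateNativeCPUCode() path only runs on a miss.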
void CodeGenerator::link_udf_module(const std::unique_ptr<llvm::Module>& udf_module,
                                    llvm::Module& module,
                                    CgenState* cgen_state,
                                    llvm::Linker::Flags flags) {
  // throw a runtime error if the target module contains functions
  // with the same names as functions in the UDF module
  for (auto& f : *udf_module.get()) {
    auto func = module.getFunction(f.getName());
    if (!(func == nullptr) && !f.isDeclaration() && flags == llvm::Linker::Flags::None) {
      LOG(ERROR) << "  Attempt to overwrite " << f.getName().str() << " in "
                 << module.getModuleIdentifier() << " from `"
                 << udf_module->getModuleIdentifier() << "`" << std::endl;
      throw std::runtime_error(
          "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
          "function ***");
    } else {
      VLOG(1) << "  Adding " << f.getName().str() << " to "
              << module.getModuleIdentifier() << " from `"
              << udf_module->getModuleIdentifier() << "`" << std::endl;
    }
  }

  std::unique_ptr<llvm::Module> udf_module_copy;

  udf_module_copy = llvm::CloneModule(*udf_module.get(), cgen_state->vmap_);

  udf_module_copy->setDataLayout(module.getDataLayout());
  udf_module_copy->setTargetTriple(module.getTargetTriple());

  // initialize the linker with the destination module
  llvm::Linker ld(module);
  bool link_error = false;

  link_error = ld.linkInModule(std::move(udf_module_copy), flags);

  if (link_error) {
    throw std::runtime_error("link_udf_module: *** error linking module ***");
  }
}
std::string cpp_to_llvm_name(const std::string& s) {
  if (s == "int8_t") {
    return "i8";
  }
  if (s == "int16_t") {
    return "i16";
  }
  if (s == "int32_t") {
    return "i32";
  }
  if (s == "int64_t") {
    return "i64";
  }
  CHECK(s == "float" || s == "double");
  return s;
}
std::string gen_array_any_all_sigs() {
  std::string result;
  for (const std::string any_or_all : {"any", "all"}) {
    for (const std::string elem_type :
         {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
      for (const std::string needle_type :
           {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
        for (const std::string op_name : {"eq", "ne", "lt", "le", "gt", "ge"}) {
          result += ("declare i1 @array_" + any_or_all + "_" + op_name + "_" + elem_type +
                     "_" + needle_type + "(i8*, i64, " + cpp_to_llvm_name(needle_type) +
                     ", " + cpp_to_llvm_name(elem_type) + ");\n");
        }
      }
    }
  }
  return result;
}
std::string gen_translate_null_key_sigs() {
  std::string result;
  for (const std::string key_type : {"int8_t", "int16_t", "int32_t", "int64_t"}) {
    const auto key_llvm_type = cpp_to_llvm_name(key_type);
    result += "declare i64 @translate_null_key_" + key_type + "(" + key_llvm_type +
              ", " + key_llvm_type + ", i64);\n";
  }
  return result;
}
const std::string cuda_rt_decls =
    R"(
declare void @llvm.dbg.declare(metadata, metadata, metadata)
declare void @llvm.dbg.value(metadata, metadata, metadata)
declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
declare i64 @get_thread_index();
declare i64 @get_block_index();
declare i32 @pos_start_impl(i32*);
declare i32 @group_buff_idx_impl();
declare i32 @pos_step_impl();
declare i8 @thread_warp_idx(i8);
declare i64* @init_shared_mem(i64*, i32);
declare i64* @init_shared_mem_nop(i64*, i32);
declare i64* @declare_dynamic_shared_memory();
declare void @write_back_nop(i64*, i64*, i32);
declare void @write_back_non_grouped_agg(i64*, i64*, i32);
declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8);
declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32);
declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32);
declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32);
declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32);
declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32);
declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32);
declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64);
declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64);
declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64);
declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64);
declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64);
declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double);
declare i64 @get_bucket_key_for_range_double(i8*, i64, double);
declare i32 @get_num_buckets_for_bounds(i8*, i32, double, double);
declare i64 @get_candidate_rows(i32*, i32, i8*, i32, double, double, i32, i64, i64*, i64, i64, i64);
declare i64 @agg_count_shared(i64*, i64);
declare i64 @agg_count_skip_val_shared(i64*, i64, i64);
declare i32 @agg_count_int32_shared(i32*, i32);
declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32);
declare i64 @agg_count_double_shared(i64*, double);
declare i64 @agg_count_double_skip_val_shared(i64*, double, double);
declare i32 @agg_count_float_shared(i32*, float);
declare i32 @agg_count_float_skip_val_shared(i32*, float, float);
declare i64 @agg_sum_shared(i64*, i64);
declare i64 @agg_sum_skip_val_shared(i64*, i64, i64);
declare i32 @agg_sum_int32_shared(i32*, i32);
declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_sum_double_shared(i64*, double);
declare void @agg_sum_double_skip_val_shared(i64*, double, double);
declare void @agg_sum_float_shared(i32*, float);
declare void @agg_sum_float_skip_val_shared(i32*, float, float);
declare void @agg_max_shared(i64*, i64);
declare void @agg_max_skip_val_shared(i64*, i64, i64);
declare void @agg_max_int32_shared(i32*, i32);
declare void @agg_max_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_max_int16_shared(i16*, i16);
declare void @agg_max_int16_skip_val_shared(i16*, i16, i16);
declare void @agg_max_int8_shared(i8*, i8);
declare void @agg_max_int8_skip_val_shared(i8*, i8, i8);
declare void @agg_max_double_shared(i64*, double);
declare void @agg_max_double_skip_val_shared(i64*, double, double);
declare void @agg_max_float_shared(i32*, float);
declare void @agg_max_float_skip_val_shared(i32*, float, float);
declare void @agg_min_shared(i64*, i64);
declare void @agg_min_skip_val_shared(i64*, i64, i64);
declare void @agg_min_int32_shared(i32*, i32);
declare void @agg_min_int32_skip_val_shared(i32*, i32, i32);
declare void @agg_min_int16_shared(i16*, i16);
declare void @agg_min_int16_skip_val_shared(i16*, i16, i16);
declare void @agg_min_int8_shared(i8*, i8);
declare void @agg_min_int8_skip_val_shared(i8*, i8, i8);
declare void @agg_min_double_shared(i64*, double);
declare void @agg_min_double_skip_val_shared(i64*, double, double);
declare void @agg_min_float_shared(i32*, float);
declare void @agg_min_float_skip_val_shared(i32*, float, float);
declare void @agg_id_shared(i64*, i64);
declare void @agg_id_int32_shared(i32*, i32);
declare void @agg_id_int16_shared(i16*, i16);
declare void @agg_id_int8_shared(i8*, i8);
declare void @agg_id_double_shared(i64*, double);
declare void @agg_id_double_shared_slow(i64*, double*);
declare void @agg_id_float_shared(i32*, float);
declare i32 @checked_single_agg_id_shared(i64*, i64, i64);
declare i32 @checked_single_agg_id_double_shared(i64*, double, double);
declare i32 @checked_single_agg_id_double_shared_slow(i64*, double*, double);
declare i32 @checked_single_agg_id_float_shared(i32*, float, float);
declare i1 @slotEmptyKeyCAS(i64*, i64, i64);
declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32);
declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16);
declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8);
declare i64 @datetrunc_century(i64);
declare i64 @datetrunc_day(i64);
declare i64 @datetrunc_decade(i64);
declare i64 @datetrunc_hour(i64);
declare i64 @datetrunc_millennium(i64);
declare i64 @datetrunc_minute(i64);
declare i64 @datetrunc_month(i64);
declare i64 @datetrunc_quarter(i64);
declare i64 @datetrunc_quarterday(i64);
declare i64 @datetrunc_week_monday(i64);
declare i64 @datetrunc_week_sunday(i64);
declare i64 @datetrunc_week_saturday(i64);
declare i64 @datetrunc_year(i64);
declare i64 @extract_epoch(i64);
declare i64 @extract_dateepoch(i64);
declare i64 @extract_quarterday(i64);
declare i64 @extract_hour(i64);
declare i64 @extract_minute(i64);
declare i64 @extract_second(i64);
declare i64 @extract_millisecond(i64);
declare i64 @extract_microsecond(i64);
declare i64 @extract_nanosecond(i64);
declare i64 @extract_dow(i64);
declare i64 @extract_isodow(i64);
declare i64 @extract_day(i64);
declare i64 @extract_week_monday(i64);
declare i64 @extract_week_sunday(i64);
declare i64 @extract_week_saturday(i64);
declare i64 @extract_day_of_year(i64);
declare i64 @extract_month(i64);
declare i64 @extract_quarter(i64);
declare i64 @extract_year(i64);
declare i64 @DateTruncateHighPrecisionToDate(i64, i64);
declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64);
declare i64 @DateDiff(i32, i64, i64);
declare i64 @DateDiffNullable(i32, i64, i64, i64);
declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i32);
declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i32, i64);
declare i64 @DateAdd(i32, i64, i64);
declare i64 @DateAddNullable(i32, i64, i64, i64);
declare i64 @DateAddHighPrecision(i32, i64, i64, i32);
declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i32, i64);
declare i64 @string_decode(i8*, i64);
declare i32 @array_size(i8*, i64, i32);
declare i32 @array_size_nullable(i8*, i64, i32, i32);
declare i32 @fast_fixlen_array_size(i8*, i32);
declare i1 @array_is_null(i8*, i64);
declare i1 @point_coord_array_is_null(i8*, i64);
declare i8* @array_buff(i8*, i64);
declare i8* @fast_fixlen_array_buff(i8*, i64);
declare i8 @array_at_int8_t(i8*, i64, i32);
declare i16 @array_at_int16_t(i8*, i64, i32);
declare i32 @array_at_int32_t(i8*, i64, i32);
declare i64 @array_at_int64_t(i8*, i64, i32);
declare float @array_at_float(i8*, i64, i32);
declare double @array_at_double(i8*, i64, i32);
declare i8 @varlen_array_at_int8_t(i8*, i64, i32);
declare i16 @varlen_array_at_int16_t(i8*, i64, i32);
declare i32 @varlen_array_at_int32_t(i8*, i64, i32);
declare i64 @varlen_array_at_int64_t(i8*, i64, i32);
declare float @varlen_array_at_float(i8*, i64, i32);
declare double @varlen_array_at_double(i8*, i64, i32);
declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32);
declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32);
declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32);
declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32);
declare float @varlen_notnull_array_at_float(i8*, i64, i32);
declare double @varlen_notnull_array_at_double(i8*, i64, i32);
declare i8 @array_at_int8_t_checked(i8*, i64, i64, i8);
declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16);
declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32);
declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64);
declare float @array_at_float_checked(i8*, i64, i64, float);
declare double @array_at_double_checked(i8*, i64, i64, double);
declare i32 @char_length(i8*, i32);
declare i32 @char_length_nullable(i8*, i32, i32);
declare i32 @char_length_encoded(i8*, i32);
declare i32 @char_length_encoded_nullable(i8*, i32, i32);
declare i32 @key_for_string_encoded(i32);
declare i1 @sample_ratio(double, i64);
declare i1 @string_like(i8*, i32, i8*, i32, i8);
declare i1 @string_ilike(i8*, i32, i8*, i32, i8);
declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8);
declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8);
declare i1 @string_like_simple(i8*, i32, i8*, i32);
declare i1 @string_ilike_simple(i8*, i32, i8*, i32);
declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8);
declare i1 @string_lt(i8*, i32, i8*, i32);
declare i1 @string_le(i8*, i32, i8*, i32);
declare i1 @string_gt(i8*, i32, i8*, i32);
declare i1 @string_ge(i8*, i32, i8*, i32);
declare i1 @string_eq(i8*, i32, i8*, i32);
declare i1 @string_ne(i8*, i32, i8*, i32);
declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8);
declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8);
declare i1 @regexp_like(i8*, i32, i8*, i32, i8);
declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8);
declare void @linear_probabilistic_count(i8*, i32, i8*, i32);
declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, i64);
declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64);
declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64);
declare void @record_error_code(i32, i32*);
declare i32 @get_error_code(i32*);
declare i1 @dynamic_watchdog();
declare i1 @check_interrupt();
declare void @force_sync();
declare void @sync_warp();
declare void @sync_warp_protected(i64, i64);
declare void @sync_threadblock();
declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32);
declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64);
declare i64* @get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, float, float);
declare i64* @get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double);
)" + gen_array_any_all_sigs() +
std::string extension_function_decls(const std::unordered_set<std::string>& udf_decls) {
  // NB: the whitelist helper used here is assumed from the surrounding codebase.
  const auto decls = ExtensionFunctionsWhitelist::getLLVMDeclarations(udf_decls);
  return boost::algorithm::join(decls, "\n");
}
void legalize_nvvm_ir(llvm::Function* query_func) {
  // optimizations may add attributes to the function, and the NVPTX backend
  // doesn't understand all of them; play it safe and clear all attributes
  clear_function_attributes(query_func);
  verify_function_ir(query_func);

  std::vector<llvm::Instruction*> stackrestore_intrinsics;
  std::vector<llvm::Instruction*> stacksave_intrinsics;
  for (auto& BB : *query_func) {
    for (llvm::Instruction& I : BB) {
      if (const llvm::IntrinsicInst* II = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
        if (II->getIntrinsicID() == llvm::Intrinsic::stacksave) {
          stacksave_intrinsics.push_back(&I);
        } else if (II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
          stackrestore_intrinsics.push_back(&I);
        }
      }
    }
  }

  for (auto& II : stackrestore_intrinsics) {
    II->eraseFromParent();
  }
  for (auto& II : stacksave_intrinsics) {
    II->eraseFromParent();
  }
}
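// NVVM cannot lower llvm.stacksave / llvm.stackrestore, so both intrinsic
// kinds are collected first and erased afterwards; restores go before saves
// because each stackrestore consumes its matching stacksave's result.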
llvm::StringRef get_gpu_target_triple_string() {
  return llvm::StringRef("nvptx64-nvidia-cuda");
}

llvm::StringRef get_gpu_data_layout() {
  return llvm::StringRef(
      "e-p:64:64:64-i1:8:8-i8:8:8-"
      "i16:16:16-i32:32:32-i64:64:64-"
      "f32:32:32-f64:64:64-v16:16:16-"
      "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
}
std::map<std::string, std::string> get_device_parameters() {
  std::map<std::string, std::string> result;

  result.insert(std::make_pair("cpu_name", llvm::sys::getHostCPUName()));
  result.insert(std::make_pair("cpu_triple", llvm::sys::getProcessTriple()));
  result.insert(
      std::make_pair("cpu_cores", std::to_string(llvm::sys::getHostNumPhysicalCores())));

  std::string sizeof_types;
  sizeof_types += "ssize_t:" + std::to_string(sizeof(ssize_t)) + ";";
  sizeof_types += "uchar:" + std::to_string(sizeof(unsigned char)) + ";";
  sizeof_types += "ushort:" + std::to_string(sizeof(unsigned short int)) + ";";
  sizeof_types += "uint:" + std::to_string(sizeof(unsigned int)) + ";";
  sizeof_types += "ulong:" + std::to_string(sizeof(unsigned long int)) + ";";
  sizeof_types += "longlong:" + std::to_string(sizeof(long long int)) + ";";
  sizeof_types += "ulonglong:" + std::to_string(sizeof(unsigned long long int)) + ";";
  sizeof_types += "longdouble:" + std::to_string(sizeof(long double)) + ";";
  result.insert(std::make_pair("type_sizeof", sizeof_types));

  std::string null_values;
  null_values += "boolean1:" + std::to_string(serialized_null_value<bool>()) + ";";
  null_values += "boolean8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
  null_values += "int8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
  null_values += "int16:" + std::to_string(serialized_null_value<int16_t>()) + ";";
  null_values += "int32:" + std::to_string(serialized_null_value<int32_t>()) + ";";
  null_values += "int64:" + std::to_string(serialized_null_value<int64_t>()) + ";";
  null_values += "uint8:" + std::to_string(serialized_null_value<uint8_t>()) + ";";
  null_values += "uint16:" + std::to_string(serialized_null_value<uint16_t>()) + ";";
  null_values += "uint32:" + std::to_string(serialized_null_value<uint32_t>()) + ";";
  null_values += "uint64:" + std::to_string(serialized_null_value<uint64_t>()) + ";";
  null_values += "float32:" + std::to_string(serialized_null_value<float>()) + ";";
  null_values += "float64:" + std::to_string(serialized_null_value<double>()) + ";";
  null_values +=
      "Array<boolean8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
  null_values +=
      "Array<int8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
  null_values +=
      "Array<int16>:" + std::to_string(serialized_null_value<int16_t, true>()) + ";";
  null_values +=
      "Array<int32>:" + std::to_string(serialized_null_value<int32_t, true>()) + ";";
  null_values +=
      "Array<int64>:" + std::to_string(serialized_null_value<int64_t, true>()) + ";";
  null_values +=
      "Array<float32>:" + std::to_string(serialized_null_value<float, true>()) + ";";
  null_values +=
      "Array<float64>:" + std::to_string(serialized_null_value<double, true>()) + ";";
  result.insert(std::make_pair("null_values", null_values));

  llvm::StringMap<bool> cpu_features;
  if (llvm::sys::getHostCPUFeatures(cpu_features)) {
    std::string features_str = "";
    for (auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
      features_str += (it->getValue() ? " +" : " -");
      features_str += it->getKey().str();
    }
    result.insert(std::make_pair("cpu_features", features_str));
  }

  result.insert(std::make_pair("llvm_version",
                               std::to_string(LLVM_VERSION_MAJOR) + "." +
                                   std::to_string(LLVM_VERSION_MINOR) + "." +
                                   std::to_string(LLVM_VERSION_PATCH)));
#ifdef HAVE_CUDA
  int device_count = 0;
  checkCudaErrors(cuDeviceGetCount(&device_count));
  if (device_count) {
    CUdevice device{};
    char device_name[256];
    int major = 0, minor = 0;
    int driver_version = 0;
    checkCudaErrors(cuDeviceGet(&device, 0));  // assumes a homogeneous multi-GPU system
    checkCudaErrors(cuDeviceGetName(device_name, 256, device));
    checkCudaErrors(cuDeviceGetAttribute(
        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
    checkCudaErrors(cuDeviceGetAttribute(
        &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
    checkCudaErrors(cuDriverGetVersion(&driver_version));

    result.insert(std::make_pair("gpu_name", device_name));
    result.insert(std::make_pair("gpu_count", std::to_string(device_count)));
    result.insert(std::make_pair("gpu_compute_capability",
                                 std::to_string(major) + "." + std::to_string(minor)));
    result.insert(std::make_pair("gpu_driver",
                                 "CUDA " + std::to_string(driver_version / 1000) + "." +
                                     std::to_string((driver_version % 1000) / 10)));
  }
#endif

  return result;
}
std::shared_ptr<GpuCompilationContext> CodeGenerator::generateNativeGPUCode(
    llvm::Function* func,
    llvm::Function* wrapper_func,
    const std::unordered_set<llvm::Function*>& live_funcs,
    const CompilationOptions& co,
    const GPUTarget& gpu_target) {
#ifdef HAVE_CUDA
  auto module = func->getParent();
  CHECK(gpu_target.cgen_state->module_ == module);
  module->setDataLayout(
      "e-p:64:64:64-i1:8:8-i8:8:8-"
      "i16:16:16-i32:32:32-i64:64:64-"
      "f32:32:32-f64:64:64-v16:16:16-"
      "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
  module->setTargetTriple("nvptx64-nvidia-cuda");
  CHECK(gpu_target.nvptx_target_machine);
  auto pass_manager_builder = llvm::PassManagerBuilder();

  pass_manager_builder.OptLevel = 0;
  llvm::legacy::PassManager module_pass_manager;
  pass_manager_builder.populateModulePassManager(module_pass_manager);

  bool requires_libdevice = check_module_requires_libdevice(module);

  if (requires_libdevice) {
    // the NVVM reflect settings only take effect if the target machine has
    // adjusted the pass pipeline, so run the function passes over the module
    gpu_target.nvptx_target_machine->adjustPassManager(pass_manager_builder);
    llvm::legacy::FunctionPassManager FPM(module);
    pass_manager_builder.populateFunctionPassManager(FPM);

    FPM.doInitialization();
    for (auto& F : *module) {
      FPM.run(F);
    }
    FPM.doFinalization();
  }

  // run optimizations
  optimize_ir(func, module, module_pass_manager, live_funcs, co);
  legalize_nvvm_ir(func);

  std::stringstream ss;
  llvm::raw_os_ostream os(ss);

  llvm::LLVMContext& ctx = module->getContext();
  // get the "nvvm.annotations" metadata node
  llvm::NamedMDNode* md = module->getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
                               llvm::MDString::get(ctx, "kernel"),
                               llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
                                   llvm::Type::getInt32Ty(ctx), 1))};

  // append the kernel annotation to nvvm.annotations
  md->addOperand(llvm::MDNode::get(ctx, md_vals));

  std::unordered_set<llvm::Function*> roots{wrapper_func, func};
  if (gpu_target.row_func_not_inlined) {
    roots.insert(gpu_target.cgen_state->row_func_);
    if (gpu_target.cgen_state->filter_func_) {
      roots.insert(gpu_target.cgen_state->filter_func_);
    }
  }

  // prevent the helper functions from being removed
  for (auto f : gpu_target.cgen_state->helper_functions_) {
    roots.insert(f);
  }

  // prevent UDF functions from being removed the way the runtime functions are
  std::unordered_set<std::string> udf_declarations;
  if (udf_gpu_module) {
    for (auto& f : udf_gpu_module->getFunctionList()) {
      llvm::Function* udf_function = module->getFunction(f.getName());
      if (udf_function) {
        legalize_nvvm_ir(udf_function);
        roots.insert(udf_function);

        // a UDF that declares an external function is noted here so duplicate
        // declarations can be avoided later
        if (f.isDeclaration()) {
          udf_declarations.insert(f.getName().str());
        }
      }
    }
  }

  if (rt_udf_gpu_module) {
    for (auto& f : rt_udf_gpu_module->getFunctionList()) {
      llvm::Function* udf_function = module->getFunction(f.getName());
      if (udf_function) {
        legalize_nvvm_ir(udf_function);
        roots.insert(udf_function);

        if (f.isDeclaration()) {
          udf_declarations.insert(f.getName().str());
        }
      }
    }
  }

  std::vector<llvm::Function*> rt_funcs;
  for (auto& Fn : *module) {
    if (roots.count(&Fn)) {
      continue;
    }
    rt_funcs.push_back(&Fn);
  }
  for (auto& pFn : rt_funcs) {
    pFn->removeFromParent();
  }

  if (requires_libdevice) {
    add_intrinsics_to_module(module);
  }

  module->print(os, nullptr);
  os.flush();

  for (auto& pFn : rt_funcs) {
    module->getFunctionList().push_back(pFn);
  }
  module->eraseNamedMetadata(md);

  auto cuda_llir = ss.str() + cuda_rt_decls + extension_function_decls(udf_declarations);
  std::string ptx;
  try {
    ptx = generatePTX(
        cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state->context_);
  } catch (ParseIRError& e) {
    LOG(WARNING) << "Failed to generate PTX: " << e.what()
                 << ". Switching to CPU execution target.";
    throw QueryMustRunOnCpu();
  }
  LOG(PTX) << "PTX for the GPU:\n" << ptx << "\nEnd of PTX";

  auto cubin_result = ptx_to_cubin(ptx, gpu_target.block_size, gpu_target.cuda_mgr);
  auto& option_keys = cubin_result.option_keys;
  auto& option_values = cubin_result.option_values;
  auto cubin = cubin_result.cubin;
  auto link_state = cubin_result.link_state;
  const auto num_options = option_keys.size();

  auto func_name = wrapper_func->getName().str();
  auto gpu_compilation_context = std::make_shared<GpuCompilationContext>();
  for (int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
       ++device_id) {
    gpu_compilation_context->addDeviceCode(
        std::make_unique<GpuDeviceCompilationContext>(cubin,
                                                      func_name,
                                                      device_id,
                                                      gpu_target.cuda_mgr,
                                                      num_options,
                                                      &option_keys[0],
                                                      &option_values[0]));
  }

  checkCudaErrors(cuLinkDestroy(link_state));
  return gpu_compilation_context;
#else
  return {};
#endif
}
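// GPU pipeline recap: optimize and legalize the NVVM IR, tag the kernel via
// nvvm.annotations metadata, print everything except the runtime functions to
// textual IR, prepend cuda_rt_decls so runtime symbols resolve, then
// generatePTX() and ptx_to_cubin() produce the binary loaded on each device.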
std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenGPU(
    llvm::Function* query_func,
    llvm::Function* multifrag_query_func,
    std::unordered_set<llvm::Function*>& live_funcs,
    const bool no_inline,
    const CudaMgr_Namespace::CudaMgr* cuda_mgr,
    const CompilationOptions& co) {
#ifdef HAVE_CUDA
  auto module = multifrag_query_func->getParent();
  CHECK(cuda_mgr);
  CodeCacheKey key{serialize_llvm_object(query_func),
                   serialize_llvm_object(cgen_state_->row_func_)};
  if (cgen_state_->filter_func_) {
    key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
  }
  for (const auto helper : cgen_state_->helper_functions_) {
    key.push_back(serialize_llvm_object(helper));
  }
  auto cached_code = getCodeFromCache(key, gpu_code_cache_);
  if (cached_code) {
    return cached_code;
  }

  bool row_func_not_inlined = false;
  if (no_inline) {
    for (auto it = llvm::inst_begin(cgen_state_->row_func_),
              e = llvm::inst_end(cgen_state_->row_func_);
         it != e;
         ++it) {
      if (llvm::isa<llvm::CallInst>(*it)) {
        auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
        if (get_gv_call.getCalledFunction()->getName() == "array_size" ||
            get_gv_call.getCalledFunction()->getName() == "linear_probabilistic_count") {
          mark_function_never_inline(cgen_state_->row_func_);
          row_func_not_inlined = true;
          break;
        }
      }
    }
  }

  initializeNVPTXBackend();
  CodeGenerator::GPUTarget gpu_target{nvptx_target_machine_.get(),
                                      cuda_mgr,
                                      blockSize(),
                                      cgen_state_.get(),
                                      row_func_not_inlined};
  std::shared_ptr<GpuCompilationContext> compilation_context;

  if (check_module_requires_libdevice(module)) {
    if (g_rt_libdevice_module == nullptr) {
      // raise a runtime error to avoid an unresolved-symbol failure at link time
      throw std::runtime_error(
          "libdevice library is not available but required by the UDF module");
    }

    // bind libdevice to the current query module
    CodeGenerator::link_udf_module(g_rt_libdevice_module,
                                   *module,
                                   cgen_state_.get(),
                                   llvm::Linker::Flags::OverrideFromSrc);

    // activate the nvvm-reflect-ftz flag on the module
    module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", (int)1);
    for (llvm::Function& fn : *module) {
      fn.addFnAttr("nvptx-f32ftz", "true");
    }
  }

  try {
    compilation_context = CodeGenerator::generateNativeGPUCode(
        query_func, multifrag_query_func, live_funcs, co, gpu_target);
    addCodeToCache(key, compilation_context, module, gpu_code_cache_);
  } catch (CudaMgr_Namespace::CudaErrorException& cuda_error) {
    if (cuda_error.getStatus() == CUDA_ERROR_OUT_OF_MEMORY) {
      // thrown if memory could not be allocated on the GPU;
      // retry once after evicting part of the code cache
      LOG(WARNING) << "Failed to allocate GPU memory for generated code. Evicting "
                   << g_fraction_code_cache_to_evict * 100.
                   << "% of GPU code cache and re-trying.";
      gpu_code_cache_.evictFractionEntries(g_fraction_code_cache_to_evict);
      compilation_context = CodeGenerator::generateNativeGPUCode(
          query_func, multifrag_query_func, live_funcs, co, gpu_target);
      addCodeToCache(key, compilation_context, module, gpu_code_cache_);
    } else {
      throw;
    }
  }
  CHECK(compilation_context);
  return compilation_context;
#else
  return nullptr;
#endif
}
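// On CUDA_ERROR_OUT_OF_MEMORY part of the GPU code cache is evicted and the
// compilation is retried exactly once; other CUDA errors propagate unchanged.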
std::string CodeGenerator::generatePTX(const std::string& cuda_llir,
                                       llvm::TargetMachine* nvptx_target_machine,
                                       llvm::LLVMContext& context) {
  auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir, "", false);

  llvm::SMDiagnostic parse_error;

  auto module = llvm::parseIR(mem_buff->getMemBufferRef(), parse_error, context);
  if (!module) {
    LOG(IR) << "CodeGenerator::generatePTX:NVVM IR:\n" << cuda_llir << "\nEnd of NVVM IR";
    throw_parseIR_error(parse_error, "generatePTX", /*is_gpu=*/true);
  }

  llvm::SmallString<256> code_str;
  llvm::raw_svector_ostream formatted_os(code_str);
  CHECK(nvptx_target_machine);
  {
    llvm::legacy::PassManager ptxgen_pm;
    module->setDataLayout(nvptx_target_machine->createDataLayout());

#if LLVM_VERSION_MAJOR >= 10
    nvptx_target_machine->addPassesToEmitFile(
        ptxgen_pm, formatted_os, nullptr, llvm::CGFT_AssemblyFile);
#else
    nvptx_target_machine->addPassesToEmitFile(
        ptxgen_pm, formatted_os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
#endif
    ptxgen_pm.run(*module);
  }

#if LLVM_VERSION_MAJOR >= 11
  return std::string(code_str);
#else
  return code_str.str();
#endif
}
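// PTX generation round-trips through text: the IR string is re-parsed into a
// fresh module in the given context, then run through the NVPTX backend's
// assembly printer into an in-memory buffer; nothing touches disk.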
std::unique_ptr<llvm::TargetMachine> CodeGenerator::initializeNVPTXBackend(
    const CudaMgr_Namespace::NvidiaDeviceArch arch) {
  llvm::InitializeAllTargets();
  llvm::InitializeAllTargetMCs();
  llvm::InitializeAllAsmPrinters();

  std::string err;
  auto target = llvm::TargetRegistry::lookupTarget("nvptx64", err);
  if (!target) {
    LOG(FATAL) << err;
  }
  // NB: the SM-string helper below is assumed from the surrounding codebase.
  return std::unique_ptr<llvm::TargetMachine>(
      target->createTargetMachine("nvptx64-nvidia-cuda",
                                  CudaMgr_Namespace::CudaMgr::deviceArchToSM(arch),
                                  "",
                                  llvm::TargetOptions(),
                                  llvm::Reloc::Static));
}
std::string Executor::generatePTX(const std::string& cuda_llir) const {
  return CodeGenerator::generatePTX(
      cuda_llir, nvptx_target_machine_.get(), cgen_state_->context_);
}

void Executor::initializeNVPTXBackend() const {
  if (nvptx_target_machine_) {
    return;
  }
  const auto cuda_mgr = catalog_->getDataMgr().getCudaMgr();
  LOG_IF(FATAL, cuda_mgr == nullptr) << "No CudaMgr instantiated, unable to check device "
                                        "architecture or generate code for nvidia GPUs.";
  const auto arch = cuda_mgr->getDeviceArch();
  nvptx_target_machine_ = CodeGenerator::initializeNVPTXBackend(arch);
}
// a small number of runtime functions must always be cloned from the runtime
// module instead of being treated as external symbols; list them here
bool CodeGenerator::alwaysCloneRuntimeFunction(const llvm::Function* func) {
  return func->getName() == "query_stub_hoisted_literals" ||
         func->getName() == "multifrag_query_hoisted_literals" ||
         func->getName() == "query_stub" || func->getName() == "multifrag_query" ||
         func->getName() == "fixed_width_int_decode" ||
         func->getName() == "fixed_width_unsigned_decode" ||
         func->getName() == "diff_fixed_width_int_decode" ||
         func->getName() == "fixed_width_double_decode" ||
         func->getName() == "fixed_width_float_decode" ||
         func->getName() == "fixed_width_small_date_decode" ||
         func->getName() == "record_error_code" || func->getName() == "get_error_code" ||
         func->getName() == "pos_start_impl" || func->getName() == "pos_step_impl" ||
         func->getName() == "group_buff_idx_impl" ||
         func->getName() == "init_shared_mem" ||
         func->getName() == "init_shared_mem_nop" || func->getName() == "write_back_nop";
}
llvm::Module* read_template_module(llvm::LLVMContext& context) {
  llvm::SMDiagnostic err;

  // NB: the root-path helper name is assumed from the surrounding codebase.
  auto buffer_or_error = llvm::MemoryBuffer::getFile(omnisci::get_root_abs_path() +
                                                     "/QueryEngine/RuntimeFunctions.bc");
  CHECK(!buffer_or_error.getError());
  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();

  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
  CHECK(!owner.takeError());
  auto module = owner.get().release();
  CHECK(module);

  return module;
}
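// RuntimeFunctions.bc holds the precompiled query runtime (agg_*, decoders,
// etc.); modules produced from this template carry those definitions so the
// optimizer can inline them into generated queries.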
llvm::Module* read_libdevice_module(llvm::LLVMContext& context) {
  llvm::SMDiagnostic err;

  // NB: the original lookup of the CUDA root was lost; an environment-based
  // default is assumed here.
  const char* env = std::getenv("CUDA_HOME");
  boost::filesystem::path cuda_path{env};
  cuda_path /= "nvvm";
  cuda_path /= "libdevice";
  cuda_path /= "libdevice.10.bc";

  if (!boost::filesystem::exists(cuda_path)) {
    LOG(WARNING) << "Could not find CUDA libdevice; support for some UDF "
                    "functions might not be available.";
    return nullptr;
  }

  auto buffer_or_error = llvm::MemoryBuffer::getFile(cuda_path.c_str());
  CHECK(!buffer_or_error.getError());
  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();

  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
  CHECK(!owner.takeError());
  auto module = owner.get().release();
  CHECK(module);

  return module;
}
llvm::Module* read_geos_module(llvm::LLVMContext& context) {
  llvm::SMDiagnostic err;

  // NB: the root-path helper name is assumed from the surrounding codebase.
  auto buffer_or_error = llvm::MemoryBuffer::getFile(omnisci::get_root_abs_path() +
                                                     "/QueryEngine/GeosRuntime.bc");
  CHECK(!buffer_or_error.getError());
  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();

  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
  CHECK(!owner.takeError());
  auto module = owner.get().release();
  CHECK(module);

  return module;
}
void bind_pos_placeholders(const std::string& pos_fn_name,
                           const bool use_resume_param,
                           llvm::Function* query_func,
                           llvm::Module* module) {
  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
       ++it) {
    if (!llvm::isa<llvm::CallInst>(*it)) {
      continue;
    }
    auto& pos_call = llvm::cast<llvm::CallInst>(*it);
    if (std::string(pos_call.getCalledFunction()->getName()) == pos_fn_name) {
      if (use_resume_param) {
        const auto error_code_arg = get_arg_by_name(query_func, "error_code");
        llvm::ReplaceInstWithInst(
            &pos_call,
            llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl"),
                                   error_code_arg));
      } else {
        llvm::ReplaceInstWithInst(
            &pos_call,
            llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl")));
      }
      break;
    }
  }
}
void set_row_func_argnames(llvm::Function* row_func,
                           const size_t in_col_count,
                           const size_t agg_col_count,
                           const bool hoist_literals) {
  auto arg_it = row_func->arg_begin();

  if (agg_col_count) {
    for (size_t i = 0; i < agg_col_count; ++i) {
      arg_it->setName("out");
      ++arg_it;
    }
  } else {
    arg_it->setName("group_by_buff");
    ++arg_it;
    arg_it->setName("crt_matched");
    ++arg_it;
    arg_it->setName("total_matched");
    ++arg_it;
    arg_it->setName("old_total_matched");
    ++arg_it;
    arg_it->setName("max_matched");
    ++arg_it;
  }

  arg_it->setName("agg_init_val");
  ++arg_it;

  arg_it->setName("pos");
  ++arg_it;

  arg_it->setName("frag_row_off");
  ++arg_it;

  arg_it->setName("num_rows_per_scan");
  ++arg_it;

  if (hoist_literals) {
    arg_it->setName("literals");
    ++arg_it;
  }

  for (size_t i = 0; i < in_col_count; ++i) {
    arg_it->setName("col_buf" + std::to_string(i));
    ++arg_it;
  }

  arg_it->setName("join_hash_tables");
}
llvm::Function* create_row_function(const size_t in_col_count,
                                    const size_t agg_col_count,
                                    const bool hoist_literals,
                                    llvm::Module* module,
                                    llvm::LLVMContext& context) {
  std::vector<llvm::Type*> row_process_arg_types;

  if (agg_col_count) {
    // output (aggregate) arguments
    for (size_t i = 0; i < agg_col_count; ++i) {
      row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
    }
  } else {
    // group by buffer
    row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
    // current match count
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
    // total match count passed from the caller
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
    // old total match count returned to the caller
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
    // max matched (total number of slots in the output buffer)
    row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
  }

  // aggregate init values
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));

  // position argument
  row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));

  // fragment row offset argument
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));

  // number of rows for each scan
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));

  // literals buffer argument
  if (hoist_literals) {
    row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
  }

  // column buffer arguments
  for (size_t i = 0; i < in_col_count; ++i) {
    row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
  }

  // join hash table argument
  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));

  // generate the function
  auto ft =
      llvm::FunctionType::get(get_int_type(32, context), row_process_arg_types, false);

  auto row_func =
      llvm::Function::Create(ft, llvm::Function::ExternalLinkage, "row_func", module);

  // set the function argument names for readability
  set_row_func_argnames(row_func, in_col_count, agg_col_count, hoist_literals);

  return row_func;
}
void bind_query(llvm::Function* query_func,
                const std::string& query_fname,
                llvm::Function* multifrag_query_func,
                llvm::Module* module) {
  std::vector<llvm::CallInst*> query_stubs;
  for (auto it = llvm::inst_begin(multifrag_query_func),
            e = llvm::inst_end(multifrag_query_func);
       it != e;
       ++it) {
    if (!llvm::isa<llvm::CallInst>(*it)) {
      continue;
    }
    auto& query_call = llvm::cast<llvm::CallInst>(*it);
    if (std::string(query_call.getCalledFunction()->getName()) == query_fname) {
      query_stubs.push_back(&query_call);
    }
  }
  for (auto& S : query_stubs) {
    std::vector<llvm::Value*> args;
    for (size_t i = 0; i < S->getNumArgOperands(); ++i) {
      args.push_back(S->getArgOperand(i));
    }
    llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args, ""));
  }
}
std::vector<std::string> get_agg_fnames(const std::vector<Analyzer::Expr*>& target_exprs,
                                        const bool is_group_by) {
  std::vector<std::string> result;
  for (size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
       ++target_idx, ++agg_col_idx) {
    const auto target_expr = target_exprs[target_idx];
    CHECK(target_expr);
    const auto target_type_info = target_expr->get_type_info();
    const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
    const bool is_varlen =
        (target_type_info.is_string() &&
         target_type_info.get_compression() == kENCODING_NONE) ||
        target_type_info.is_array();
    if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
      result.emplace_back(target_type_info.is_fp() ? "agg_id_double" : "agg_id");
      if (is_varlen) {
        result.emplace_back("agg_id");
      }
      if (target_type_info.is_geometry()) {
        result.emplace_back("agg_id");
        for (auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
          result.emplace_back("agg_id");
        }
      }
      continue;
    }
    const auto agg_type = agg_expr->get_aggtype();
    const auto& agg_type_info =
        agg_type != kCOUNT ? agg_expr->get_arg()->get_type_info() : target_type_info;
    switch (agg_type) {
      case kAVG: {
        if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
            !agg_type_info.is_fp()) {
          throw std::runtime_error("AVG is only valid on integer and floating point");
        }
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_sum"
                                : "agg_sum_double");
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_count"
                                : "agg_count_double");
        break;
      }
      case kMIN: {
        if (agg_type_info.is_string() || agg_type_info.is_array() ||
            agg_type_info.is_geometry()) {
          throw std::runtime_error(
              "MIN on strings, arrays or geospatial types not supported yet");
        }
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_min"
                                : "agg_min_double");
        break;
      }
      case kMAX: {
        if (agg_type_info.is_string() || agg_type_info.is_array() ||
            agg_type_info.is_geometry()) {
          throw std::runtime_error(
              "MAX on strings, arrays or geospatial types not supported yet");
        }
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_max"
                                : "agg_max_double");
        break;
      }
      case kSUM: {
        if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
            !agg_type_info.is_fp()) {
          throw std::runtime_error("SUM is only valid on integer and floating point");
        }
        result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
                                ? "agg_sum"
                                : "agg_sum_double");
        break;
      }
      case kCOUNT:
        result.emplace_back(agg_expr->get_is_distinct() ? "agg_count_distinct"
                                                        : "agg_count");
        break;
      case kSINGLE_VALUE: {
        result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
        break;
      }
      case kSAMPLE: {
        result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
        break;
      }
      case kAPPROX_COUNT_DISTINCT:
        result.emplace_back("agg_approximate_count_distinct");
        break;
      case kAPPROX_MEDIAN:
        result.emplace_back("agg_approx_median");
        break;
      default:
        CHECK(false);
    }
  }
  return result;
}
#ifdef HAVE_CUDA
std::unique_ptr<llvm::Module> g_rt_libdevice_module(
    read_libdevice_module(getGlobalLLVMContext()));
#endif
void read_udf_gpu_module(const std::string& udf_ir_filename) {
  llvm::SMDiagnostic parse_error;

  llvm::StringRef file_name_arg(udf_ir_filename);

  udf_gpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
  if (!udf_gpu_module) {
    throw_parseIR_error(parse_error, udf_ir_filename);
  }

  llvm::Triple gpu_triple(udf_gpu_module->getTargetTriple());
  if (!gpu_triple.isNVPTX()) {
    LOG(WARNING)
        << "Expected triple nvptx64-nvidia-cuda for NVVM IR of loadtime UDFs but got "
        << gpu_triple.str() << ". Disabling the NVVM IR module.";
    udf_gpu_module = nullptr;
  }
}
void read_udf_cpu_module(const std::string& udf_ir_filename) {
  llvm::SMDiagnostic parse_error;

  llvm::StringRef file_name_arg(udf_ir_filename);

  udf_cpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
  if (!udf_cpu_module) {
    throw_parseIR_error(parse_error, udf_ir_filename);
  }
}
void read_rt_udf_gpu_module(const std::string& udf_ir_string) {
  llvm::SMDiagnostic parse_error;

  auto buf =
      std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for GPU");

  rt_udf_gpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
  if (!rt_udf_gpu_module) {
    LOG(IR) << "read_rt_udf_gpu_module:NVVM IR:\n" << udf_ir_string << "\nEnd of NVVM IR";
    throw_parseIR_error(parse_error, "", /*is_gpu=*/true);
  }

  llvm::Triple gpu_triple(rt_udf_gpu_module->getTargetTriple());
  if (!gpu_triple.isNVPTX()) {
    LOG(IR) << "read_rt_udf_gpu_module:NVVM IR:\n" << udf_ir_string << "\nEnd of NVVM IR";
    LOG(WARNING) << "Expected triple nvptx64-nvidia-cuda for NVVM IR but got "
                 << gpu_triple.str()
                 << ". Executing runtime UDFs on GPU will be disabled.";
    rt_udf_gpu_module = nullptr;
  }
}
void read_rt_udf_cpu_module(const std::string& udf_ir_string) {
  llvm::SMDiagnostic parse_error;

  auto buf =
      std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for CPU");

  rt_udf_cpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
  if (!rt_udf_cpu_module) {
    LOG(IR) << "read_rt_udf_cpu_module:LLVM IR:\n" << udf_ir_string << "\nEnd of LLVM IR";
    throw_parseIR_error(parse_error);
  }
}
std::unordered_set<llvm::Function*> CodeGenerator::markDeadRuntimeFuncs(
    llvm::Module& module,
    const std::vector<llvm::Function*>& roots,
    const std::vector<llvm::Function*>& leaves) {
  std::unordered_set<llvm::Function*> live_funcs;
  live_funcs.insert(roots.begin(), roots.end());
  live_funcs.insert(leaves.begin(), leaves.end());

  if (auto F = module.getFunction("init_shared_mem_nop")) {
    live_funcs.insert(F);
  }
  if (auto F = module.getFunction("write_back_nop")) {
    live_funcs.insert(F);
  }

  for (const llvm::Function* F : roots) {
    for (const llvm::BasicBlock& BB : *F) {
      for (const llvm::Instruction& I : BB) {
        if (const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
          live_funcs.insert(CI->getCalledFunction());
        }
      }
    }
  }

  for (llvm::Function& F : module) {
    if (!live_funcs.count(&F) && !F.isDeclaration()) {
      F.setLinkage(llvm::GlobalValue::InternalLinkage);
    }
  }

  return live_funcs;
}
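// Unreachable definitions are demoted to internal linkage rather than deleted
// here; the optimizer's global DCE can then drop them, while the *_nop helpers
// are force-kept even when unreferenced at this point.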
// searches for a particular variable within a specific basic block (or all
// blocks if bb_name is empty)
template <typename InstType>
llvm::Value* find_variable_in_basic_block(llvm::Function* func,
                                          std::string bb_name,
                                          std::string variable_name) {
  llvm::Value* result = nullptr;
  if (func == nullptr || variable_name.empty()) {
    return result;
  }
  bool is_found = false;
  for (auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
    if (!bb_name.empty() && bb_it->getName() != bb_name) {
      continue;
    }
    for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
      if (llvm::isa<InstType>(*inst_it)) {
        if (inst_it->getName() == variable_name) {
          result = &*inst_it;
          is_found = true;
          break;
        }
      }
    }
  }
  return result;
}
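// Used below to locate the "row_count" load in the ".entry" block of the GPU
// query function before watchdog/interrupt control flow is stitched in.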
void Executor::createErrorCheckControlFlow(
    llvm::Function* query_func,
    bool run_with_dynamic_watchdog,
    bool run_with_allowing_runtime_interrupt,
    ExecutorDeviceType device_type,
    const std::vector<InputTableInfo>& input_table_infos) {
  // check whether the row processing was successful; currently, it can
  // fail by running out of group by buffer slots

  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
    // when both the dynamic watchdog and the runtime interrupt are enabled,
    // use only the dynamic watchdog
    run_with_allowing_runtime_interrupt = false;
  }

  {
    // disable injecting the interrupt checker if the session is invalid
    mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
    if (current_query_session_.empty()) {
      run_with_allowing_runtime_interrupt = false;
    }
  }

  llvm::Value* row_count = nullptr;
  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
      device_type == ExecutorDeviceType::GPU) {
    row_count =
        find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
  }

  bool done_splitting = false;
  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
       ++bb_it) {
    llvm::Value* pos = nullptr;
    for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
      if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
          llvm::isa<llvm::PHINode>(*inst_it)) {
        if (inst_it->getName() == "pos") {
          pos = &*inst_it;
        }
        continue;
      }
      if (!llvm::isa<llvm::CallInst>(*inst_it)) {
        continue;
      }
      auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
      if (std::string(row_func_call.getCalledFunction()->getName()) == "row_process") {
        auto next_inst_it = inst_it;
        ++next_inst_it;
        auto new_bb = bb_it->splitBasicBlock(next_inst_it);
        auto& br_instr = bb_it->back();
        llvm::IRBuilder<> ir_builder(&br_instr);
        llvm::Value* err_lv = &*inst_it;
        llvm::Value* err_lv_returned_from_row_func = nullptr;
        if (run_with_dynamic_watchdog) {
          CHECK(pos);
          llvm::Value* call_watchdog_lv = nullptr;
          if (device_type == ExecutorDeviceType::GPU) {
            // only blocks in which no thread has hit the critical edge take the
            // watchdog path, so all threads of a block see the same decision
            CHECK(row_count);
            auto crit_edge_rem =
                (blockSize() & (blockSize() - 1))
                    ? ir_builder.CreateSRem(
                          row_count,
                          cgen_state_->llInt(static_cast<int64_t>(blockSize())))
                    : ir_builder.CreateAnd(
                          row_count,
                          cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
            auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
            crit_edge_threshold->setName("crit_edge_threshold");

            call_watchdog_lv =
                ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
          } else {
            // CPU path: run the watchdog for every 64th row
            auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
            call_watchdog_lv = ir_builder.CreateICmp(
                llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
          }
          CHECK(call_watchdog_lv);
          auto error_check_bb = bb_it->splitBasicBlock(
              llvm::BasicBlock::iterator(br_instr), ".error_check");
          auto& watchdog_br_instr = bb_it->back();

          auto watchdog_check_bb = llvm::BasicBlock::Create(
              cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
          llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
          auto detected_timeout = watchdog_ir_builder.CreateCall(
              cgen_state_->module_->getFunction("dynamic_watchdog"), {});
          auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
              detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
          watchdog_ir_builder.CreateBr(error_check_bb);

          llvm::ReplaceInstWithInst(
              &watchdog_br_instr,
              llvm::BranchInst::Create(
                  watchdog_check_bb, error_check_bb, call_watchdog_lv));
          ir_builder.SetInsertPoint(&br_instr);
          auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);

          unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
          unified_err_lv->addIncoming(err_lv, &*bb_it);
          err_lv = unified_err_lv;
        } else if (run_with_allowing_runtime_interrupt) {
          CHECK(pos);
          llvm::Value* call_check_interrupt_lv = nullptr;
          if (device_type == ExecutorDeviceType::GPU) {
            // approximate how often the %pos variable advances per thread:
            // each iteration moves pos by gridSize() * blockSize() * 2
            int num_shift_by_gridDim = shared::getExpOfTwo(gridSize());
            int num_shift_by_blockDim = shared::getExpOfTwo(blockSize());
            int total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
            uint64_t interrupt_checking_freq = 32;
            auto freq_control_knob = g_running_query_interrupt_freq;
            if (!input_table_infos.empty()) {
              const auto& outer_table_info = *input_table_infos.begin();
              auto num_outer_table_tuples = outer_table_info.info.getNumTuples();
              if (outer_table_info.table_id < 0) {
                auto* rs = (*outer_table_info.info.fragments.begin()).resultSet;
                CHECK(rs);
                num_outer_table_tuples = rs->entryCount();
              } else {
                auto num_frags = outer_table_info.info.fragments.size();
                if (num_frags > 0) {
                  num_outer_table_tuples =
                      outer_table_info.info.fragments.begin()->getNumTuples();
                }
              }
              if (num_outer_table_tuples > 0) {
                // maximum number of pos increments per thread for this fragment
                auto max_inc = uint64_t(
                    floor(num_outer_table_tuples / (gridSize() * blockSize() * 2)));
                // scale the frequency by the control knob: a smaller calibrated
                // increment means more frequent interrupt checks
                auto calibrated_inc = uint64_t(floor(max_inc * (1 - freq_control_knob)));
                interrupt_checking_freq =
                    uint64_t(pow(2, shared::getExpOfTwo(calibrated_inc)));
                // keep coverage when the frequency overshoots the max increment
                if (interrupt_checking_freq > max_inc) {
                  interrupt_checking_freq = max_inc / 2;
                }
                if (interrupt_checking_freq < 8) {
                  // too frequent checking hurts performance; clamp to 8
                  interrupt_checking_freq = 8;
                }
              }
            }
            VLOG(1) << "Set the running query interrupt checking frequency: "
                    << interrupt_checking_freq;
            llvm::Value* pos_shifted_per_iteration =
                ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
            auto interrupt_predicate =
                ir_builder.CreateAnd(pos_shifted_per_iteration, interrupt_checking_freq);
            call_check_interrupt_lv =
                ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
                                      interrupt_predicate,
                                      cgen_state_->llInt(int64_t(0LL)));
          } else {
            // CPU path: check the interrupt flag for every 64th row
            auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
            call_check_interrupt_lv =
                ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
                                      interrupt_predicate,
                                      cgen_state_->llInt(int64_t(0LL)));
          }
          CHECK(call_check_interrupt_lv);
          auto error_check_bb = bb_it->splitBasicBlock(
              llvm::BasicBlock::iterator(br_instr), ".error_check");
          auto& check_interrupt_br_instr = bb_it->back();

          auto interrupt_check_bb = llvm::BasicBlock::Create(
              cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
          llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
          auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
              cgen_state_->module_->getFunction("check_interrupt"), {});
          auto interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
              detected_interrupt, cgen_state_->llInt(Executor::ERR_INTERRUPTED), err_lv);
          interrupt_checker_ir_builder.CreateBr(error_check_bb);

          llvm::ReplaceInstWithInst(
              &check_interrupt_br_instr,
              llvm::BranchInst::Create(
                  interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
          ir_builder.SetInsertPoint(&br_instr);
          auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);

          unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
          unified_err_lv->addIncoming(err_lv, &*bb_it);
          err_lv = unified_err_lv;
        }
        if (!err_lv_returned_from_row_func) {
          err_lv_returned_from_row_func = err_lv;
        }
        if (device_type == ExecutorDeviceType::GPU && run_with_dynamic_watchdog) {
          // let the kernel run to completion on any error other than the
          // watchdog timeout, where all threads of a block return together
          err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
                                         err_lv,
                                         cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
        } else {
          err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
                                         err_lv,
                                         cgen_state_->llInt(static_cast<int32_t>(0)));
        }
        auto error_bb = llvm::BasicBlock::Create(
            cgen_state_->context_, ".error_exit", query_func, new_bb);
        const auto error_code_arg = get_arg_by_name(query_func, "error_code");
        llvm::CallInst::Create(
            cgen_state_->module_->getFunction("record_error_code"),
            std::vector<llvm::Value*>{err_lv_returned_from_row_func, error_code_arg},
            "",
            error_bb);
        llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
        llvm::ReplaceInstWithInst(&br_instr,
                                  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
        done_splitting = true;
        break;
      }
    }
  }
  CHECK(done_splitting);
}
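// Net effect: the block containing the row_process call is split; an optional
// ".watchdog_check" or ".interrupt_check" block is spliced in front of the
// ".error_check" block, and ".error_exit" records the error code and returns
// early instead of falling through to the next row.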
1878 std::vector<llvm::Value*> hoisted_literals;
1882 std::vector<llvm::Type*> row_process_arg_types;
1884 for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1885 E = cgen_state_->row_func_->arg_end();
1888 row_process_arg_types.push_back(I->getType());
1891 for (
auto& element : cgen_state_->query_func_literal_loads_) {
1892 for (
auto value : element.second) {
1893 row_process_arg_types.push_back(value->getType());
1897 auto ft = llvm::FunctionType::get(
1898 get_int_type(32, cgen_state_->context_), row_process_arg_types,
false);
1899 auto row_func_with_hoisted_literals =
1900 llvm::Function::Create(ft,
1901 llvm::Function::ExternalLinkage,
1902 "row_func_hoisted_literals",
1903 cgen_state_->row_func_->getParent());
1905 auto row_func_arg_it = row_func_with_hoisted_literals->arg_begin();
1906 for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1907 E = cgen_state_->row_func_->arg_end();
1911 row_func_arg_it->setName(I->getName());
1916 decltype(row_func_with_hoisted_literals) filter_func_with_hoisted_literals{
nullptr};
1917 decltype(row_func_arg_it) filter_func_arg_it{
nullptr};
1918 if (cgen_state_->filter_func_) {
1921 std::vector<llvm::Type*> filter_func_arg_types;
1923 for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
1924 E = cgen_state_->filter_func_->arg_end();
1927 filter_func_arg_types.push_back(I->getType());
1930 for (
auto& element : cgen_state_->query_func_literal_loads_) {
1931 for (
auto value : element.second) {
1932 filter_func_arg_types.push_back(value->getType());
1936 auto ft2 = llvm::FunctionType::get(
1937 get_int_type(32, cgen_state_->context_), filter_func_arg_types,
false);
1938 filter_func_with_hoisted_literals =
1939 llvm::Function::Create(ft2,
1940 llvm::Function::ExternalLinkage,
1941 "filter_func_hoisted_literals",
1942 cgen_state_->filter_func_->getParent());
1944 filter_func_arg_it = filter_func_with_hoisted_literals->arg_begin();
1945 for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
1946 E = cgen_state_->filter_func_->arg_end();
1950 filter_func_arg_it->setName(I->getName());
1952 ++filter_func_arg_it;
1956 std::unordered_map<int, std::vector<llvm::Value*>>
1957 query_func_literal_loads_function_arguments,
1958 query_func_literal_loads_function_arguments2;
1960 for (
auto& element : cgen_state_->query_func_literal_loads_) {
1961 std::vector<llvm::Value*> argument_values, argument_values2;
1963 for (
auto value : element.second) {
1964 hoisted_literals.push_back(value);
1965 argument_values.push_back(&*row_func_arg_it);
1966 if (cgen_state_->filter_func_) {
1967 argument_values2.push_back(&*filter_func_arg_it);
1968 cgen_state_->filter_func_args_[&*row_func_arg_it] = &*filter_func_arg_it;
1970 if (value->hasName()) {
1971 row_func_arg_it->setName(
"arg_" + value->getName());
1972 if (cgen_state_->filter_func_) {
1973 filter_func_arg_it->getContext();
1974 filter_func_arg_it->setName(
"arg_" + value->getName());
1978 ++filter_func_arg_it;
1981 query_func_literal_loads_function_arguments[element.first] = argument_values;
1982 query_func_literal_loads_function_arguments2[element.first] = argument_values2;
  row_func_with_hoisted_literals->getBasicBlockList().splice(
      row_func_with_hoisted_literals->begin(),
      cgen_state_->row_func_->getBasicBlockList());

  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
                                    E = cgen_state_->row_func_->arg_end(),
                                    I2 = row_func_with_hoisted_literals->arg_begin();
       I != E;
       ++I) {
    I->replaceAllUsesWith(&*I2);
    I2->takeName(&*I);
    cgen_state_->filter_func_args_.replace(&*I, &*I2);
    ++I2;
  }

  cgen_state_->row_func_ = row_func_with_hoisted_literals;
  // Finally, replace the "__placeholder__literal_" instructions with the
  // corresponding hoisted-literal arguments and drop the placeholders.
  std::vector<llvm::Instruction*> placeholders;
  std::string prefix("__placeholder__literal_");
  for (auto it = llvm::inst_begin(row_func_with_hoisted_literals),
            e = llvm::inst_end(row_func_with_hoisted_literals);
       it != e;
       ++it) {
    if (it->hasName() && it->getName().startswith(prefix)) {
      auto offset_and_index_entry =
          cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
      CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());

      int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
      int lit_idx = offset_and_index_entry->second.index_of_literal_load;

      it->replaceAllUsesWith(
          query_func_literal_loads_function_arguments[lit_off][lit_idx]);
      placeholders.push_back(&*it);
    }
  }
  for (auto placeholder : placeholders) {
    placeholder->removeFromParent();
  }
  if (cgen_state_->filter_func_) {
    // Repeat the splice / argument replacement / placeholder rewrite for the
    // filter function.
    filter_func_with_hoisted_literals->getBasicBlockList().splice(
        filter_func_with_hoisted_literals->begin(),
        cgen_state_->filter_func_->getBasicBlockList());

    for (llvm::Function::arg_iterator
             I = cgen_state_->filter_func_->arg_begin(),
             E = cgen_state_->filter_func_->arg_end(),
             I2 = filter_func_with_hoisted_literals->arg_begin();
         I != E;
         ++I) {
      I->replaceAllUsesWith(&*I2);
      I2->takeName(&*I);
      ++I2;
    }

    cgen_state_->filter_func_ = filter_func_with_hoisted_literals;

    std::vector<llvm::Instruction*> placeholders;
    std::string prefix("__placeholder__literal_");
    for (auto it = llvm::inst_begin(filter_func_with_hoisted_literals),
              e = llvm::inst_end(filter_func_with_hoisted_literals);
         it != e;
         ++it) {
      if (it->hasName() && it->getName().startswith(prefix)) {
        auto offset_and_index_entry = cgen_state_->row_func_hoisted_literals_.find(
            llvm::dyn_cast<llvm::Value>(&*it));
        CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());

        int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
        int lit_idx = offset_and_index_entry->second.index_of_literal_load;

        it->replaceAllUsesWith(
            query_func_literal_loads_function_arguments2[lit_off][lit_idx]);
        placeholders.push_back(&*it);
      }
    }
    for (auto placeholder : placeholders) {
      placeholder->removeFromParent();
    }
  }

  return hoisted_literals;
}
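// GPU shared-memory support heuristics: a query qualifies for the shared-memory
// fast path only for non-grouped aggregates and perfect-hash group-by queries
// whose targets are supported and whose output buffer fits in the per-block
// shared memory budget.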
size_t get_shared_memory_size(const bool shared_mem_used,
                              const QueryMemoryDescriptor* query_mem_desc_ptr) {
  return shared_mem_used
             ? (query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount())
             : 0;
}

bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor* query_mem_desc_ptr,
                                 const RelAlgExecutionUnit& ra_exe_unit,
                                 const CudaMgr_Namespace::CudaMgr* cuda_mgr,
                                 const ExecutorDeviceType device_type,
                                 const unsigned gpu_blocksize,
                                 const unsigned num_blocks_per_mp) {
  CHECK(query_mem_desc_ptr);
  // Only queries whose targets are all fixed-width COUNT aggregates qualify.
  const auto all_targets_supported = [&]() {
    const auto target_infos =
        target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
    std::unordered_set<SQLAgg> supported_aggs{kCOUNT};
    return std::find_if(target_infos.begin(),
                        target_infos.end(),
                        [&supported_aggs](const TargetInfo& ti) {
                          return ti.sql_type.is_varlen() ||
                                 !supported_aggs.count(ti.agg_kind);
                        }) == target_infos.end();
  };
  if (query_mem_desc_ptr->getQueryDescriptionType() ==
          QueryDescriptionType::NonGroupedAggregate &&
      g_enable_smem_non_grouped_agg) {
    if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
      return false;
    }
    return all_targets_supported();
  }
  if (query_mem_desc_ptr->getQueryDescriptionType() ==
          QueryDescriptionType::GroupByPerfectHash &&
      g_enable_smem_group_by) {
    if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
      return false;
    }
    // The group-by output buffer must fit within the smaller of the configured
    // threshold and the device's per-block shared memory budget.
    const size_t shared_memory_threshold_bytes = std::min(
        g_gpu_smem_threshold, cuda_mgr->getMinSharedMemoryPerBlockForAllDevices());
    const auto output_buffer_size =
        query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount();
    if (output_buffer_size > shared_memory_threshold_bytes) {
      return false;
    }
    return all_targets_supported();
  }
  return false;
}
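// Collects the metadata nodes referenced by the query, row, and filter functions
// and serializes them sorted by metadata id, so that LOG(IR) output carries the
// footnotes the printed instructions refer to.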
std::string serialize_llvm_metadata_footnotes(llvm::Function* query_func,
                                              CgenState* cgen_state) {
  std::string llvm_ir;
  std::unordered_set<llvm::MDNode*> md;

  // Loop over all instructions in the query function.
  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
    for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
      llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
      instr_it->getAllMetadata(imd);
      for (auto [kind, node] : imd) {
        md.insert(node);
      }
    }
  }

  // Loop over all instructions in the row function.
  for (auto bb_it = cgen_state->row_func_->begin();
       bb_it != cgen_state->row_func_->end();
       ++bb_it) {
    for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
      llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
      instr_it->getAllMetadata(imd);
      for (auto [kind, node] : imd) {
        md.insert(node);
      }
    }
  }

  // Loop over all instructions in the filter function, if it exists.
  if (cgen_state->filter_func_) {
    for (auto bb_it = cgen_state->filter_func_->begin();
         bb_it != cgen_state->filter_func_->end();
         ++bb_it) {
      for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
        llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
        instr_it->getAllMetadata(imd);
        for (auto [kind, node] : imd) {
          md.insert(node);
        }
      }
    }
  }

  // Sort the metadata by the numeric part of its id (e.g. "!42").
  std::map<size_t, std::string> sorted_strings;
  for (auto p : md) {
    std::string str;
    llvm::raw_string_ostream os(str);
    p->print(os, cgen_state->module_, true);
    os.flush();
    auto fields = split(str, {}, 1);
    if (fields.empty() || fields[0].empty()) {
      continue;
    }
    sorted_strings.emplace(std::stoul(fields[0].substr(1)), str);
  }

  // Emit the metadata as a sorted list.
  for (auto [id, text] : sorted_strings) {
    llvm_ir += text;
    llvm_ir += "\n";
  }
  return llvm_ir;
}
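// compileWorkUnit(): the top-level entry point that lowers a RelAlgExecutionUnit
// into native code. It builds the query memory descriptor, clones the runtime
// module, instantiates the query template, generates the row/filter functions,
// and finally hands the IR to the CPU or GPU backend.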
std::tuple<CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
Executor::compileWorkUnit(const std::vector<InputTableInfo>& query_infos,
                          const PlanState::DeletedColumnsMap& deleted_cols_map,
                          const RelAlgExecutionUnit& ra_exe_unit,
                          const CompilationOptions& co,
                          const ExecutionOptions& eo,
                          const CudaMgr_Namespace::CudaMgr* cuda_mgr,
                          const bool allow_lazy_fetch,
                          std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
                          const size_t max_groups_buffer_entry_guess,
                          const int8_t crt_min_byte_width,
                          const bool has_cardinality_estimation,
                          ColumnCacheMap& column_cache,
                          RenderInfo* render_info) {
  auto timer = DEBUG_TIMER(__func__);

  if (co.device_type == ExecutorDeviceType::GPU) {
    const auto cuda_mgr = catalog_->getDataMgr().getCudaMgr();
    if (!cuda_mgr) {
      throw QueryMustRunOnCpu();
    }
  }

  static std::uint64_t counter = 0;
  ++counter;
  VLOG(1) << "CODEGEN #" << counter << ":";
  LOG(IR) << "CODEGEN #" << counter << ":";
  LOG(PTX) << "CODEGEN #" << counter << ":";
  LOG(ASM) << "CODEGEN #" << counter << ":";
  nukeOldState(allow_lazy_fetch, query_infos, deleted_cols_map, &ra_exe_unit);

  GroupByAndAggregate group_by_and_aggregate(
      this,
      co.device_type,
      ra_exe_unit,
      query_infos,
      row_set_mem_owner,
      has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
                                 : std::nullopt);
  auto query_mem_desc =
      group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
                                                       max_groups_buffer_entry_guess,
                                                       crt_min_byte_width,
                                                       render_info,
                                                       eo.output_columnar_hint);

  if (query_mem_desc->getQueryDescriptionType() ==
          QueryDescriptionType::GroupByBaselineHash &&
      !has_cardinality_estimation && !render_info && !eo.just_explain) {
    const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
    throw CardinalityEstimationRequired(col_range_info.max - col_range_info.min);
  }

  const bool output_columnar = query_mem_desc->didOutputColumnar();
  const bool gpu_shared_mem_optimization =
      is_gpu_shared_mem_supported(query_mem_desc.get(),
                                  ra_exe_unit,
                                  cuda_mgr,
                                  co.device_type,
                                  cuda_mgr ? this->blockSize() : 1,
                                  cuda_mgr ? this->numBlocksPerMP() : 1);
  if (gpu_shared_mem_optimization) {
    // Disable the interleaved bins optimization on the GPU.
    query_mem_desc->setHasInterleavedBinsOnGpu(false);
    LOG(DEBUG1) << "GPU shared memory is used for the " +
                       query_mem_desc->queryDescTypeToString() + " query.";
  }

  const GpuSharedMemoryContext gpu_smem_context(
      get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));

  if (co.device_type == ExecutorDeviceType::GPU) {
    const size_t num_count_distinct_descs =
        query_mem_desc->getCountDistinctDescriptorsSize();
    for (size_t i = 0; i < num_count_distinct_descs; i++) {
      const auto& count_distinct_descriptor =
          query_mem_desc->getCountDistinctDescriptor(i);
      // Count-distinct implementations based on std::set cannot run on device.
      if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::StdSet) {
        throw QueryMustRunOnCpu();
      }
    }
  }
  // Clone the runtime module, skipping private/internal functions that are not
  // always needed, then link in any registered runtime UDF modules.
  auto rt_module_copy = llvm::CloneModule(
      *g_rt_module.get(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
        auto func = llvm::dyn_cast<llvm::Function>(gv);
        if (!func) {
          return true;
        }
        return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
                func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage ||
                CodeGenerator::alwaysCloneRuntimeFunction(func));
      });

  if (co.device_type == ExecutorDeviceType::GPU) {
    rt_module_copy->setDataLayout(get_gpu_data_layout());
    rt_module_copy->setTargetTriple(get_gpu_target_triple_string());
    if (is_rt_udf_module_present()) {
      CodeGenerator::link_udf_module(
          rt_udf_gpu_module, *rt_module_copy, cgen_state_.get());
    }
  }

  cgen_state_->module_ = rt_module_copy.release();
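  // Instantiate the query template: a group-by query uses the group-by variant,
  // everything else the plain aggregate variant. The template provides the outer
  // per-fragment loop and a placeholder call that is later replaced with the
  // generated row function.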
  auto agg_fnames =
      get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();

  const bool is_group_by{query_mem_desc->isGroupBy()};
  auto [query_func, row_func_call] = is_group_by
      ? query_group_by_template(cgen_state_->module_,
                                co.hoist_literals,
                                *query_mem_desc,
                                co.device_type,
                                ra_exe_unit.scan_limit,
                                gpu_smem_context)
      : query_template(cgen_state_->module_,
                       agg_slot_count,
                       co.hoist_literals,
                       !!ra_exe_unit.estimator,
                       gpu_smem_context);

  cgen_state_->query_func_ = query_func;
  cgen_state_->row_func_call_ = row_func_call;
  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
      &query_func->getEntryBlock().front());

  // Generate the function signature and column head fetches such that double
  // indirection is not needed in the inner loop.
  auto& fetch_bb = query_func->front();
  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
                                              query_func->args().begin(),
                                              fetch_ir_builder,
                                              cgen_state_->context_);

  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
                                               is_group_by ? 0 : agg_slot_count,
                                               co.hoist_literals,
                                               cgen_state_->module_,
                                               cgen_state_->context_);
  CHECK(cgen_state_->row_func_);
  cgen_state_->row_func_bb_ =
      llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);

  if (g_enable_filter_function) {
    auto filter_func_ft =
        llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
    cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
                                                       llvm::Function::ExternalLinkage,
                                                       "filter_func",
                                                       cgen_state_->module_);
    CHECK(cgen_state_->filter_func_);
    cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
        cgen_state_->context_, "entry", cgen_state_->filter_func_);
  }

  cgen_state_->current_func_ = cgen_state_->row_func_;
  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
  const auto join_loops =
      buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);

  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
  if (is_not_deleted_bb) {
    cgen_state_->row_func_bb_ = is_not_deleted_bb;
  }
  if (!join_loops.empty()) {
    codegenJoinLoops(join_loops,
                     body_execution_unit,
                     group_by_and_aggregate,
                     query_func,
                     cgen_state_->row_func_bb_,
                     *query_mem_desc,
                     co,
                     eo);
  } else {
    const bool can_return_error = compileBody(
        ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
    if (can_return_error || cgen_state_->needs_error_check_ ||
        eo.with_dynamic_watchdog || eo.allow_runtime_query_interrupt) {
      createErrorCheckControlFlow(query_func,
                                  eo.with_dynamic_watchdog,
                                  eo.allow_runtime_query_interrupt,
                                  co.device_type,
                                  group_by_and_aggregate.query_infos_);
    }
  }
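  // Inline the hoisted literals (if enabled) and replace the template's
  // placeholder call with a call to the real row function, appending column
  // heads, the join hash tables argument, and the hoisted literal values.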
  std::vector<llvm::Value*> hoisted_literals;

  if (co.hoist_literals) {
    VLOG(1) << "number of hoisted literals: "
            << cgen_state_->query_func_literal_loads_.size()
            << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
            << " bytes";
  }

  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
    // We have some hoisted literals...
    hoisted_literals = inlineHoistedLiterals();
  }

  // Replace the row func placeholder call with the call to the actual row func.
  std::vector<llvm::Value*> row_func_args;
  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumArgOperands(); ++i) {
    row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
  }
  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
  // Push the hoisted literals arguments, if any.
  row_func_args.insert(
      row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
  llvm::ReplaceInstWithInst(
      cgen_state_->row_func_call_,
      llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));
  // Likewise, replace the filter func placeholder call with the real filter func.
  if (cgen_state_->filter_func_) {
    std::vector<llvm::Value*> filter_func_args;
    for (auto arg_it = cgen_state_->filter_func_args_.begin();
         arg_it != cgen_state_->filter_func_args_.end();
         ++arg_it) {
      filter_func_args.push_back(arg_it->first);
    }
    llvm::ReplaceInstWithInst(
        cgen_state_->filter_func_call_,
        llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
  }
  // Aggregate initial values.
  plan_state_->init_agg_vals_ = init_agg_val_vec(
      target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc), *query_mem_desc);

  // If the GPU shared memory path was chosen, generate the extra buffer
  // initialization and shared-to-global reduction functions and inject them into
  // the already compiled query_func (replacing its no-op placeholders).
  if (gpu_smem_context.isSharedMemoryUsed()) {
    GpuSharedMemCodeBuilder gpu_smem_code(
        cgen_state_->module_,
        cgen_state_->context_,
        *query_mem_desc,
        target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc),
        plan_state_->init_agg_vals_);
    gpu_smem_code.codegen();
    gpu_smem_code.injectFunctionsInto(query_func);

    // Helper functions are used for caching purposes later.
    cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
    cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
    LOG(IR) << gpu_smem_code.toString();
  }
  auto multifrag_query_func = cgen_state_->module_->getFunction(
      "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
  CHECK(multifrag_query_func);

  if (eo.allow_runtime_query_interrupt) {
    insertErrorCodeChecker(
        multifrag_query_func, co.hoist_literals, eo.allow_runtime_query_interrupt);
  }

  bind_query(query_func,
             "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
             multifrag_query_func,
             cgen_state_->module_);

  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
  if (cgen_state_->filter_func_) {
    root_funcs.push_back(cgen_state_->filter_func_);
  }
  const auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
      *cgen_state_->module_, root_funcs, {multifrag_query_func});
  // Run some basic validation checks on the generated LLVM IR.
  verify_function_ir(cgen_state_->row_func_);
  if (cgen_state_->filter_func_) {
    verify_function_ir(cgen_state_->filter_func_);
  }

  // Serialize the IR for EXPLAIN output, optimizing it first when requested.
  std::string llvm_ir;
  if (eo.just_explain) {
    if (co.explain_type == ExecutorExplainType::Optimized) {
#ifdef WITH_JIT_DEBUG
      throw std::runtime_error(
          "Explain optimized not available when JIT runtime debug symbols are enabled");
#else
      llvm::legacy::PassManager pass_manager;
      optimize_ir(query_func, cgen_state_->module_, pass_manager, live_funcs, co);
#endif  // WITH_JIT_DEBUG
    }
    llvm_ir = serialize_llvm_object(query_func) +
              serialize_llvm_object(cgen_state_->row_func_);
    if (cgen_state_->filter_func_) {
      llvm_ir += serialize_llvm_object(cgen_state_->filter_func_);
    }
  }

  LOG(IR) << "IR for the "
          << (co.device_type == ExecutorDeviceType::CPU ? "CPU:\n" : "GPU:\n")
          << serialize_llvm_metadata_footnotes(query_func, cgen_state_.get());

  // Generate final native code from the LLVM IR.
  return std::make_tuple(
      CompilationResult{
          co.device_type == ExecutorDeviceType::CPU
              ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
              : optimizeAndCodegenGPU(query_func,
                                      multifrag_query_func,
                                      live_funcs,
                                      is_group_by || ra_exe_unit.estimator,
                                      cuda_mgr,
                                      co),
          cgen_state_->getLiterals(),
          output_columnar,
          llvm_ir,
          std::move(gpu_smem_context)},
      std::move(query_mem_desc));
}
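// insertErrorCodeChecker() splits the block around the query_stub call inside the
// multifrag wrapper and branches to an error-exit block whenever a non-zero error
// code (or, when enabled, a pending interrupt) is detected.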
void Executor::insertErrorCodeChecker(llvm::Function* query_func,
                                      bool hoist_literals,
                                      bool allow_runtime_query_interrupt) {
  auto query_stub_func_name =
      "query_stub" + std::string(hoist_literals ? "_hoisted_literals" : "");
  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
    for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
      if (!llvm::isa<llvm::CallInst>(*inst_it)) {
        continue;
      }
      auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
      if (std::string(row_func_call.getCalledFunction()->getName()) ==
          query_stub_func_name) {
        auto next_inst_it = inst_it;
        ++next_inst_it;
        auto new_bb = bb_it->splitBasicBlock(next_inst_it);
        auto& br_instr = bb_it->back();
        llvm::IRBuilder<> ir_builder(&br_instr);
        llvm::Value* err_lv = &*inst_it;
        auto error_check_bb =
            bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr), ".error_check");
        // The stub's arguments are anonymous, so the "error_code" argument is
        // located by its fixed position in the argument list.
        llvm::Value* error_code_arg = nullptr;
        int arg_cnt = 0;
        for (auto arg_it = query_func->arg_begin(); arg_it != query_func->arg_end();
             arg_it++, ++arg_cnt) {
          if (hoist_literals) {
            if (arg_cnt == 9) {
              error_code_arg = &*arg_it;
              break;
            }
          } else if (arg_cnt == 8) {
            error_code_arg = &*arg_it;
            break;
          }
        }
        CHECK(error_code_arg);
        llvm::Value* err_code = nullptr;
        if (allow_runtime_query_interrupt) {
          // Decide the final error code considering the interrupt status.
          auto& check_interrupt_br_instr = bb_it->back();
          auto interrupt_check_bb = llvm::BasicBlock::Create(
              cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
          llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
          auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
              cgen_state_->module_->getFunction("check_interrupt"), {});
          auto detected_error = interrupt_checker_ir_builder.CreateCall(
              cgen_state_->module_->getFunction("get_error_code"),
              std::vector<llvm::Value*>{error_code_arg});
          err_code = interrupt_checker_ir_builder.CreateSelect(
              detected_interrupt,
              cgen_state_->llInt(Executor::ERR_INTERRUPTED),
              detected_error);
          interrupt_checker_ir_builder.CreateBr(error_check_bb);
          llvm::ReplaceInstWithInst(&check_interrupt_br_instr,
                                    llvm::BranchInst::Create(interrupt_check_bb));
          ir_builder.SetInsertPoint(&br_instr);
        } else {
          // Use the error code returned from the row function directly.
          ir_builder.SetInsertPoint(&br_instr);
          err_code =
              ir_builder.CreateCall(cgen_state_->module_->getFunction("get_error_code"),
                                    std::vector<llvm::Value*>{error_code_arg});
        }
        err_lv = ir_builder.CreateICmp(
            llvm::ICmpInst::ICMP_NE, err_code, cgen_state_->llInt(0));
        auto error_bb = llvm::BasicBlock::Create(
            cgen_state_->context_, ".error_exit", query_func, new_bb);
        llvm::CallInst::Create(cgen_state_->module_->getFunction("record_error_code"),
                               std::vector<llvm::Value*>{err_code, error_code_arg},
                               "",
                               error_bb);
        llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
        llvm::ReplaceInstWithInst(&br_instr,
                                  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
        break;
      }
    }
  }
}
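// codegenSkipDeletedOuterTableRow() emits an early return for rows of the outer
// table whose "deleted" column is set, so downstream codegen only sees live rows.
// It returns the "is_not_deleted" block, or nullptr when no filtering is needed.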
llvm::BasicBlock* Executor::codegenSkipDeletedOuterTableRow(
    const RelAlgExecutionUnit& ra_exe_unit,
    const CompilationOptions& co) {
  if (!co.filter_on_deleted_column) {
    return nullptr;
  }
  CHECK(!ra_exe_unit.input_descs.empty());
  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
  const auto deleted_cd =
      plan_state_->getDeletedColForTable(outer_input_desc.getTableId());
  if (!deleted_cd) {
    return nullptr;
  }
  CHECK(deleted_cd->columnType.is_boolean());
  const auto deleted_expr =
      makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
                                    outer_input_desc.getTableId(),
                                    deleted_cd->columnId,
                                    outer_input_desc.getNestLevel());
  CodeGenerator code_generator(this);
  const auto is_deleted =
      code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
  const auto is_deleted_bb = llvm::BasicBlock::Create(
      cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
      cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
  cgen_state_->ir_builder_.SetInsertPoint(bb);
  return bb;
}
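// compileBody() generates the per-row filter and aggregation code. When the
// separate filter function is enabled, qual codegen is redirected into
// filter_func_ and later re-wired by redeclareFilterFunction().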
bool Executor::compileBody(const RelAlgExecutionUnit& ra_exe_unit,
                           GroupByAndAggregate& group_by_and_aggregate,
                           const QueryMemoryDescriptor& query_mem_desc,
                           const CompilationOptions& co,
                           const GpuSharedMemoryContext& gpu_smem_context) {
  // Switch the code generation into a separate filter function if enabled.
  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
  llvm::Value* loop_done{nullptr};
  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
  if (cgen_state_->filter_func_) {
    if (cgen_state_->row_func_bb_->getName() == "loop_body") {
      auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
      cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
                                              row_func_entry_bb->begin());
      loop_done = cgen_state_->ir_builder_.CreateAlloca(
          get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
      cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
      cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
    }
    cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
    cgen_state_->current_func_ = cgen_state_->filter_func_;
    fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
  }
  // Generate the code for the filter.
  std::vector<Analyzer::Expr*> primary_quals;
  std::vector<Analyzer::Expr*> deferred_quals;
  bool short_circuited = CodeGenerator::prioritizeQuals(
      ra_exe_unit, primary_quals, deferred_quals, plan_state_->hoisted_filters_);
  if (short_circuited) {
    VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
            << "short-circuited and deferred " << std::to_string(deferred_quals.size())
            << " quals";
  }
  llvm::Value* filter_lv = cgen_state_->llBool(true);
  CodeGenerator code_generator(this);
  for (auto expr : primary_quals) {
    // Generate the filter for primary quals.
    auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
    filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
  }
  CHECK(filter_lv->getType()->isIntegerTy(1));
  llvm::BasicBlock* sc_false{nullptr};
  if (!deferred_quals.empty()) {
    auto sc_true = llvm::BasicBlock::Create(
        cgen_state_->context_, "sc_true", cgen_state_->current_func_);
    sc_false = llvm::BasicBlock::Create(
        cgen_state_->context_, "sc_false", cgen_state_->current_func_);
    cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
    cgen_state_->ir_builder_.SetInsertPoint(sc_false);
    if (ra_exe_unit.join_quals.empty()) {
      cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
    }
    cgen_state_->ir_builder_.SetInsertPoint(sc_true);
    filter_lv = cgen_state_->llBool(true);
  }
  for (auto expr : deferred_quals) {
    filter_lv = cgen_state_->ir_builder_.CreateAnd(
        filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
  }
  CHECK(filter_lv->getType()->isIntegerTy(1));
  auto ret = group_by_and_aggregate.codegen(
      filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);

  // Switch the code generation back to the row function if a filter function
  // was enabled, returning the filter function's result through loop_done.
  if (cgen_state_->filter_func_) {
    if (cgen_state_->row_func_bb_->getName() == "loop_body") {
      cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
      cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
    }

    cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
    cgen_state_->current_func_ = cgen_state_->row_func_;
    cgen_state_->filter_func_call_ =
        cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});

    // Create the real filter function declaration after the placeholder call
    // has been emitted.
    redeclareFilterFunction();

    if (cgen_state_->row_func_bb_->getName() == "loop_body") {
      auto loop_done_true = llvm::BasicBlock::Create(
          cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
      auto loop_done_false = llvm::BasicBlock::Create(
          cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
      auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(loop_done);
      cgen_state_->ir_builder_.CreateCondBr(
          loop_done_flag, loop_done_true, loop_done_false);
      cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
      cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
      cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
      cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
    } else {
      cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
    }
  }

  return ret;
}
std::unique_ptr<llvm::Module> runtime_module_shallow_copy(CgenState* cgen_state) {
  return llvm::CloneModule(
      *g_rt_module.get(), cgen_state->vmap_, [](const llvm::GlobalValue* gv) {
        // Only clone private/internal symbols; runtime functions with external
        // linkage are copied as declarations.
        auto func = llvm::dyn_cast<llvm::Function>(gv);
        if (!func) {
          return true;
        }
        return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
                func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage);
      });
}
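// Loads the head pointer of every input column from the byte_stream argument
// once, up front, so the per-row inner loop can index the columns without a
// second level of indirection.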
std::vector<llvm::Value*> generate_column_heads_load(const int num_columns,
                                                     llvm::Value* byte_stream_arg,
                                                     llvm::IRBuilder<>& ir_builder,
                                                     llvm::LLVMContext& ctx) {
  CHECK(byte_stream_arg);
  const auto max_col_local_id = num_columns - 1;

  std::vector<llvm::Value*> col_heads;
  for (int col_id = 0; col_id <= max_col_local_id; ++col_id) {
    col_heads.emplace_back(ir_builder.CreateLoad(ir_builder.CreateGEP(
        byte_stream_arg, llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), col_id))));
  }
  return col_heads;
}