OmniSciDB
NativeCodegen.cpp
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "QueryEngine/Execute.h"
18 
19 #if LLVM_VERSION_MAJOR < 9
20 static_assert(false, "LLVM Version >= 9 is required.");
21 #endif
22 
23 #include <llvm/Analysis/ScopedNoAliasAA.h>
24 #include <llvm/Analysis/TypeBasedAliasAnalysis.h>
25 #include <llvm/Bitcode/BitcodeReader.h>
26 #include <llvm/Bitcode/BitcodeWriter.h>
27 #include <llvm/ExecutionEngine/MCJIT.h>
28 #include <llvm/IR/Attributes.h>
29 #include <llvm/IR/GlobalValue.h>
30 #include <llvm/IR/InstIterator.h>
31 #include <llvm/IR/IntrinsicInst.h>
32 #include <llvm/IR/Intrinsics.h>
33 #include <llvm/IR/LegacyPassManager.h>
34 #include <llvm/IR/Verifier.h>
35 #include <llvm/IRReader/IRReader.h>
36 #include <llvm/Linker/Linker.h>
37 #include <llvm/Support/Casting.h>
38 #include <llvm/Support/FileSystem.h>
39 #include <llvm/Support/FormattedStream.h>
40 #include <llvm/Support/MemoryBuffer.h>
41 #include <llvm/Support/SourceMgr.h>
42 #include <llvm/Support/TargetRegistry.h>
43 #include <llvm/Support/TargetSelect.h>
44 #include <llvm/Support/raw_os_ostream.h>
45 #include <llvm/Support/raw_ostream.h>
46 #include <llvm/Transforms/IPO.h>
47 #include <llvm/Transforms/IPO/AlwaysInliner.h>
48 #include <llvm/Transforms/IPO/InferFunctionAttrs.h>
49 #include <llvm/Transforms/IPO/PassManagerBuilder.h>
50 #include <llvm/Transforms/InstCombine/InstCombine.h>
51 #include <llvm/Transforms/Instrumentation.h>
52 #include <llvm/Transforms/Scalar.h>
53 #include <llvm/Transforms/Scalar/GVN.h>
54 #include <llvm/Transforms/Scalar/InstSimplifyPass.h>
55 #include <llvm/Transforms/Utils.h>
56 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
57 #include <llvm/Transforms/Utils/Cloning.h>
58 
59 #if LLVM_VERSION_MAJOR >= 11
60 #include <llvm/Support/Host.h>
61 #endif
62 
63 #include "CudaMgr/CudaMgr.h"
73 #include "Shared/MathUtils.h"
74 #include "StreamingTopN.h"
75 
77 
78 std::unique_ptr<llvm::Module> udf_gpu_module;
79 std::unique_ptr<llvm::Module> udf_cpu_module;
80 std::unique_ptr<llvm::Module> rt_udf_gpu_module;
81 std::unique_ptr<llvm::Module> rt_udf_cpu_module;
82 
83 extern std::unique_ptr<llvm::Module> g_rt_module;
84 
85 #ifdef HAVE_CUDA
86 extern std::unique_ptr<llvm::Module> g_rt_libdevice_module;
87 #endif
88 
89 #ifdef ENABLE_GEOS
90 extern std::unique_ptr<llvm::Module> g_rt_geos_module;
91 
92 #include <llvm/Support/DynamicLibrary.h>
93 
94 #ifndef GEOS_LIBRARY_FILENAME
95 #error Configuration should include GEOS library file name
96 #endif
97 std::unique_ptr<std::string> g_libgeos_so_filename(
98  new std::string(GEOS_LIBRARY_FILENAME));
99 static llvm::sys::DynamicLibrary geos_dynamic_library;
100 static std::mutex geos_init_mutex;
101 
102 namespace {
103 
104 void load_geos_dynamic_library() {
105  std::lock_guard<std::mutex> guard(geos_init_mutex);
106 
107  if (!geos_dynamic_library.isValid()) {
108  if (!g_libgeos_so_filename || g_libgeos_so_filename->empty()) {
109  LOG(WARNING) << "Misconfigured GEOS library file name, trying 'libgeos_c.so'";
110  g_libgeos_so_filename.reset(new std::string("libgeos_c.so"));
111  }
112  auto filename = *g_libgeos_so_filename;
113  std::string error_message;
114  geos_dynamic_library =
115  llvm::sys::DynamicLibrary::getPermanentLibrary(filename.c_str(), &error_message);
116  if (!geos_dynamic_library.isValid()) {
117  LOG(ERROR) << "Failed to load GEOS library '" + filename + "'";
118  std::string exception_message = "Failed to load GEOS library: " + error_message;
119  throw std::runtime_error(exception_message.c_str());
120  } else {
121  LOG(INFO) << "Loaded GEOS library '" + filename + "'";
122  }
123  }
124 }
125 
126 } // namespace
127 #endif
128 
129 namespace {
130 
131 void throw_parseIR_error(const llvm::SMDiagnostic& parse_error,
132  std::string src = "",
133  const bool is_gpu = false) {
134  std::string excname = (is_gpu ? "NVVM IR ParseError: " : "LLVM IR ParseError: ");
135  llvm::raw_string_ostream ss(excname);
136  parse_error.print(src.c_str(), ss, false, false);
137  throw ParseIRError(ss.str());
138 }
139 
140 /* SHOW_DEFINED(<llvm::Module instance>) prints the function names
141  that are defined in the given LLVM Module instance.
142 
143  SHOW_FUNCTIONS(<llvm::Module instance>) prints the function names
144  of all used functions in the given LLVM Module
145  instance. Declarations are marked with `[decl]` as a name suffix.
146 
147  Useful for debugging.
148 */
149 
150 #define SHOW_DEFINED(MODULE) \
151  { \
152  std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
153  ::show_defined(MODULE); \
154  }
155 
156 #define SHOW_FUNCTIONS(MODULE) \
157  { \
158  std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
159  ::show_functions(MODULE); \
160  }
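// Illustrative usage while debugging codegen (hypothetical call site and output;
// the function names shown are made up):
//
//   SHOW_DEFINED(cgen_state_->module_);
//   // prints e.g.: "optimizeAndCodegenCPU#123: cgen_state_->module_ defines: row_func_1, filter_func_2, "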
161 
162 template <typename T = void>
163 void show_defined(llvm::Module& module) {
164  std::cout << "defines: ";
165  for (auto& f : module.getFunctionList()) {
166  if (!f.isDeclaration()) {
167  std::cout << f.getName().str() << ", ";
168  }
169  }
170  std::cout << std::endl;
171 }
172 
173 template <typename T = void>
174 void show_defined(llvm::Module* module) {
175  if (module == nullptr) {
176  std::cout << "is null" << std::endl;
177  } else {
178  show_defined(*module);
179  }
180 }
181 
182 template <typename T = void>
183 void show_defined(std::unique_ptr<llvm::Module>& module) {
184  show_defined(module.get());
185 }
186 
187 /*
188  scan_function_calls(module, defined, undefined, ignored) computes
189  defined and undefined sets of function names:
190 
191  - defined functions are those that are defined in the given module
192 
193  - undefined functions are those that are called by defined functions
194  but that are not defined in the given module
195 
196  - ignored functions are functions that may be undefined but will not
197  be listed in the set of undefined functions.
198 
199  Useful for debugging.
200 */
201 template <typename T = void>
202 void scan_function_calls(llvm::Function& F,
203  std::unordered_set<std::string>& defined,
204  std::unordered_set<std::string>& undefined,
205  const std::unordered_set<std::string>& ignored) {
206  for (llvm::inst_iterator I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
207  if (auto* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
208  auto* F2 = CI->getCalledFunction();
209  if (F2 != nullptr) {
210  auto F2name = F2->getName().str();
211  if (F2->isDeclaration()) {
212  if (F2name.rfind("__", 0) !=
213  0 // assume symbols with double underscore are defined
214  && F2name.rfind("llvm.", 0) !=
215  0 // TODO: this may give false positive for NVVM intrinsics
216  && ignored.find(F2name) == ignored.end() // not in ignored list
217  ) {
218  undefined.emplace(F2name);
219  }
220  } else {
221  if (defined.find(F2name) == defined.end()) {
222  defined.emplace(F2name);
223  scan_function_calls<T>(*F2, defined, undefined, ignored);
224  }
225  }
226  }
227  }
228  }
229 }
230 
231 template <typename T = void>
232 void scan_function_calls(llvm::Module& module,
233  std::unordered_set<std::string>& defined,
234  std::unordered_set<std::string>& undefined,
235  const std::unordered_set<std::string>& ignored) {
236  for (auto& F : module) {
237  if (!F.isDeclaration()) {
238  scan_function_calls(F, defined, undefined, ignored);
239  }
240  }
241 }
242 
243 template <typename T = void>
244 std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>>
245 scan_function_calls(llvm::Module& module,
246  const std::unordered_set<std::string>& ignored = {}) {
247  std::unordered_set<std::string> defined, undefined;
248  scan_function_calls(module, defined, undefined, ignored);
249  return std::make_tuple(defined, undefined);
250 }
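// A minimal debugging sketch (assumes a valid `module`; output format is illustrative):
//
//   auto [defined, undefined] = scan_function_calls(*module);
//   for (const auto& name : undefined) {
//     std::cout << "undefined: " << name << std::endl;
//   }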
251 
252 #if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
253 void eliminate_dead_self_recursive_funcs(
254  llvm::Module& M,
255  const std::unordered_set<llvm::Function*>& live_funcs) {
256  std::vector<llvm::Function*> dead_funcs;
257  for (auto& F : M) {
258  bool bAlive = false;
259  if (live_funcs.count(&F)) {
260  continue;
261  }
262  for (auto U : F.users()) {
263  auto* C = llvm::dyn_cast<const llvm::CallInst>(U);
264  if (!C || C->getParent()->getParent() != &F) {
265  bAlive = true;
266  break;
267  }
268  }
269  if (!bAlive) {
270  dead_funcs.push_back(&F);
271  }
272  }
273  for (auto pFn : dead_funcs) {
274  pFn->eraseFromParent();
275  }
276 }
277 
278 #ifdef HAVE_CUDA
279 
280 // check if linking with libdevice is required
281 // libdevice functions have a __nv_* prefix
282 bool check_module_requires_libdevice(llvm::Module* module) {
283  for (llvm::Function& F : *module) {
284  if (F.hasName() && F.getName().startswith("__nv_")) {
285  LOG(INFO) << "Module requires linking with libdevice: " << std::string(F.getName());
286  return true;
287  }
288  }
289  LOG(DEBUG1) << "module does not require linking against libdevice";
290  return false;
291 }
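// For example, a UDF calling pow() may lower to a call to __nv_pow, which is only
// defined in libdevice; a module containing such a call would make this check return
// true. (Illustrative example, not taken from this file.)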
292 
293 // Adds the missing intrinsics declarations to the given module
294 void add_intrinsics_to_module(llvm::Module* module) {
295  for (llvm::Function& F : *module) {
296  for (llvm::Instruction& I : instructions(F)) {
297  if (llvm::IntrinsicInst* ii = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
298  if (llvm::Intrinsic::isOverloaded(ii->getIntrinsicID())) {
299  llvm::Type* Tys[] = {ii->getFunctionType()->getReturnType()};
300  llvm::Function& decl_fn =
301  *llvm::Intrinsic::getDeclaration(module, ii->getIntrinsicID(), Tys);
302  ii->setCalledFunction(&decl_fn);
303  } else {
304  // inserts the declaration into the module if not present
305  llvm::Intrinsic::getDeclaration(module, ii->getIntrinsicID());
306  }
307  }
308  }
309  }
310 }
311 
312 #endif
313 
314 void optimize_ir(llvm::Function* query_func,
315  llvm::Module* module,
316  llvm::legacy::PassManager& pass_manager,
317  const std::unordered_set<llvm::Function*>& live_funcs,
318  const CompilationOptions& co) {
319  // the always inliner legacy pass must always run first
320  pass_manager.add(llvm::createAlwaysInlinerLegacyPass());
321 
322  pass_manager.add(new AnnotateInternalFunctionsPass());
323 
324  pass_manager.add(llvm::createSROAPass());
325  // mem ssa drops unused load and store instructions, e.g. passing variables directly
326  // where possible
327  pass_manager.add(
328  llvm::createEarlyCSEPass(/*enable_mem_ssa=*/true)); // Catch trivial redundancies
329 
330  pass_manager.add(llvm::createJumpThreadingPass()); // Thread jumps.
331  pass_manager.add(llvm::createCFGSimplificationPass());
332 
333  // remove load/stores in PHIs if instructions can be accessed directly post thread jumps
334  pass_manager.add(llvm::createNewGVNPass());
335 
336  pass_manager.add(llvm::createDeadStoreEliminationPass());
337  pass_manager.add(llvm::createLICMPass());
338 
339  pass_manager.add(llvm::createInstructionCombiningPass());
340 
341  // module passes
342  pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
343  pass_manager.add(llvm::createGlobalOptimizerPass());
344 
345  if (co.opt_level == ExecutorOptLevel::LoopStrengthReduction) {
346  pass_manager.add(llvm::createLoopStrengthReducePass());
347  }
348 
349  pass_manager.add(llvm::createCFGSimplificationPass()); // cleanup after everything
350 
351  pass_manager.run(*module);
352 
353  eliminate_dead_self_recursive_funcs(*module, live_funcs);
354 }
355 #endif
356 
357 } // namespace
358 
360 
361 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine)
362  : execution_engine_(execution_engine) {}
363 
364 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine,
365  const CompilationOptions& co)
366  : execution_engine_(execution_engine) {
367  if (execution_engine_) {
368  if (co.register_intel_jit_listener) {
369 #ifdef ENABLE_INTEL_JIT_LISTENER
370  intel_jit_listener_.reset(llvm::JITEventListener::createIntelJITEventListener());
372  execution_engine_->RegisterJITEventListener(intel_jit_listener_.get());
373  LOG(INFO) << "Registered IntelJITEventListener";
374 #else
375  LOG(WARNING) << "This build is not Intel JIT Listener enabled. Ignoring Intel JIT "
376  "listener configuration parameter.";
377 #endif // ENABLE_INTEL_JIT_LISTENER
378  }
379  }
380 }
381 
382 ExecutionEngineWrapper& ExecutionEngineWrapper::operator=(
383  llvm::ExecutionEngine* execution_engine) {
384  execution_engine_.reset(execution_engine);
385  intel_jit_listener_ = nullptr;
386  return *this;
387 }
388 
389 void verify_function_ir(const llvm::Function* func) {
390  std::stringstream err_ss;
391  llvm::raw_os_ostream err_os(err_ss);
392  err_os << "\n-----\n";
393  if (llvm::verifyFunction(*func, &err_os)) {
394  err_os << "\n-----\n";
395  func->print(err_os, nullptr);
396  err_os << "\n-----\n";
397  LOG(FATAL) << err_ss.str();
398  }
399 }
400 
401 std::shared_ptr<CompilationContext> Executor::getCodeFromCache(const CodeCacheKey& key,
402  const CodeCache& cache) {
403  auto it = cache.find(key);
404  if (it != cache.cend()) {
405  delete cgen_state_->module_;
406  cgen_state_->module_ = it->second.second;
407  return it->second.first;
408  }
409  return {};
410 }
411 
412 void Executor::addCodeToCache(const CodeCacheKey& key,
413  std::shared_ptr<CompilationContext> compilation_context,
414  llvm::Module* module,
415  CodeCache& cache) {
416  cache.put(key,
417  std::make_pair<std::shared_ptr<CompilationContext>, decltype(module)>(
418  std::move(compilation_context), std::move(module)));
419 }
420 
421 namespace {
422 
423 std::string assemblyForCPU(ExecutionEngineWrapper& execution_engine,
424  llvm::Module* module) {
425  llvm::legacy::PassManager pass_manager;
426  auto cpu_target_machine = execution_engine->getTargetMachine();
427  CHECK(cpu_target_machine);
428  llvm::SmallString<256> code_str;
429  llvm::raw_svector_ostream os(code_str);
430 #if LLVM_VERSION_MAJOR >= 10
431  cpu_target_machine->addPassesToEmitFile(
432  pass_manager, os, nullptr, llvm::CGFT_AssemblyFile);
433 #else
434  cpu_target_machine->addPassesToEmitFile(
435  pass_manager, os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
436 #endif
437  pass_manager.run(*module);
438  return "Assembly for the CPU:\n" + std::string(code_str.str()) + "\nEnd of assembly";
439 }
440 
441 } // namespace
442 
443 ExecutionEngineWrapper CodeGenerator::generateNativeCPUCode(
444  llvm::Function* func,
445  const std::unordered_set<llvm::Function*>& live_funcs,
446  const CompilationOptions& co) {
447  auto module = func->getParent();
448  // run optimizations
449 #ifndef WITH_JIT_DEBUG
450  llvm::legacy::PassManager pass_manager;
451  optimize_ir(func, module, pass_manager, live_funcs, co);
452 #endif // WITH_JIT_DEBUG
453 
454  auto init_err = llvm::InitializeNativeTarget();
455  CHECK(!init_err);
456 
457  llvm::InitializeAllTargetMCs();
458  llvm::InitializeNativeTargetAsmPrinter();
459  llvm::InitializeNativeTargetAsmParser();
460 
461  std::string err_str;
462  std::unique_ptr<llvm::Module> owner(module);
463  llvm::EngineBuilder eb(std::move(owner));
464  eb.setErrorStr(&err_str);
465  eb.setEngineKind(llvm::EngineKind::JIT);
466  llvm::TargetOptions to;
467  to.EnableFastISel = true;
468  eb.setTargetOptions(to);
469  if (co.opt_level == ExecutorOptLevel::ReductionJIT) {
470  eb.setOptLevel(llvm::CodeGenOpt::None);
471  }
472 
473 #ifdef _WIN32
474  // TODO: workaround for data layout mismatch crash for now
475  auto target_machine = eb.selectTarget();
476  CHECK(target_machine);
477  module->setDataLayout(target_machine->createDataLayout());
478 #endif
479 
480  ExecutionEngineWrapper execution_engine(eb.create(), co);
481  CHECK(execution_engine.get());
482  LOG(ASM) << assemblyForCPU(execution_engine, module);
483 
484  execution_engine->finalizeObject();
485  return execution_engine;
486 }
487 
488 std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenCPU(
489  llvm::Function* query_func,
490  llvm::Function* multifrag_query_func,
491  const std::unordered_set<llvm::Function*>& live_funcs,
492  const CompilationOptions& co) {
493  auto module = multifrag_query_func->getParent();
494  CodeCacheKey key{serialize_llvm_object(query_func),
495  serialize_llvm_object(cgen_state_->row_func_)};
496  if (cgen_state_->filter_func_) {
497  key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
498  }
499  for (const auto helper : cgen_state_->helper_functions_) {
500  key.push_back(serialize_llvm_object(helper));
501  }
502  auto cached_code = getCodeFromCache(key, cpu_code_cache_);
503  if (cached_code) {
504  return cached_code;
505  }
506 
507  if (cgen_state_->needs_geos_) {
508 #ifdef ENABLE_GEOS
509  load_geos_dynamic_library();
510 
511  // Read geos runtime module and bind GEOS API function references to GEOS library
512  auto rt_geos_module_copy = llvm::CloneModule(
513  *g_rt_geos_module.get(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
514  auto func = llvm::dyn_cast<llvm::Function>(gv);
515  if (!func) {
516  return true;
517  }
518  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
519  func->getLinkage() ==
520  llvm::GlobalValue::LinkageTypes::InternalLinkage ||
521  func->getLinkage() == llvm::GlobalValue::LinkageTypes::ExternalLinkage);
522  });
523  CodeGenerator::link_udf_module(rt_geos_module_copy,
524  *module,
525  cgen_state_.get(),
526  llvm::Linker::Flags::LinkOnlyNeeded);
527 #else
528  throw std::runtime_error("GEOS is disabled in this build");
529 #endif
530  }
531 
532  auto execution_engine =
533  CodeGenerator::generateNativeCPUCode(query_func, live_funcs, co);
534  auto cpu_compilation_context =
535  std::make_shared<CpuCompilationContext>(std::move(execution_engine));
536  cpu_compilation_context->setFunctionPointer(multifrag_query_func);
537  addCodeToCache(key, cpu_compilation_context, module, cpu_code_cache_);
538  return cpu_compilation_context;
539 }
540 
541 void CodeGenerator::link_udf_module(const std::unique_ptr<llvm::Module>& udf_module,
542  llvm::Module& module,
543  CgenState* cgen_state,
544  llvm::Linker::Flags flags) {
545  // Throw a runtime error if the target module contains functions
546  // with the same name as functions in the UDF module.
547  for (auto& f : *udf_module.get()) {
548  auto func = module.getFunction(f.getName());
549  if (!(func == nullptr) && !f.isDeclaration() && flags == llvm::Linker::Flags::None) {
550  LOG(ERROR) << " Attempt to overwrite " << f.getName().str() << " in "
551  << module.getModuleIdentifier() << " from `"
552  << udf_module->getModuleIdentifier() << "`" << std::endl;
553  throw std::runtime_error(
554  "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
555  "function ***");
556  } else {
557  VLOG(1) << " Adding " << f.getName().str() << " to "
558  << module.getModuleIdentifier() << " from `"
559  << udf_module->getModuleIdentifier() << "`" << std::endl;
560  }
561  }
562 
563  std::unique_ptr<llvm::Module> udf_module_copy;
564 
565  udf_module_copy = llvm::CloneModule(*udf_module.get(), cgen_state->vmap_);
566 
567  udf_module_copy->setDataLayout(module.getDataLayout());
568  udf_module_copy->setTargetTriple(module.getTargetTriple());
569 
570  // Initialize linker with module for RuntimeFunctions.bc
571  llvm::Linker ld(module);
572  bool link_error = false;
573 
574  link_error = ld.linkInModule(std::move(udf_module_copy), flags);
575 
576  if (link_error) {
577  throw std::runtime_error("link_udf_module: *** error linking module ***");
578  }
579 }
580 
581 namespace {
582 
583 std::string cpp_to_llvm_name(const std::string& s) {
584  if (s == "int8_t") {
585  return "i8";
586  }
587  if (s == "int16_t") {
588  return "i16";
589  }
590  if (s == "int32_t") {
591  return "i32";
592  }
593  if (s == "int64_t") {
594  return "i64";
595  }
596  CHECK(s == "float" || s == "double");
597  return s;
598 }
599 
600 std::string gen_array_any_all_sigs() {
601  std::string result;
602  for (const std::string any_or_all : {"any", "all"}) {
603  for (const std::string elem_type :
604  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
605  for (const std::string needle_type :
606  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
607  for (const std::string op_name : {"eq", "ne", "lt", "le", "gt", "ge"}) {
608  result += ("declare i1 @array_" + any_or_all + "_" + op_name + "_" + elem_type +
609  "_" + needle_type + "(i8*, i64, " + cpp_to_llvm_name(needle_type) +
610  ", " + cpp_to_llvm_name(elem_type) + ");\n");
611  }
612  }
613  }
614  }
615  return result;
616 }
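// One generated declaration, for illustration (any/eq, int8_t* array buffer with
// int32_t elements, int64_t needle):
//   declare i1 @array_any_eq_int32_t_int64_t(i8*, i64, i64, i32);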
617 
618 std::string gen_translate_null_key_sigs() {
619  std::string result;
620  for (const std::string key_type : {"int8_t", "int16_t", "int32_t", "int64_t"}) {
621  const auto key_llvm_type = cpp_to_llvm_name(key_type);
622  result += "declare i64 @translate_null_key_" + key_type + "(" + key_llvm_type + ", " +
623  key_llvm_type + ", i64);\n";
624  }
625  return result;
626 }
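// One generated declaration, for illustration (int32_t keys):
//   declare i64 @translate_null_key_int32_t(i32, i32, i64);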
627 
628 const std::string cuda_rt_decls =
629  R"( declare void @llvm.dbg.declare(metadata, metadata, metadata) declare void @llvm.dbg.value(metadata, metadata, metadata) declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind declare i64 @get_thread_index(); declare i64 @get_block_index(); declare i32 @pos_start_impl(i32*); declare i32 @group_buff_idx_impl(); declare i32 @pos_step_impl(); declare i8 @thread_warp_idx(i8); declare i64* @init_shared_mem(i64*, i32); declare i64* @init_shared_mem_nop(i64*, i32); declare i64* @declare_dynamic_shared_memory(); declare void @write_back_nop(i64*, i64*, i32); declare void @write_back_non_grouped_agg(i64*, i64*, i32); declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8); declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32); declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32); declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32); declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32); declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32); declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32); declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64); declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64); declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64); declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64); declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64); declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double); declare i64 @get_bucket_key_for_range_double(i8*, i64, double); declare i32 @get_num_buckets_for_bounds(i8*, i32, double, double); declare i64 @get_candidate_rows(i32*, i32, i8*, i32, double, double, i32, i64, i64*, i64, i64, i64); declare i64 @agg_count_shared(i64*, i64); declare i64 @agg_count_skip_val_shared(i64*, i64, i64); declare i32 @agg_count_int32_shared(i32*, i32); declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32); declare i64 @agg_count_double_shared(i64*, double); declare i64 @agg_count_double_skip_val_shared(i64*, double, double); declare i32 @agg_count_float_shared(i32*, float); declare i32 @agg_count_float_skip_val_shared(i32*, float, float); declare i64 @agg_sum_shared(i64*, i64); declare i64 @agg_sum_skip_val_shared(i64*, i64, i64); declare i32 @agg_sum_int32_shared(i32*, i32); declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32); declare void @agg_sum_double_shared(i64*, double); declare void @agg_sum_double_skip_val_shared(i64*, double, double); declare void @agg_sum_float_shared(i32*, float); declare void @agg_sum_float_skip_val_shared(i32*, float, float); declare void @agg_max_shared(i64*, i64); declare void @agg_max_skip_val_shared(i64*, i64, i64); declare void @agg_max_int32_shared(i32*, i32); declare void @agg_max_int32_skip_val_shared(i32*, i32, i32); declare void @agg_max_int16_shared(i16*, i16); declare void @agg_max_int16_skip_val_shared(i16*, i16, i16); declare void @agg_max_int8_shared(i8*, i8); declare void @agg_max_int8_skip_val_shared(i8*, i8, i8); declare void @agg_max_double_shared(i64*, double); declare void @agg_max_double_skip_val_shared(i64*, double, double); declare void @agg_max_float_shared(i32*, float); declare void @agg_max_float_skip_val_shared(i32*, float, float); declare void @agg_min_shared(i64*, i64); declare void 
@agg_min_skip_val_shared(i64*, i64, i64); declare void @agg_min_int32_shared(i32*, i32); declare void @agg_min_int32_skip_val_shared(i32*, i32, i32); declare void @agg_min_int16_shared(i16*, i16); declare void @agg_min_int16_skip_val_shared(i16*, i16, i16); declare void @agg_min_int8_shared(i8*, i8); declare void @agg_min_int8_skip_val_shared(i8*, i8, i8); declare void @agg_min_double_shared(i64*, double); declare void @agg_min_double_skip_val_shared(i64*, double, double); declare void @agg_min_float_shared(i32*, float); declare void @agg_min_float_skip_val_shared(i32*, float, float); declare void @agg_id_shared(i64*, i64); declare i8* @agg_id_varlen_shared(i8*, i64, i8*, i64); declare void @agg_id_int32_shared(i32*, i32); declare void @agg_id_int16_shared(i16*, i16); declare void @agg_id_int8_shared(i8*, i8); declare void @agg_id_double_shared(i64*, double); declare void @agg_id_double_shared_slow(i64*, double*); declare void @agg_id_float_shared(i32*, float); declare i32 @checked_single_agg_id_shared(i64*, i64, i64); declare i32 @checked_single_agg_id_double_shared(i64*, double, double); declare i32 @checked_single_agg_id_double_shared_slow(i64*, double*, double); declare i32 @checked_single_agg_id_float_shared(i32*, float, float); declare i1 @slotEmptyKeyCAS(i64*, i64, i64); declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32); declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16); declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8); declare i64 @datetrunc_century(i64); declare i64 @datetrunc_day(i64); declare i64 @datetrunc_decade(i64); declare i64 @datetrunc_hour(i64); declare i64 @datetrunc_millennium(i64); declare i64 @datetrunc_minute(i64); declare i64 @datetrunc_month(i64); declare i64 @datetrunc_quarter(i64); declare i64 @datetrunc_quarterday(i64); declare i64 @datetrunc_week_monday(i64); declare i64 @datetrunc_week_sunday(i64); declare i64 @datetrunc_week_saturday(i64); declare i64 @datetrunc_year(i64); declare i64 @extract_epoch(i64); declare i64 @extract_dateepoch(i64); declare i64 @extract_quarterday(i64); declare i64 @extract_hour(i64); declare i64 @extract_minute(i64); declare i64 @extract_second(i64); declare i64 @extract_millisecond(i64); declare i64 @extract_microsecond(i64); declare i64 @extract_nanosecond(i64); declare i64 @extract_dow(i64); declare i64 @extract_isodow(i64); declare i64 @extract_day(i64); declare i64 @extract_week_monday(i64); declare i64 @extract_week_sunday(i64); declare i64 @extract_week_saturday(i64); declare i64 @extract_day_of_year(i64); declare i64 @extract_month(i64); declare i64 @extract_quarter(i64); declare i64 @extract_year(i64); declare i64 @DateTruncateHighPrecisionToDate(i64, i64); declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64); declare i64 @DateDiff(i32, i64, i64); declare i64 @DateDiffNullable(i32, i64, i64, i64); declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i32); declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i32, i64); declare i64 @DateAdd(i32, i64, i64); declare i64 @DateAddNullable(i32, i64, i64, i64); declare i64 @DateAddHighPrecision(i32, i64, i64, i32); declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i32, i64); declare i64 @string_decode(i8*, i64); declare i32 @array_size(i8*, i64, i32); declare i32 @array_size_nullable(i8*, i64, i32, i32); declare i32 @fast_fixlen_array_size(i8*, i32); declare i1 @array_is_null(i8*, i64); declare i1 @point_coord_array_is_null(i8*, i64); declare i8* @array_buff(i8*, i64); declare i8* @fast_fixlen_array_buff(i8*, i64); declare i8 @array_at_int8_t(i8*, 
i64, i32); declare i16 @array_at_int16_t(i8*, i64, i32); declare i32 @array_at_int32_t(i8*, i64, i32); declare i64 @array_at_int64_t(i8*, i64, i32); declare float @array_at_float(i8*, i64, i32); declare double @array_at_double(i8*, i64, i32); declare i8 @varlen_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_array_at_int64_t(i8*, i64, i32); declare float @varlen_array_at_float(i8*, i64, i32); declare double @varlen_array_at_double(i8*, i64, i32); declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32); declare float @varlen_notnull_array_at_float(i8*, i64, i32); declare double @varlen_notnull_array_at_double(i8*, i64, i32); declare i8 @array_at_int8_t_checked(i8*, i64, i64, i8); declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16); declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32); declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64); declare float @array_at_float_checked(i8*, i64, i64, float); declare double @array_at_double_checked(i8*, i64, i64, double); declare i32 @char_length(i8*, i32); declare i32 @char_length_nullable(i8*, i32, i32); declare i32 @char_length_encoded(i8*, i32); declare i32 @char_length_encoded_nullable(i8*, i32, i32); declare i32 @key_for_string_encoded(i32); declare i1 @sample_ratio(double, i64); declare double @width_bucket(double, double, double, double, i32); declare double @width_bucket_reverse(double, double, double, double, i32); declare double @width_bucket_nullable(double, double, double, double, i32, double); declare double @width_bucket_reversed_nullable(double, double, double, double, i32, double); declare double @width_bucket_no_oob_check(double, double, double); declare double @width_bucket_reverse_no_oob_check(double, double, double); declare double @width_bucket_expr(double, i1, double, double, i32); declare double @width_bucket_expr_nullable(double, i1, double, double, i32, double); declare double @width_bucket_expr_no_oob_check(double, i1, double, double, i32); declare i1 @string_like(i8*, i32, i8*, i32, i8); declare i1 @string_ilike(i8*, i32, i8*, i32, i8); declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8); declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8); declare i1 @string_like_simple(i8*, i32, i8*, i32); declare i1 @string_ilike_simple(i8*, i32, i8*, i32); declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8); declare i1 @string_lt(i8*, i32, i8*, i32); declare i1 @string_le(i8*, i32, i8*, i32); declare i1 @string_gt(i8*, i32, i8*, i32); declare i1 @string_ge(i8*, i32, i8*, i32); declare i1 @string_eq(i8*, i32, i8*, i32); declare i1 @string_ne(i8*, i32, i8*, i32); declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8); declare i1 @regexp_like(i8*, i32, i8*, i32, i8); declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8); declare void @linear_probabilistic_count(i8*, i32, i8*, i32); declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, 
i64); declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64); declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64); declare void @record_error_code(i32, i32*); declare i32 @get_error_code(i32*); declare i1 @dynamic_watchdog(); declare i1 @check_interrupt(); declare void @force_sync(); declare void @sync_warp(); declare void @sync_warp_protected(i64, i64); declare void @sync_threadblock(); declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32); declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64); declare i64* @get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, float, float); declare i64* @get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double); declare double @decompress_x_coord_geoint(i32); declare double @decompress_y_coord_geoint(i32); declare i32 @compress_x_coord_geoint(double); declare i32 @compress_y_coord_geoint(double); )" + gen_array_any_all_sigs() +
630  gen_translate_null_key_sigs();
631 
632 #ifdef HAVE_CUDA
633 std::string extension_function_decls(const std::unordered_set<std::string>& udf_decls) {
634  const auto decls =
635  ExtensionFunctionsWhitelist::getLLVMDeclarations(udf_decls, /*is_gpu=*/true);
636  return boost::algorithm::join(decls, "\n");
637 }
638 
639 void legalize_nvvm_ir(llvm::Function* query_func) {
640  // optimizations might add attributes to the function
641  // and NVPTX doesn't understand all of them; play it
642  // safe and clear all attributes
643  clear_function_attributes(query_func);
644  verify_function_ir(query_func);
645 
646  std::vector<llvm::Instruction*> stackrestore_intrinsics;
647  std::vector<llvm::Instruction*> stacksave_intrinsics;
648  std::vector<llvm::Instruction*> lifetime;
649  for (auto& BB : *query_func) {
650  for (llvm::Instruction& I : BB) {
651  if (const llvm::IntrinsicInst* II = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
652  if (II->getIntrinsicID() == llvm::Intrinsic::stacksave) {
653  stacksave_intrinsics.push_back(&I);
654  } else if (II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
655  stackrestore_intrinsics.push_back(&I);
656  } else if (II->getIntrinsicID() == llvm::Intrinsic::lifetime_start ||
657  II->getIntrinsicID() == llvm::Intrinsic::lifetime_end) {
658  lifetime.push_back(&I);
659  }
660  }
661  }
662  }
663 
664  // stacksave and stackrestore intrinsics appear together, and
665  // stackrestore uses the stacksave result as its argument,
666  // so it should be removed first.
667  for (auto& II : stackrestore_intrinsics) {
668  II->eraseFromParent();
669  }
670  for (auto& II : stacksave_intrinsics) {
671  II->eraseFromParent();
672  }
673  // Remove lifetime intrinsics as well; the NVPTX backend does not handle them.
674  for (auto& II : lifetime) {
675  II->eraseFromParent();
676  }
677 }
678 #endif // HAVE_CUDA
679 
680 } // namespace
681 
682 llvm::StringRef get_gpu_target_triple_string() {
683  return llvm::StringRef("nvptx64-nvidia-cuda");
684 }
685 
686 llvm::StringRef get_gpu_data_layout() {
687  return llvm::StringRef(
688  "e-p:64:64:64-i1:8:8-i8:8:8-"
689  "i16:16:16-i32:32:32-i64:64:64-"
690  "f32:32:32-f64:64:64-v16:16:16-"
691  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
692 }
693 
694 std::map<std::string, std::string> get_device_parameters(bool cpu_only) {
695  std::map<std::string, std::string> result;
696 
697  result.insert(std::make_pair("cpu_name", llvm::sys::getHostCPUName()));
698  result.insert(std::make_pair("cpu_triple", llvm::sys::getProcessTriple()));
699  result.insert(
700  std::make_pair("cpu_cores", std::to_string(llvm::sys::getHostNumPhysicalCores())));
701  result.insert(std::make_pair("cpu_threads", std::to_string(cpu_threads())));
702 
703  // https://en.cppreference.com/w/cpp/language/types
704  std::string sizeof_types;
705  sizeof_types += "bool:" + std::to_string(sizeof(bool)) + ";";
706  sizeof_types += "size_t:" + std::to_string(sizeof(size_t)) + ";";
707  sizeof_types += "ssize_t:" + std::to_string(sizeof(ssize_t)) + ";";
708  sizeof_types += "char:" + std::to_string(sizeof(char)) + ";";
709  sizeof_types += "uchar:" + std::to_string(sizeof(unsigned char)) + ";";
710  sizeof_types += "short:" + std::to_string(sizeof(short)) + ";";
711  sizeof_types += "ushort:" + std::to_string(sizeof(unsigned short int)) + ";";
712  sizeof_types += "int:" + std::to_string(sizeof(int)) + ";";
713  sizeof_types += "uint:" + std::to_string(sizeof(unsigned int)) + ";";
714  sizeof_types += "long:" + std::to_string(sizeof(long int)) + ";";
715  sizeof_types += "ulong:" + std::to_string(sizeof(unsigned long int)) + ";";
716  sizeof_types += "longlong:" + std::to_string(sizeof(long long int)) + ";";
717  sizeof_types += "ulonglong:" + std::to_string(sizeof(unsigned long long int)) + ";";
718  sizeof_types += "float:" + std::to_string(sizeof(float)) + ";";
719  sizeof_types += "double:" + std::to_string(sizeof(double)) + ";";
720  sizeof_types += "longdouble:" + std::to_string(sizeof(long double)) + ";";
721  sizeof_types += "voidptr:" + std::to_string(sizeof(void*)) + ";";
722 
723  result.insert(std::make_pair("type_sizeof", sizeof_types));
724 
725  std::string null_values;
726  null_values += "boolean1:" + std::to_string(serialized_null_value<bool>()) + ";";
727  null_values += "boolean8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
728  null_values += "int8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
729  null_values += "int16:" + std::to_string(serialized_null_value<int16_t>()) + ";";
730  null_values += "int32:" + std::to_string(serialized_null_value<int32_t>()) + ";";
731  null_values += "int64:" + std::to_string(serialized_null_value<int64_t>()) + ";";
732  null_values += "uint8:" + std::to_string(serialized_null_value<uint8_t>()) + ";";
733  null_values += "uint16:" + std::to_string(serialized_null_value<uint16_t>()) + ";";
734  null_values += "uint32:" + std::to_string(serialized_null_value<uint32_t>()) + ";";
735  null_values += "uint64:" + std::to_string(serialized_null_value<uint64_t>()) + ";";
736  null_values += "float32:" + std::to_string(serialized_null_value<float>()) + ";";
737  null_values += "float64:" + std::to_string(serialized_null_value<double>()) + ";";
738  null_values +=
739  "Array<boolean8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
740  null_values +=
741  "Array<int8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
742  null_values +=
743  "Array<int16>:" + std::to_string(serialized_null_value<int16_t, true>()) + ";";
744  null_values +=
745  "Array<int32>:" + std::to_string(serialized_null_value<int32_t, true>()) + ";";
746  null_values +=
747  "Array<int64>:" + std::to_string(serialized_null_value<int64_t, true>()) + ";";
748  null_values +=
749  "Array<float32>:" + std::to_string(serialized_null_value<float, true>()) + ";";
750  null_values +=
751  "Array<float64>:" + std::to_string(serialized_null_value<double, true>()) + ";";
752 
753  result.insert(std::make_pair("null_values", null_values));
754 
755  llvm::StringMap<bool> cpu_features;
756  if (llvm::sys::getHostCPUFeatures(cpu_features)) {
757  std::string features_str = "";
758  for (auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
759  features_str += (it->getValue() ? " +" : " -");
760  features_str += it->getKey().str();
761  }
762  result.insert(std::make_pair("cpu_features", features_str));
763  }
764 
765  result.insert(std::make_pair("llvm_version",
766  std::to_string(LLVM_VERSION_MAJOR) + "." +
767  std::to_string(LLVM_VERSION_MINOR) + "." +
768  std::to_string(LLVM_VERSION_PATCH)));
769 
770 #ifdef HAVE_CUDA
771  if (!cpu_only) {
772  int device_count = 0;
773  checkCudaErrors(cuDeviceGetCount(&device_count));
774  if (device_count) {
775  CUdevice device{};
776  char device_name[256];
777  int major = 0, minor = 0;
778  int driver_version;
779  checkCudaErrors(cuDeviceGet(&device, 0)); // assuming homogeneous multi-GPU system
780  checkCudaErrors(cuDeviceGetName(device_name, 256, device));
781  checkCudaErrors(cuDeviceGetAttribute(
782  &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
783  checkCudaErrors(cuDeviceGetAttribute(
784  &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
785  checkCudaErrors(cuDriverGetVersion(&driver_version));
786 
787  result.insert(std::make_pair("gpu_name", device_name));
788  result.insert(std::make_pair("gpu_count", std::to_string(device_count)));
789  result.insert(std::make_pair("gpu_compute_capability",
790  std::to_string(major) + "." + std::to_string(minor)));
791  result.insert(std::make_pair("gpu_triple", get_gpu_target_triple_string()));
792  result.insert(std::make_pair("gpu_datalayout", get_gpu_data_layout()));
793  result.insert(std::make_pair("gpu_driver",
794  "CUDA " + std::to_string(driver_version / 1000) + "." +
795  std::to_string((driver_version % 1000) / 10)));
796  }
797  }
798 #endif
799 
800  return result;
801 }
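// Illustrative entries of the returned map (actual values depend on the host and,
// when built with CUDA, on the attached GPUs):
//   {"cpu_name", "skylake-avx512"}, {"llvm_version", "9.0.1"},
//   {"gpu_compute_capability", "7.5"}, {"gpu_driver", "CUDA 11.0"}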
802 
803 namespace {
804 
805 bool is_udf_module_present(bool cpu_only = false) {
806  return (cpu_only || udf_gpu_module != nullptr) && (udf_cpu_module != nullptr);
807 }
808 
809 } // namespace
810 
811 std::shared_ptr<GpuCompilationContext> CodeGenerator::generateNativeGPUCode(
812  llvm::Function* func,
813  llvm::Function* wrapper_func,
814  const std::unordered_set<llvm::Function*>& live_funcs,
815  const CompilationOptions& co,
816  const GPUTarget& gpu_target) {
817 #ifdef HAVE_CUDA
818  auto module = func->getParent();
819  /*
820  `func` is one of the following generated functions:
821  - `call_table_function(i8** %input_col_buffers, i64*
822  %input_row_count, i64** %output_buffers, i64* %output_row_count)`
823  that wraps the user-defined table function.
824  - `multifrag_query`
825  - `multifrag_query_hoisted_literals`
826  - ...
827 
828  `wrapper_func` is table_func_kernel(i32*, i8**, i64*, i64**,
829  i64*) that wraps `call_table_function`.
830 
831  `module` is from `build/QueryEngine/RuntimeFunctions.bc` and it
832  contains `func` and `wrapper_func`. `module` should also contain
833  the definitions of user-defined table functions.
834 
835  `live_funcs` contains table_func_kernel and call_table_function
836 
837  `gpu_target.cgen_state->module_` appears to be the same as `module`
838  */
839  CHECK(gpu_target.cgen_state->module_ == module);
840  module->setDataLayout(
841  "e-p:64:64:64-i1:8:8-i8:8:8-"
842  "i16:16:16-i32:32:32-i64:64:64-"
843  "f32:32:32-f64:64:64-v16:16:16-"
844  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
845  module->setTargetTriple("nvptx64-nvidia-cuda");
846  CHECK(gpu_target.nvptx_target_machine);
847  auto pass_manager_builder = llvm::PassManagerBuilder();
848 
849  pass_manager_builder.OptLevel = 0;
850  llvm::legacy::PassManager module_pass_manager;
851  pass_manager_builder.populateModulePassManager(module_pass_manager);
852 
853  bool requires_libdevice = check_module_requires_libdevice(module);
854 
855  if (requires_libdevice) {
856  // add nvvm reflect pass replacing any NVVM conditionals with constants
857  gpu_target.nvptx_target_machine->adjustPassManager(pass_manager_builder);
858  llvm::legacy::FunctionPassManager FPM(module);
859  pass_manager_builder.populateFunctionPassManager(FPM);
860 
861  // Run the NVVMReflectPass here rather than inside optimize_ir
862  FPM.doInitialization();
863  for (auto& F : *module) {
864  FPM.run(F);
865  }
866  FPM.doFinalization();
867  }
868 
869  // run optimizations
870  optimize_ir(func, module, module_pass_manager, live_funcs, co);
871  legalize_nvvm_ir(func);
872 
873  std::stringstream ss;
874  llvm::raw_os_ostream os(ss);
875 
876  llvm::LLVMContext& ctx = module->getContext();
877  // Get "nvvm.annotations" metadata node
878  llvm::NamedMDNode* md = module->getOrInsertNamedMetadata("nvvm.annotations");
879 
880  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
881  llvm::MDString::get(ctx, "kernel"),
882  llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
883  llvm::Type::getInt32Ty(ctx), 1))};
884 
885  // Append metadata to nvvm.annotations
886  md->addOperand(llvm::MDNode::get(ctx, md_vals));
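  // In the emitted NVVM IR, the annotation marking `wrapper_func` as a CUDA kernel
  // looks roughly like this (illustrative):
  //   !nvvm.annotations = !{!0}
  //   !0 = !{void (...)* @multifrag_query, !"kernel", i32 1}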
887 
888  std::unordered_set<llvm::Function*> roots{wrapper_func, func};
889  if (gpu_target.row_func_not_inlined) {
890  clear_function_attributes(gpu_target.cgen_state->row_func_);
891  roots.insert(gpu_target.cgen_state->row_func_);
892  if (gpu_target.cgen_state->filter_func_) {
893  roots.insert(gpu_target.cgen_state->filter_func_);
894  }
895  }
896 
897  // prevent helper functions from being removed
898  for (auto f : gpu_target.cgen_state->helper_functions_) {
899  roots.insert(f);
900  }
901 
902  if (requires_libdevice) {
903  for (llvm::Function& F : *module) {
904  // Some libdevice functions call other functions that start with an "__internal_"
905  // prefix, for example:
906  // __internal_trig_reduction_slowpathd
907  // __internal_accurate_pow
908  // __internal_lgamma_pos
909  // Those functions have a "noinline" attribute which prevents the optimizer from
910  // inlining them into the body of @query_func
911  if (F.hasName() && F.getName().startswith("__internal") && !F.isDeclaration()) {
912  roots.insert(&F);
913  }
914  legalize_nvvm_ir(&F);
915  }
916  }
917 
918  // Prevent the udf function(s) from being removed the way the runtime functions are
919  std::unordered_set<std::string> udf_declarations;
920  if (is_udf_module_present()) {
921  for (auto& f : udf_gpu_module->getFunctionList()) {
922  llvm::Function* udf_function = module->getFunction(f.getName());
923 
924  if (udf_function) {
925  legalize_nvvm_ir(udf_function);
926  roots.insert(udf_function);
927 
928  // If we have a udf that declares an external function,
929  // note it so we can avoid duplicate declarations.
930  if (f.isDeclaration()) {
931  udf_declarations.insert(f.getName().str());
932  }
933  }
934  }
935  }
936 
937  if (is_rt_udf_module_present()) {
938  for (auto& f : rt_udf_gpu_module->getFunctionList()) {
939  llvm::Function* udf_function = module->getFunction(f.getName());
940  if (udf_function) {
941  legalize_nvvm_ir(udf_function);
942  roots.insert(udf_function);
943 
944  // If we have a udf that declares an external function,
945  // note it so we can avoid duplicate declarations.
946  if (f.isDeclaration()) {
947  udf_declarations.insert(f.getName().str());
948  }
949  }
950  }
951  }
952 
953  std::vector<llvm::Function*> rt_funcs;
954  for (auto& Fn : *module) {
955  if (roots.count(&Fn)) {
956  continue;
957  }
958  rt_funcs.push_back(&Fn);
959  }
960  for (auto& pFn : rt_funcs) {
961  pFn->removeFromParent();
962  }
963 
964  if (requires_libdevice) {
965  add_intrinsics_to_module(module);
966  }
967 
968  module->print(os, nullptr);
969  os.flush();
970 
971  for (auto& pFn : rt_funcs) {
972  module->getFunctionList().push_back(pFn);
973  }
974  module->eraseNamedMetadata(md);
975 
976  auto cuda_llir = ss.str() + cuda_rt_decls + extension_function_decls(udf_declarations);
977  std::string ptx;
978  try {
979  ptx = generatePTX(
980  cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state->context_);
981  } catch (ParseIRError& e) {
982  LOG(WARNING) << "Failed to generate PTX: " << e.what()
983  << ". Switching to CPU execution target.";
984  throw QueryMustRunOnCpu();
985  }
986  LOG(PTX) << "PTX for the GPU:\n" << ptx << "\nEnd of PTX";
987 
988  auto cubin_result = ptx_to_cubin(ptx, gpu_target.block_size, gpu_target.cuda_mgr);
989  auto& option_keys = cubin_result.option_keys;
990  auto& option_values = cubin_result.option_values;
991  auto cubin = cubin_result.cubin;
992  auto link_state = cubin_result.link_state;
993  const auto num_options = option_keys.size();
994 
995  auto func_name = wrapper_func->getName().str();
996  auto gpu_compilation_context = std::make_shared<GpuCompilationContext>();
997  for (int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
998  ++device_id) {
999  gpu_compilation_context->addDeviceCode(
1000  std::make_unique<GpuDeviceCompilationContext>(cubin,
1001  func_name,
1002  device_id,
1003  gpu_target.cuda_mgr,
1004  num_options,
1005  &option_keys[0],
1006  &option_values[0]));
1007  }
1008 
1009  checkCudaErrors(cuLinkDestroy(link_state));
1010  return gpu_compilation_context;
1011 #else
1012  return {};
1013 #endif
1014 }
1015 
1016 std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenGPU(
1017  llvm::Function* query_func,
1018  llvm::Function* multifrag_query_func,
1019  std::unordered_set<llvm::Function*>& live_funcs,
1020  const bool no_inline,
1021  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
1022  const CompilationOptions& co) {
1023 #ifdef HAVE_CUDA
1024  auto module = multifrag_query_func->getParent();
1025 
1026  CHECK(cuda_mgr);
1027  CodeCacheKey key{serialize_llvm_object(query_func),
1028  serialize_llvm_object(cgen_state_->row_func_)};
1029  if (cgen_state_->filter_func_) {
1030  key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
1031  }
1032  for (const auto helper : cgen_state_->helper_functions_) {
1033  key.push_back(serialize_llvm_object(helper));
1034  }
1035  auto cached_code = getCodeFromCache(key, gpu_code_cache_);
1036  if (cached_code) {
1037  return cached_code;
1038  }
1039 
1040  bool row_func_not_inlined = false;
1041  if (no_inline) {
1042  for (auto it = llvm::inst_begin(cgen_state_->row_func_),
1043  e = llvm::inst_end(cgen_state_->row_func_);
1044  it != e;
1045  ++it) {
1046  if (llvm::isa<llvm::CallInst>(*it)) {
1047  auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
1048  if (get_gv_call.getCalledFunction()->getName() == "array_size" ||
1049  get_gv_call.getCalledFunction()->getName() == "linear_probabilistic_count") {
1050  mark_function_never_inline(cgen_state_->row_func_);
1051  row_func_not_inlined = true;
1052  break;
1053  }
1054  }
1055  }
1056  }
1057 
1058  initializeNVPTXBackend();
1059  CodeGenerator::GPUTarget gpu_target{nvptx_target_machine_.get(),
1060  cuda_mgr,
1061  blockSize(),
1062  cgen_state_.get(),
1063  row_func_not_inlined};
1064  std::shared_ptr<GpuCompilationContext> compilation_context;
1065 
1066  if (check_module_requires_libdevice(module)) {
1067  if (g_rt_libdevice_module == nullptr) {
1068  // raise error
1069  throw std::runtime_error(
1070  "libdevice library is not available but required by the UDF module");
1071  }
1072 
1073  // Bind libdevice to the current module
1074  CodeGenerator::link_udf_module(g_rt_libdevice_module,
1075  *module,
1076  cgen_state_.get(),
1077  llvm::Linker::Flags::OverrideFromSrc);
1078 
1079  // activate nvvm-reflect-ftz flag on the module
1080  module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", (int)1);
1081  for (llvm::Function& fn : *module) {
1082  fn.addFnAttr("nvptx-f32ftz", "true");
1083  }
1084  }
1085 
1086  try {
1087  compilation_context = CodeGenerator::generateNativeGPUCode(
1088  query_func, multifrag_query_func, live_funcs, co, gpu_target);
1089  addCodeToCache(key, compilation_context, module, gpu_code_cache_);
1090  } catch (CudaMgr_Namespace::CudaErrorException& cuda_error) {
1091  if (cuda_error.getStatus() == CUDA_ERROR_OUT_OF_MEMORY) {
1092  // Thrown if memory could not be allocated on the GPU.
1093  // Retry once after evicting a portion of the code cache.
1094  LOG(WARNING) << "Failed to allocate GPU memory for generated code. Evicting "
1095  << g_fraction_code_cache_to_evict * 100.
1096  << "% of GPU code cache and re-trying.";
1097  gpu_code_cache_.evictFractionEntries(g_fraction_code_cache_to_evict);
1098  compilation_context = CodeGenerator::generateNativeGPUCode(
1099  query_func, multifrag_query_func, live_funcs, co, gpu_target);
1100  addCodeToCache(key, compilation_context, module, gpu_code_cache_);
1101  } else {
1102  throw;
1103  }
1104  }
1105  CHECK(compilation_context);
1106  return compilation_context;
1107 #else
1108  return nullptr;
1109 #endif
1110 }
1111 
1112 std::string CodeGenerator::generatePTX(const std::string& cuda_llir,
1113  llvm::TargetMachine* nvptx_target_machine,
1114  llvm::LLVMContext& context) {
1115  auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir, "", false);
1116 
1117  llvm::SMDiagnostic parse_error;
1118 
1119  auto module = llvm::parseIR(mem_buff->getMemBufferRef(), parse_error, context);
1120  if (!module) {
1121  LOG(IR) << "CodeGenerator::generatePTX:NVVM IR:\n" << cuda_llir << "\nEnd of NVVM IR";
1122  throw_parseIR_error(parse_error, "generatePTX", /* is_gpu= */ true);
1123  }
1124 
1125  llvm::SmallString<256> code_str;
1126  llvm::raw_svector_ostream formatted_os(code_str);
1127  CHECK(nvptx_target_machine);
1128  {
1129  llvm::legacy::PassManager ptxgen_pm;
1130  module->setDataLayout(nvptx_target_machine->createDataLayout());
1131 
1132 #if LLVM_VERSION_MAJOR >= 10
1133  nvptx_target_machine->addPassesToEmitFile(
1134  ptxgen_pm, formatted_os, nullptr, llvm::CGFT_AssemblyFile);
1135 #else
1136  nvptx_target_machine->addPassesToEmitFile(
1137  ptxgen_pm, formatted_os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
1138 #endif
1139  ptxgen_pm.run(*module);
1140  }
1141 
1142 #if LLVM_VERSION_MAJOR >= 11
1143  return std::string(code_str);
1144 #else
1145  return code_str.str();
1146 #endif
1147 }
1148 
1149 std::unique_ptr<llvm::TargetMachine> CodeGenerator::initializeNVPTXBackend(
1150  const CudaMgr_Namespace::NvidiaDeviceArch arch) {
1151  llvm::InitializeAllTargets();
1152  llvm::InitializeAllTargetMCs();
1153  llvm::InitializeAllAsmPrinters();
1154  std::string err;
1155  auto target = llvm::TargetRegistry::lookupTarget("nvptx64", err);
1156  if (!target) {
1157  LOG(FATAL) << err;
1158  }
1159  return std::unique_ptr<llvm::TargetMachine>(
1160  target->createTargetMachine("nvptx64-nvidia-cuda",
1162  "",
1163  llvm::TargetOptions(),
1164  llvm::Reloc::Static));
1165 }
1166 
1167 std::string Executor::generatePTX(const std::string& cuda_llir) const {
1168  return CodeGenerator::generatePTX(
1169  cuda_llir, nvptx_target_machine_.get(), cgen_state_->context_);
1170 }
1171 
1172 void Executor::initializeNVPTXBackend() const {
1173  if (nvptx_target_machine_) {
1174  return;
1175  }
1176  const auto arch = cudaMgr()->getDeviceArch();
1177  nvptx_target_machine_ = CodeGenerator::initializeNVPTXBackend(arch);
1178 }
1179 
1180 // A small number of runtime functions don't get through CgenState::emitCall. List them
1181 // explicitly here and always clone their implementation from the runtime module.
1182 bool CodeGenerator::alwaysCloneRuntimeFunction(const llvm::Function* func) {
1183  return func->getName() == "query_stub_hoisted_literals" ||
1184  func->getName() == "multifrag_query_hoisted_literals" ||
1185  func->getName() == "query_stub" || func->getName() == "multifrag_query" ||
1186  func->getName() == "fixed_width_int_decode" ||
1187  func->getName() == "fixed_width_unsigned_decode" ||
1188  func->getName() == "diff_fixed_width_int_decode" ||
1189  func->getName() == "fixed_width_double_decode" ||
1190  func->getName() == "fixed_width_float_decode" ||
1191  func->getName() == "fixed_width_small_date_decode" ||
1192  func->getName() == "record_error_code" || func->getName() == "get_error_code" ||
1193  func->getName() == "pos_start_impl" || func->getName() == "pos_step_impl" ||
1194  func->getName() == "group_buff_idx_impl" ||
1195  func->getName() == "init_shared_mem" ||
1196  func->getName() == "init_shared_mem_nop" || func->getName() == "write_back_nop";
1197 }
1198 
1199 llvm::Module* read_template_module(llvm::LLVMContext& context) {
1200  llvm::SMDiagnostic err;
1201 
1202  auto buffer_or_error = llvm::MemoryBuffer::getFile(omnisci::get_root_abs_path() +
1203  "/QueryEngine/RuntimeFunctions.bc");
1204  CHECK(!buffer_or_error.getError()) << "root path=" << omnisci::get_root_abs_path();
1205  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1206 
1207  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1208  CHECK(!owner.takeError());
1209  auto module = owner.get().release();
1210  CHECK(module);
1211 
1212  return module;
1213 }
1214 
1215 #ifdef HAVE_CUDA
1216 llvm::Module* read_libdevice_module(llvm::LLVMContext& context) {
1217  llvm::SMDiagnostic err;
1218  const auto env = get_cuda_home();
1219 
1220  boost::filesystem::path cuda_path{env};
1221  cuda_path /= "nvvm";
1222  cuda_path /= "libdevice";
1223  cuda_path /= "libdevice.10.bc";
1224 
1225  if (!boost::filesystem::exists(cuda_path)) {
1226  LOG(WARNING) << "Could not find CUDA libdevice; support for some UDF "
1227  "functions might not be available.";
1228  return nullptr;
1229  }
1230 
1231  auto buffer_or_error = llvm::MemoryBuffer::getFile(cuda_path.c_str());
1232  CHECK(!buffer_or_error.getError()) << "cuda_path=" << cuda_path.c_str();
1233  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1234 
1235  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1236  CHECK(!owner.takeError());
1237  auto module = owner.get().release();
1238  CHECK(module);
1239 
1240  return module;
1241 }
1242 #endif
1243 
1244 #ifdef ENABLE_GEOS
1245 llvm::Module* read_geos_module(llvm::LLVMContext& context) {
1246  llvm::SMDiagnostic err;
1247 
1248  auto buffer_or_error = llvm::MemoryBuffer::getFile(omnisci::get_root_abs_path() +
1249  "/QueryEngine/GeosRuntime.bc");
1250  CHECK(!buffer_or_error.getError()) << "root path=" << omnisci::get_root_abs_path();
1251  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1252 
1253  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1254  CHECK(!owner.takeError());
1255  auto module = owner.get().release();
1256  CHECK(module);
1257 
1258  return module;
1259 }
1260 #endif
1261 
1262 namespace {
1263 
1264 void bind_pos_placeholders(const std::string& pos_fn_name,
1265  const bool use_resume_param,
1266  llvm::Function* query_func,
1267  llvm::Module* module) {
1268  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
1269  ++it) {
1270  if (!llvm::isa<llvm::CallInst>(*it)) {
1271  continue;
1272  }
1273  auto& pos_call = llvm::cast<llvm::CallInst>(*it);
1274  if (std::string(pos_call.getCalledFunction()->getName()) == pos_fn_name) {
1275  if (use_resume_param) {
1276  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1277  llvm::ReplaceInstWithInst(
1278  &pos_call,
1279  llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl"),
1280  error_code_arg));
1281  } else {
1282  llvm::ReplaceInstWithInst(
1283  &pos_call,
1284  llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl")));
1285  }
1286  break;
1287  }
1288  }
1289 }
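// Illustrative sketch, assuming a query stub that calls the bare placeholder:
// bind_pos_placeholders rewrites the IR roughly as follows.
//
//   ; before binding (placeholder emitted by the query template)
//   %pos = call i32 @pos_start()
//
//   ; after binding with use_resume_param == true
//   %pos = call i32 @pos_start_impl(i32* %error_code)
//
//   ; after binding with use_resume_param == false
//   %pos = call i32 @pos_start_impl()
//
// Binding late to the *_impl variant is what lets the same query template be
// retargeted to the CPU or GPU runtime module (contiguous vs. strided access,
// as noted in compileWorkUnit below).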
1290 
1291 void set_row_func_argnames(llvm::Function* row_func,
1292  const size_t in_col_count,
1293  const size_t agg_col_count,
1294  const bool hoist_literals) {
1295  auto arg_it = row_func->arg_begin();
1296 
1297  if (agg_col_count) {
1298  for (size_t i = 0; i < agg_col_count; ++i) {
1299  arg_it->setName("out");
1300  ++arg_it;
1301  }
1302  } else {
1303  arg_it->setName("group_by_buff");
1304  ++arg_it;
1305  arg_it->setName("varlen_output_buff");
1306  ++arg_it;
1307  arg_it->setName("crt_matched");
1308  ++arg_it;
1309  arg_it->setName("total_matched");
1310  ++arg_it;
1311  arg_it->setName("old_total_matched");
1312  ++arg_it;
1313  arg_it->setName("max_matched");
1314  ++arg_it;
1315  }
1316 
1317  arg_it->setName("agg_init_val");
1318  ++arg_it;
1319 
1320  arg_it->setName("pos");
1321  ++arg_it;
1322 
1323  arg_it->setName("frag_row_off");
1324  ++arg_it;
1325 
1326  arg_it->setName("num_rows_per_scan");
1327  ++arg_it;
1328 
1329  if (hoist_literals) {
1330  arg_it->setName("literals");
1331  ++arg_it;
1332  }
1334  for (size_t i = 0; i < in_col_count; ++i) {
1335  arg_it->setName("col_buf" + std::to_string(i));
1336  ++arg_it;
1337  }
1338 
1339  arg_it->setName("join_hash_tables");
1340 }
1341 
1342 llvm::Function* create_row_function(const size_t in_col_count,
1343  const size_t agg_col_count,
1344  const bool hoist_literals,
1345  llvm::Module* module,
1346  llvm::LLVMContext& context) {
1347  std::vector<llvm::Type*> row_process_arg_types;
1348 
1349  if (agg_col_count) {
1350  // output (aggregate) arguments
1351  for (size_t i = 0; i < agg_col_count; ++i) {
1352  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1353  }
1354  } else {
1355  // group by buffer
1356  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1357  // varlen output buffer
1358  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1359  // current match count
1360  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1361  // total match count passed from the caller
1362  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1363  // old total match count returned to the caller
1364  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1365  // max matched (total number of slots in the output buffer)
1366  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1367  }
1368 
1369  // aggregate init values
1370  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1371 
1372  // position argument
1373  row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));
1374 
1375  // fragment row offset argument
1376  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1377 
1378  // number of rows for each scan
1379  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1380 
1381  // literals buffer argument
1382  if (hoist_literals) {
1383  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
1384  }
1385 
1386  // column buffer arguments
1387  for (size_t i = 0; i < in_col_count; ++i) {
1388  row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
1389  }
1390 
1391  // join hash table argument
1392  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1394  // generate the function
1395  auto ft =
1396  llvm::FunctionType::get(get_int_type(32, context), row_process_arg_types, false);
1397 
1398  auto row_func =
1399  llvm::Function::Create(ft, llvm::Function::ExternalLinkage, "row_func", module);
1400 
1401  // set the row function argument names; for debugging purposes only
1402  set_row_func_argnames(row_func, in_col_count, agg_col_count, hoist_literals);
1404  return row_func;
1405 }
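// Illustrative example of the resulting signature: with in_col_count = 2,
// agg_col_count = 1 and hoist_literals = true, the generated row function is
// equivalent to
//
//   i32 @row_func(i64* %out,              ; one output slot per aggregate
//                 i64* %agg_init_val,
//                 i64  %pos,
//                 i64* %frag_row_off,
//                 i64* %num_rows_per_scan,
//                 i8*  %literals,
//                 i8*  %col_buf0,
//                 i8*  %col_buf1,
//                 i64* %join_hash_tables)
//
// With agg_col_count = 0 (the group-by case), the leading "out" pointers are
// replaced by the group_by_buff / matched-count arguments named above.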
1406 
1407 // Iterate through multifrag_query_func, replacing calls to query_fname with query_func.
1408 void bind_query(llvm::Function* query_func,
1409  const std::string& query_fname,
1410  llvm::Function* multifrag_query_func,
1411  llvm::Module* module) {
1412  std::vector<llvm::CallInst*> query_stubs;
1413  for (auto it = llvm::inst_begin(multifrag_query_func),
1414  e = llvm::inst_end(multifrag_query_func);
1415  it != e;
1416  ++it) {
1417  if (!llvm::isa<llvm::CallInst>(*it)) {
1418  continue;
1419  }
1420  auto& query_call = llvm::cast<llvm::CallInst>(*it);
1421  if (std::string(query_call.getCalledFunction()->getName()) == query_fname) {
1422  query_stubs.push_back(&query_call);
1423  }
1424  }
1425  for (auto& S : query_stubs) {
1426  std::vector<llvm::Value*> args;
1427  for (size_t i = 0; i < S->getNumArgOperands(); ++i) {
1428  args.push_back(S->getArgOperand(i));
1429  }
1430  llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args, ""));
1431  }
1432 }
1433 
1434 std::vector<std::string> get_agg_fnames(const std::vector<Analyzer::Expr*>& target_exprs,
1435  const bool is_group_by) {
1436  std::vector<std::string> result;
1437  for (size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
1438  ++target_idx, ++agg_col_idx) {
1439  const auto target_expr = target_exprs[target_idx];
1440  CHECK(target_expr);
1441  const auto target_type_info = target_expr->get_type_info();
1442  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
1443  const bool is_varlen =
1444  (target_type_info.is_string() &&
1445  target_type_info.get_compression() == kENCODING_NONE) ||
1446  target_type_info.is_array(); // TODO: should it use is_varlen_array() ?
1447  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
1448  result.emplace_back(target_type_info.is_fp() ? "agg_id_double" : "agg_id");
1449  if (is_varlen) {
1450  result.emplace_back("agg_id");
1451  }
1452  if (target_type_info.is_geometry()) {
1453  result.emplace_back("agg_id");
1454  for (auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
1455  result.emplace_back("agg_id");
1456  }
1457  }
1458  continue;
1459  }
1460  const auto agg_type = agg_expr->get_aggtype();
1461  const auto& agg_type_info =
1462  agg_type != kCOUNT ? agg_expr->get_arg()->get_type_info() : target_type_info;
1463  switch (agg_type) {
1464  case kAVG: {
1465  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1466  !agg_type_info.is_fp()) {
1467  throw std::runtime_error("AVG is only valid on integer and floating point");
1468  }
1469  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1470  ? "agg_sum"
1471  : "agg_sum_double");
1472  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1473  ? "agg_count"
1474  : "agg_count_double");
1475  break;
1476  }
1477  case kMIN: {
1478  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1479  agg_type_info.is_geometry()) {
1480  throw std::runtime_error(
1481  "MIN on strings, arrays or geospatial types not supported yet");
1482  }
1483  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1484  ? "agg_min"
1485  : "agg_min_double");
1486  break;
1487  }
1488  case kMAX: {
1489  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1490  agg_type_info.is_geometry()) {
1491  throw std::runtime_error(
1492  "MAX on strings, arrays or geospatial types not supported yet");
1493  }
1494  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1495  ? "agg_max"
1496  : "agg_max_double");
1497  break;
1498  }
1499  case kSUM: {
1500  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1501  !agg_type_info.is_fp()) {
1502  throw std::runtime_error("SUM is only valid on integer and floating point");
1503  }
1504  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1505  ? "agg_sum"
1506  : "agg_sum_double");
1507  break;
1508  }
1509  case kCOUNT:
1510  result.emplace_back(agg_expr->get_is_distinct() ? "agg_count_distinct"
1511  : "agg_count");
1512  break;
1513  case kSINGLE_VALUE: {
1514  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1515  break;
1516  }
1517  case kSAMPLE: {
1518  // Note that varlen SAMPLE arguments are handled separately above
1519  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1520  break;
1521  }
1522  case kAPPROX_COUNT_DISTINCT:
1523  result.emplace_back("agg_approximate_count_distinct");
1524  break;
1525  case kAPPROX_QUANTILE:
1526  result.emplace_back("agg_approx_quantile");
1527  break;
1528  default:
1529  CHECK(false);
1530  }
1531  }
1532  return result;
1533 }
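// Illustrative example for a hypothetical target list
//   COUNT(*), AVG(d_double), MAX(i_int), SAMPLE(str_none_encoded)
// get_agg_fnames would return
//   {"agg_count",                            // COUNT(*)
//    "agg_sum_double", "agg_count_double",   // AVG expands to a sum and a count
//    "agg_max",                              // MAX on an integer argument
//    "agg_id", "agg_id"}                     // varlen SAMPLE takes two slots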
1534 
1535 } // namespace
1536 
1537 std::unique_ptr<llvm::Module> g_rt_module(read_template_module(getGlobalLLVMContext()));
1538 
1539 #ifdef ENABLE_GEOS
1540 std::unique_ptr<llvm::Module> g_rt_geos_module(read_geos_module(getGlobalLLVMContext()));
1541 #endif
1542 
1543 #ifdef HAVE_CUDA
1544 std::unique_ptr<llvm::Module> g_rt_libdevice_module(
1545  read_libdevice_module(getGlobalLLVMContext()));
1546 #endif
1547 
1548 bool is_rt_udf_module_present(bool cpu_only) {
1549  return (cpu_only || rt_udf_gpu_module != nullptr) && (rt_udf_cpu_module != nullptr);
1550 }
1551 
1552 namespace {
1553 
1554 void read_udf_gpu_module(const std::string& udf_ir_filename) {
1555  llvm::SMDiagnostic parse_error;
1556 
1557  llvm::StringRef file_name_arg(udf_ir_filename);
1558  udf_gpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
1559 
1560  if (!udf_gpu_module) {
1561  throw_parseIR_error(parse_error, udf_ir_filename, /* is_gpu= */ true);
1562  }
1564  llvm::Triple gpu_triple(udf_gpu_module->getTargetTriple());
1565  if (!gpu_triple.isNVPTX()) {
1566  LOG(WARNING)
1567  << "Expected triple nvptx64-nvidia-cuda for NVVM IR of loadtime UDFs but got "
1568  << gpu_triple.str() << ". Disabling the NVVM IR module.";
1569  udf_gpu_module = nullptr;
1570  }
1571 }
1572 
1573 void read_udf_cpu_module(const std::string& udf_ir_filename) {
1574  llvm::SMDiagnostic parse_error;
1575 
1576  llvm::StringRef file_name_arg(udf_ir_filename);
1577 
1578  udf_cpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
1579  if (!udf_cpu_module) {
1580  throw_parseIR_error(parse_error, udf_ir_filename);
1581  }
1582 }
1583 
1584 } // namespace
1585 
1586 void Executor::addUdfIrToModule(const std::string& udf_ir_filename,
1587  const bool is_cuda_ir) {
1588  if (is_cuda_ir) {
1589  read_udf_gpu_module(udf_ir_filename);
1590  } else {
1591  read_udf_cpu_module(udf_ir_filename);
1592  }
1593 }
1594 
1595 void read_rt_udf_gpu_module(const std::string& udf_ir_string) {
1596  llvm::SMDiagnostic parse_error;
1597 
1598  auto buf =
1599  std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for GPU");
1600 
1601  rt_udf_gpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
1602  if (!rt_udf_gpu_module) {
1603  LOG(IR) << "read_rt_udf_gpu_module:NVVM IR:\n" << udf_ir_string << "\nEnd of NVVM IR";
1604  throw_parseIR_error(parse_error, "", /* is_gpu= */ true);
1605  }
1606 
1607  llvm::Triple gpu_triple(rt_udf_gpu_module->getTargetTriple());
1608  if (!gpu_triple.isNVPTX()) {
1609  LOG(IR) << "read_rt_udf_gpu_module:NVVM IR:\n" << udf_ir_string << "\nEnd of NVVM IR";
1610  LOG(WARNING) << "Expected triple nvptx64-nvidia-cuda for NVVM IR but got "
1611  << gpu_triple.str()
1612  << ". Executing runtime UDFs on GPU will be disabled.";
1613  rt_udf_gpu_module = nullptr;
1614  return;
1615  }
1616 }
1617 
1618 void read_rt_udf_cpu_module(const std::string& udf_ir_string) {
1619  llvm::SMDiagnostic parse_error;
1620 
1621  auto buf =
1622  std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for CPU");
1623 
1624  rt_udf_cpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
1625  if (!rt_udf_cpu_module) {
1626  LOG(IR) << "read_rt_udf_cpu_module:LLVM IR:\n" << udf_ir_string << "\nEnd of LLVM IR";
1627  throw_parseIR_error(parse_error);
1628  }
1629 }
1630 
1631 std::unordered_set<llvm::Function*> CodeGenerator::markDeadRuntimeFuncs(
1632  llvm::Module& module,
1633  const std::vector<llvm::Function*>& roots,
1634  const std::vector<llvm::Function*>& leaves) {
1635  std::unordered_set<llvm::Function*> live_funcs;
1636  live_funcs.insert(roots.begin(), roots.end());
1637  live_funcs.insert(leaves.begin(), leaves.end());
1638 
1639  if (auto F = module.getFunction("init_shared_mem_nop")) {
1640  live_funcs.insert(F);
1641  }
1642  if (auto F = module.getFunction("write_back_nop")) {
1643  live_funcs.insert(F);
1644  }
1645 
1646  for (const llvm::Function* F : roots) {
1647  for (const llvm::BasicBlock& BB : *F) {
1648  for (const llvm::Instruction& I : BB) {
1649  if (const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
1650  live_funcs.insert(CI->getCalledFunction());
1651  }
1652  }
1653  }
1654  }
1656  for (llvm::Function& F : module) {
1657  if (!live_funcs.count(&F) && !F.isDeclaration()) {
1658  F.setLinkage(llvm::GlobalValue::InternalLinkage);
1659  }
1660  }
1661 
1662  return live_funcs;
1663 }
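// Functions demoted to InternalLinkage above are not removed here; they merely
// become eligible for deletion once a dead-code-elimination pass runs over the
// module. A minimal standalone sketch of that follow-up step (not the pass
// pipeline used elsewhere in this codebase):
//
//   #include <llvm/IR/LegacyPassManager.h>
//   #include <llvm/Transforms/IPO.h>
//
//   void drop_unreferenced_internal_funcs(llvm::Module& module) {
//     llvm::legacy::PassManager pm;
//     pm.add(llvm::createGlobalDCEPass());  // deletes unused internal functions/globals
//     pm.run(module);
//   }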
1664 
1665 namespace {
1666 // searches for a particular variable within a specific basic block (or all if bb_name is
1667 // empty)
1668 template <typename InstType>
1669 llvm::Value* find_variable_in_basic_block(llvm::Function* func,
1670  std::string bb_name,
1671  std::string variable_name) {
1672  llvm::Value* result = nullptr;
1673  if (func == nullptr || variable_name.empty()) {
1674  return result;
1675  }
1676  bool is_found = false;
1677  for (auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
1678  if (!bb_name.empty() && bb_it->getName() != bb_name) {
1679  continue;
1680  }
1681  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
1682  if (llvm::isa<InstType>(*inst_it)) {
1683  if (inst_it->getName() == variable_name) {
1684  result = &*inst_it;
1685  is_found = true;
1686  break;
1687  }
1688  }
1689  }
1690  }
1691  return result;
1692 }
1693 }  // namespace
1694 
1695 void Executor::createErrorCheckControlFlow(
1696  llvm::Function* query_func,
1697  bool run_with_dynamic_watchdog,
1698  bool run_with_allowing_runtime_interrupt,
1699  ExecutorDeviceType device_type,
1700  const std::vector<InputTableInfo>& input_table_infos) {
1701  AUTOMATIC_IR_METADATA(cgen_state_.get());
1702 
1703  // check whether the row processing was successful; currently, it can
1704  // fail by running out of group by buffer slots
1705 
1706  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
1707  // when both the dynamic watchdog and the runtime interrupt are turned on,
1708  // we use the dynamic watchdog
1709  run_with_allowing_runtime_interrupt = false;
1710  }
1711 
1712  {
1713  // disable injecting query interrupt checker if the session info is invalid
1714  mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
1715  if (current_query_session_.empty()) {
1716  run_with_allowing_runtime_interrupt = false;
1717  }
1718  }
1719 
1720  llvm::Value* row_count = nullptr;
1721  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1722  device_type == ExecutorDeviceType::GPU) {
1723  row_count =
1724  find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
1725  }
1726 
1727  bool done_splitting = false;
1728  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1729  ++bb_it) {
1730  llvm::Value* pos = nullptr;
1731  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1732  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1733  llvm::isa<llvm::PHINode>(*inst_it)) {
1734  if (inst_it->getName() == "pos") {
1735  pos = &*inst_it;
1736  }
1737  continue;
1738  }
1739  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1740  continue;
1741  }
1742  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
1743  if (std::string(row_func_call.getCalledFunction()->getName()) == "row_process") {
1744  auto next_inst_it = inst_it;
1745  ++next_inst_it;
1746  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1747  auto& br_instr = bb_it->back();
1748  llvm::IRBuilder<> ir_builder(&br_instr);
1749  llvm::Value* err_lv = &*inst_it;
1750  llvm::Value* err_lv_returned_from_row_func = nullptr;
1751  if (run_with_dynamic_watchdog) {
1752  CHECK(pos);
1753  llvm::Value* call_watchdog_lv = nullptr;
1754  if (device_type == ExecutorDeviceType::GPU) {
1755  // In order to make sure all threads within a block see the same barrier,
1756  // only those blocks none of whose threads have crossed the critical
1757  // edge will go through the dynamic watchdog computation
1758  CHECK(row_count);
1759  auto crit_edge_rem =
1760  (blockSize() & (blockSize() - 1))
1761  ? ir_builder.CreateSRem(
1762  row_count,
1763  cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1764  : ir_builder.CreateAnd(
1765  row_count,
1766  cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1767  auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1768  crit_edge_threshold->setName("crit_edge_threshold");
1770  // only those threads where pos < crit_edge_threshold go through dynamic
1771  // watchdog call
1772  call_watchdog_lv =
1773  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
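  // Worked example with illustrative numbers: with row_count = 1000 and
  // blockSize() = 256 (a power of two), crit_edge_rem = 1000 & 255 = 232 and
  // crit_edge_threshold = 1000 - 232 = 768, so only positions in the first
  // 768 rows (three full blocks' worth of rows) take the watchdog branch; the
  // remaining 232 rows skip it, avoiding a divergent barrier in the partially
  // filled block.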
1774  } else {
1775  // CPU path: run watchdog for every 64th row
1776  auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1777  call_watchdog_lv = ir_builder.CreateICmp(
1778  llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1779  }
1780  CHECK(call_watchdog_lv);
1781  auto error_check_bb = bb_it->splitBasicBlock(
1782  llvm::BasicBlock::iterator(br_instr), ".error_check");
1783  auto& watchdog_br_instr = bb_it->back();
1784 
1785  auto watchdog_check_bb = llvm::BasicBlock::Create(
1786  cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
1787  llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1788  auto detected_timeout = watchdog_ir_builder.CreateCall(
1789  cgen_state_->module_->getFunction("dynamic_watchdog"), {});
1790  auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1791  detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
1792  watchdog_ir_builder.CreateBr(error_check_bb);
1793 
1794  llvm::ReplaceInstWithInst(
1795  &watchdog_br_instr,
1796  llvm::BranchInst::Create(
1797  watchdog_check_bb, error_check_bb, call_watchdog_lv));
1798  ir_builder.SetInsertPoint(&br_instr);
1799  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1800 
1801  unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1802  unified_err_lv->addIncoming(err_lv, &*bb_it);
1803  err_lv = unified_err_lv;
1804  } else if (run_with_allowing_runtime_interrupt) {
1805  CHECK(pos);
1806  llvm::Value* call_check_interrupt_lv = nullptr;
1807  if (device_type == ExecutorDeviceType::GPU) {
1808  // approximate how many times the %pos variable
1809  // is incremented --> the number of iterations
1810  // here we calculate the # bit shift by considering grid/block/fragment sizes
1811  // since if we used a fixed value (i.e., every 64-th increment)
1812  // some CUDA threads could never enter the interrupt checking block depending on
1813  // the fragment size --> a thread may never reach a 64-th increment if the outer
1814  // table is not sufficiently large, and so could never be interrupted
1815  int32_t num_shift_by_gridDim = shared::getExpOfTwo(gridSize());
1816  int32_t num_shift_by_blockDim = shared::getExpOfTwo(blockSize());
1817  int64_t total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
1818  uint64_t interrupt_checking_freq = 32;
1819  auto freq_control_knob = g_running_query_interrupt_freq;
1820  CHECK_GT(freq_control_knob, 0);
1821  CHECK_LE(freq_control_knob, 1.0);
1822  if (!input_table_infos.empty()) {
1823  const auto& outer_table_info = *input_table_infos.begin();
1824  auto num_outer_table_tuples = outer_table_info.info.getNumTuples();
1825  if (outer_table_info.table_id < 0) {
1826  auto* rs = (*outer_table_info.info.fragments.begin()).resultSet;
1827  CHECK(rs);
1828  num_outer_table_tuples = rs->entryCount();
1829  } else {
1830  auto num_frags = outer_table_info.info.fragments.size();
1831  if (num_frags > 0) {
1832  num_outer_table_tuples =
1833  outer_table_info.info.fragments.begin()->getNumTuples();
1834  }
1835  }
1836  if (num_outer_table_tuples > 0) {
1837  // gridSize * blockSize --> pos_step (idx of the next row per thread)
1838  // we additionally multiply pos_step by two since the number of
1839  // dispatched blocks is double the gridSize
1840  // # tuples (of fragment) / pos_step --> maximum # increments (K)
1841  // we also scale K by the freq_control_knob to control the frequency
1842  // So, need to check the interrupt status more frequently? make K smaller
1843  auto max_inc = uint64_t(
1844  floor(num_outer_table_tuples / (gridSize() * blockSize() * 2)));
1845  if (max_inc < 2) {
1846  // `max_inc` is too small, so this correction is necessary to keep
1847  // `interrupt_checking_freq` valid (i.e., larger than zero)
1848  max_inc = 2;
1849  }
1850  auto calibrated_inc = uint64_t(floor(max_inc * (1 - freq_control_knob)));
1851  interrupt_checking_freq =
1852  uint64_t(pow(2, shared::getExpOfTwo(calibrated_inc)));
1853  // cover the case where interrupt_checking_freq > K;
1854  // otherwise, some threads could still never branch to the interrupt checker,
1855  // so we manually use a frequency smaller than (but close to) max_inc
1856  if (interrupt_checking_freq > max_inc) {
1857  interrupt_checking_freq = max_inc / 2;
1858  }
1859  if (interrupt_checking_freq < 8) {
1860  // such a small freq incurs overly frequent interrupt status checks,
1861  // so we clamp it to a reasonable minimum value
1862  interrupt_checking_freq = 8;
1863  }
1864  }
1865  }
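  // Worked example with illustrative numbers (assuming shared::getExpOfTwo
  // returns floor(log2(x))): for 8,388,608 outer tuples, gridSize() = 64 and
  // blockSize() = 512, pos_step = 64 * 512 * 2 = 65,536 and
  // max_inc = 8,388,608 / 65,536 = 128; with freq_control_knob = 0.5,
  // calibrated_inc = 64, so interrupt_checking_freq = 2^6 = 64, which already
  // lies in [8, max_inc] and is used as-is.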
1866  VLOG(1) << "Set the running query interrupt checking frequency: "
1867  << interrupt_checking_freq;
1868  // check the interrupt flag for every interrupt_checking_freq-th iteration
1869  llvm::Value* pos_shifted_per_iteration =
1870  ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
1871  auto interrupt_predicate =
1872  ir_builder.CreateAnd(pos_shifted_per_iteration, interrupt_checking_freq);
1873  call_check_interrupt_lv =
1874  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1875  interrupt_predicate,
1876  cgen_state_->llInt(int64_t(0LL)));
1877  } else {
1878  // CPU path: run interrupt checker for every 64th row
1879  auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1880  call_check_interrupt_lv =
1881  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1882  interrupt_predicate,
1883  cgen_state_->llInt(int64_t(0LL)));
1884  }
1885  CHECK(call_check_interrupt_lv);
1886  auto error_check_bb = bb_it->splitBasicBlock(
1887  llvm::BasicBlock::iterator(br_instr), ".error_check");
1888  auto& check_interrupt_br_instr = bb_it->back();
1889 
1890  auto interrupt_check_bb = llvm::BasicBlock::Create(
1891  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
1892  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
1893  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
1894  cgen_state_->module_->getFunction("check_interrupt"), {});
1895  auto interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
1896  detected_interrupt, cgen_state_->llInt(Executor::ERR_INTERRUPTED), err_lv);
1897  interrupt_checker_ir_builder.CreateBr(error_check_bb);
1898 
1899  llvm::ReplaceInstWithInst(
1900  &check_interrupt_br_instr,
1901  llvm::BranchInst::Create(
1902  interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
1903  ir_builder.SetInsertPoint(&br_instr);
1904  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1905 
1906  unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
1907  unified_err_lv->addIncoming(err_lv, &*bb_it);
1908  err_lv = unified_err_lv;
1909  }
1910  if (!err_lv_returned_from_row_func) {
1911  err_lv_returned_from_row_func = err_lv;
1912  }
1913  if (device_type == ExecutorDeviceType::GPU && g_enable_dynamic_watchdog) {
1914  // let kernel execution finish as expected, regardless of the observed error,
1915  // unless it is from the dynamic watchdog where all threads within that block
1916  // return together.
1917  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1918  err_lv,
1919  cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
1920  } else {
1921  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
1922  err_lv,
1923  cgen_state_->llInt(static_cast<int32_t>(0)));
1924  }
1925  auto error_bb = llvm::BasicBlock::Create(
1926  cgen_state_->context_, ".error_exit", query_func, new_bb);
1927  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1928  llvm::CallInst::Create(
1929  cgen_state_->module_->getFunction("record_error_code"),
1930  std::vector<llvm::Value*>{err_lv_returned_from_row_func, error_code_arg},
1931  "",
1932  error_bb);
1933  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
1934  llvm::ReplaceInstWithInst(&br_instr,
1935  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
1936  done_splitting = true;
1937  break;
1938  }
1939  }
1940  }
1941  CHECK(done_splitting);
1942 }
1943 
1944 std::vector<llvm::Value*> Executor::inlineHoistedLiterals() {
1945  AUTOMATIC_IR_METADATA(cgen_state_.get());
1946 
1947  std::vector<llvm::Value*> hoisted_literals;
1948 
1949  // row_func_ uses literals whose defs have been hoisted up to query_func_;
1950  // extend the row_func_ signature to include extra args to pass these literal values.
1951  std::vector<llvm::Type*> row_process_arg_types;
1952 
1953  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1954  E = cgen_state_->row_func_->arg_end();
1955  I != E;
1956  ++I) {
1957  row_process_arg_types.push_back(I->getType());
1958  }
1959 
1960  for (auto& element : cgen_state_->query_func_literal_loads_) {
1961  for (auto value : element.second) {
1962  row_process_arg_types.push_back(value->getType());
1963  }
1964  }
1965 
1966  auto ft = llvm::FunctionType::get(
1967  get_int_type(32, cgen_state_->context_), row_process_arg_types, false);
1968  auto row_func_with_hoisted_literals =
1969  llvm::Function::Create(ft,
1970  llvm::Function::ExternalLinkage,
1971  "row_func_hoisted_literals",
1972  cgen_state_->row_func_->getParent());
1973 
1974  auto row_func_arg_it = row_func_with_hoisted_literals->arg_begin();
1975  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1976  E = cgen_state_->row_func_->arg_end();
1977  I != E;
1978  ++I) {
1979  if (I->hasName()) {
1980  row_func_arg_it->setName(I->getName());
1981  }
1982  ++row_func_arg_it;
1983  }
1984 
1985  decltype(row_func_with_hoisted_literals) filter_func_with_hoisted_literals{nullptr};
1986  decltype(row_func_arg_it) filter_func_arg_it{nullptr};
1987  if (cgen_state_->filter_func_) {
1988  // filter_func_ uses literals whose defs have been hoisted up to row_func_;
1989  // extend the filter_func_ signature to include extra args to pass these literal values.
1990  std::vector<llvm::Type*> filter_func_arg_types;
1991 
1992  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
1993  E = cgen_state_->filter_func_->arg_end();
1994  I != E;
1995  ++I) {
1996  filter_func_arg_types.push_back(I->getType());
1997  }
1998 
1999  for (auto& element : cgen_state_->query_func_literal_loads_) {
2000  for (auto value : element.second) {
2001  filter_func_arg_types.push_back(value->getType());
2002  }
2003  }
2004 
2005  auto ft2 = llvm::FunctionType::get(
2006  get_int_type(32, cgen_state_->context_), filter_func_arg_types, false);
2007  filter_func_with_hoisted_literals =
2008  llvm::Function::Create(ft2,
2009  llvm::Function::ExternalLinkage,
2010  "filter_func_hoisted_literals",
2011  cgen_state_->filter_func_->getParent());
2012 
2013  filter_func_arg_it = filter_func_with_hoisted_literals->arg_begin();
2014  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2015  E = cgen_state_->filter_func_->arg_end();
2016  I != E;
2017  ++I) {
2018  if (I->hasName()) {
2019  filter_func_arg_it->setName(I->getName());
2020  }
2021  ++filter_func_arg_it;
2022  }
2023  }
2024 
2025  std::unordered_map<int, std::vector<llvm::Value*>>
2026  query_func_literal_loads_function_arguments,
2027  query_func_literal_loads_function_arguments2;
2028 
2029  for (auto& element : cgen_state_->query_func_literal_loads_) {
2030  std::vector<llvm::Value*> argument_values, argument_values2;
2031 
2032  for (auto value : element.second) {
2033  hoisted_literals.push_back(value);
2034  argument_values.push_back(&*row_func_arg_it);
2035  if (cgen_state_->filter_func_) {
2036  argument_values2.push_back(&*filter_func_arg_it);
2037  cgen_state_->filter_func_args_[&*row_func_arg_it] = &*filter_func_arg_it;
2038  }
2039  if (value->hasName()) {
2040  row_func_arg_it->setName("arg_" + value->getName());
2041  if (cgen_state_->filter_func_) {
2042  filter_func_arg_it->getContext();
2043  filter_func_arg_it->setName("arg_" + value->getName());
2044  }
2045  }
2046  ++row_func_arg_it;
2047  ++filter_func_arg_it;
2048  }
2049 
2050  query_func_literal_loads_function_arguments[element.first] = argument_values;
2051  query_func_literal_loads_function_arguments2[element.first] = argument_values2;
2052  }
2053 
2054  // copy the row_func function body over
2055  // see
2056  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
2057  row_func_with_hoisted_literals->getBasicBlockList().splice(
2058  row_func_with_hoisted_literals->begin(),
2059  cgen_state_->row_func_->getBasicBlockList());
2060 
2061  // also replace row_func arguments with the arguments from row_func_hoisted_literals
2062  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
2063  E = cgen_state_->row_func_->arg_end(),
2064  I2 = row_func_with_hoisted_literals->arg_begin();
2065  I != E;
2066  ++I) {
2067  I->replaceAllUsesWith(&*I2);
2068  I2->takeName(&*I);
2069  cgen_state_->filter_func_args_.replace(&*I, &*I2);
2070  ++I2;
2071  }
2072 
2073  cgen_state_->row_func_ = row_func_with_hoisted_literals;
2074 
2075  // and finally replace literal placeholders
2076  std::vector<llvm::Instruction*> placeholders;
2077  std::string prefix("__placeholder__literal_");
2078  for (auto it = llvm::inst_begin(row_func_with_hoisted_literals),
2079  e = llvm::inst_end(row_func_with_hoisted_literals);
2080  it != e;
2081  ++it) {
2082  if (it->hasName() && it->getName().startswith(prefix)) {
2083  auto offset_and_index_entry =
2084  cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
2085  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2086 
2087  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2088  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2089 
2090  it->replaceAllUsesWith(
2091  query_func_literal_loads_function_arguments[lit_off][lit_idx]);
2092  placeholders.push_back(&*it);
2093  }
2094  }
2095  for (auto placeholder : placeholders) {
2096  placeholder->removeFromParent();
2097  }
2098 
2099  if (cgen_state_->filter_func_) {
2100  // copy the filter_func function body over
2101  // see
2102  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
2103  filter_func_with_hoisted_literals->getBasicBlockList().splice(
2104  filter_func_with_hoisted_literals->begin(),
2105  cgen_state_->filter_func_->getBasicBlockList());
2106 
2107  // also replace filter_func arguments with the arguments from
2108  // filter_func_hoisted_literals
2109  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2110  E = cgen_state_->filter_func_->arg_end(),
2111  I2 = filter_func_with_hoisted_literals->arg_begin();
2112  I != E;
2113  ++I) {
2114  I->replaceAllUsesWith(&*I2);
2115  I2->takeName(&*I);
2116  ++I2;
2117  }
2118 
2119  cgen_state_->filter_func_ = filter_func_with_hoisted_literals;
2120 
2121  // and finally replace literal placeholders
2122  std::vector<llvm::Instruction*> placeholders;
2123  std::string prefix("__placeholder__literal_");
2124  for (auto it = llvm::inst_begin(filter_func_with_hoisted_literals),
2125  e = llvm::inst_end(filter_func_with_hoisted_literals);
2126  it != e;
2127  ++it) {
2128  if (it->hasName() && it->getName().startswith(prefix)) {
2129  auto offset_and_index_entry = cgen_state_->row_func_hoisted_literals_.find(
2130  llvm::dyn_cast<llvm::Value>(&*it));
2131  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2132 
2133  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2134  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2135 
2136  it->replaceAllUsesWith(
2137  query_func_literal_loads_function_arguments2[lit_off][lit_idx]);
2138  placeholders.push_back(&*it);
2139  }
2140  }
2141  for (auto placeholder : placeholders) {
2142  placeholder->removeFromParent();
2143  }
2144  }
2145 
2146  return hoisted_literals;
2147 }
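// Illustrative sketch of the net effect (argument names are examples only):
//
//   ; before: literal loads live in query_func; row_func sees only placeholders
//   i32 @row_func(i64* %out, ..., i64* %join_hash_tables)
//     %lit = ... @__placeholder__literal_...
//
//   ; after: one extra argument per hoisted literal load, placeholders removed
//   i32 @row_func_hoisted_literals(i64* %out, ..., i64* %join_hash_tables,
//                                  i64 %arg_lit0, double %arg_lit1)
//
// The returned hoisted_literals vector holds the query_func-side values that
// compileWorkUnit later appends to the row_func call arguments.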
2148 
2149 namespace {
2150 
2151 size_t get_shared_memory_size(const bool shared_mem_used,
2152  const QueryMemoryDescriptor* query_mem_desc_ptr) {
2153  return shared_mem_used
2154  ? (query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount())
2155  : 0;
2156 }
2157 
2158 bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor* query_mem_desc_ptr,
2159  const RelAlgExecutionUnit& ra_exe_unit,
2160  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
2161  const ExecutorDeviceType device_type,
2162  const unsigned gpu_blocksize,
2163  const unsigned num_blocks_per_mp) {
2164  if (device_type == ExecutorDeviceType::CPU) {
2165  return false;
2166  }
2167  CHECK(query_mem_desc_ptr);
2168  CHECK(cuda_mgr);
2169  if (query_mem_desc_ptr->didOutputColumnar()) {
2170  return false;
2171  }
2172  /*
2173  * We only use shared memory strategy if GPU hardware provides native shared
2174  * memory atomics support. From CUDA Toolkit documentation:
2175  * https://docs.nvidia.com/cuda/pascal-tuning-guide/index.html#atomic-ops "Like
2176  * Maxwell, Pascal [and Volta] provides native shared memory atomic operations
2177  * for 32-bit integer arithmetic, along with native 32 or 64-bit compare-and-swap
2178  * (CAS)."
2179  *
2180  **/
2181  if (!cuda_mgr->isArchMaxwellOrLaterForAll()) {
2182  return false;
2183  }
2184 
2185  if (query_mem_desc_ptr->getQueryDescriptionType() ==
2188  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty()) {
2189  // TODO: relax this, if necessary
2190  if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
2191  return false;
2192  }
2193  // skip shared memory usage when dealing with 1) variable length targets, 2)
2194  // aggregates other than COUNT
2195  const auto target_infos =
2196  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
2197  std::unordered_set<SQLAgg> supported_aggs{kCOUNT};
2198  if (std::find_if(target_infos.begin(),
2199  target_infos.end(),
2200  [&supported_aggs](const TargetInfo& ti) {
2201  if (ti.sql_type.is_varlen() ||
2202  !supported_aggs.count(ti.agg_kind)) {
2203  return true;
2204  } else {
2205  return false;
2206  }
2207  }) == target_infos.end()) {
2208  return true;
2209  }
2210  }
2211  if (query_mem_desc_ptr->getQueryDescriptionType() ==
2222  if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
2223  return false;
2224  }
2225 
2226  // Fundamentally, we should use shared memory whenever the output buffer
2227  // is small enough so that we can fit it in the shared memory and yet expect
2228  // good occupancy.
2229  // For now, we allow keyless, row-wise layout, and only for perfect hash
2230  // group by operations.
2231  if (query_mem_desc_ptr->hasKeylessHash() &&
2232  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty() &&
2233  !query_mem_desc_ptr->useStreamingTopN()) {
2234  const size_t shared_memory_threshold_bytes = std::min(
2236  cuda_mgr->getMinSharedMemoryPerBlockForAllDevices() / num_blocks_per_mp);
2237  const auto output_buffer_size =
2238  query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount();
2239  if (output_buffer_size > shared_memory_threshold_bytes) {
2240  return false;
2241  }
2242 
2243  // skip shared memory usage when dealing with 1) variable length targets, 2)
2244  // aggregates other than the basic ones (COUNT, SUM, MIN, MAX, AVG)
2245  // TODO: relax this if necessary
2246  const auto target_infos =
2247  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
2248  std::unordered_set<SQLAgg> supported_aggs{kCOUNT};
2250  supported_aggs = {kCOUNT, kMIN, kMAX, kSUM, kAVG};
2251  }
2252  if (std::find_if(target_infos.begin(),
2253  target_infos.end(),
2254  [&supported_aggs](const TargetInfo& ti) {
2255  if (ti.sql_type.is_varlen() ||
2256  !supported_aggs.count(ti.agg_kind)) {
2257  return true;
2258  } else {
2259  return false;
2260  }
2261  }) == target_infos.end()) {
2262  return true;
2263  }
2264  }
2265  }
2266  return false;
2267 }
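// Worked example with illustrative numbers: for a keyless perfect-hash group by
// with getRowSize() = 16 bytes and getEntryCount() = 512, the output buffer is
// 16 * 512 = 8 KB. With gpu_blocksize = 1024 (>= 512 entries),
// num_blocks_per_mp = 2 and 48 KB of shared memory per block reported by the
// CudaMgr, the threshold is at most 48 KB / 2 = 24 KB, so the 8 KB buffer
// qualifies, provided all targets are fixed-length and use supported aggregate
// kinds.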
2268 
2269 #ifndef NDEBUG
2270 std::string serialize_llvm_metadata_footnotes(llvm::Function* query_func,
2271  CgenState* cgen_state) {
2272  std::string llvm_ir;
2273  std::unordered_set<llvm::MDNode*> md;
2274 
2275  // Loop over all instructions in the query function.
2276  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
2277  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2278  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2279  instr_it->getAllMetadata(imd);
2280  for (auto [kind, node] : imd) {
2281  md.insert(node);
2282  }
2283  }
2284  }
2285 
2286  // Loop over all instructions in the row function.
2287  for (auto bb_it = cgen_state->row_func_->begin(); bb_it != cgen_state->row_func_->end();
2288  ++bb_it) {
2289  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2290  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2291  instr_it->getAllMetadata(imd);
2292  for (auto [kind, node] : imd) {
2293  md.insert(node);
2294  }
2295  }
2296  }
2297 
2298  // Loop over all instructions in the filter function.
2299  if (cgen_state->filter_func_) {
2300  for (auto bb_it = cgen_state->filter_func_->begin();
2301  bb_it != cgen_state->filter_func_->end();
2302  ++bb_it) {
2303  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2304  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2305  instr_it->getAllMetadata(imd);
2306  for (auto [kind, node] : imd) {
2307  md.insert(node);
2308  }
2309  }
2310  }
2311  }
2312 
2313  // Sort the metadata by canonical number and convert to text.
2314  if (!md.empty()) {
2315  std::map<size_t, std::string> sorted_strings;
2316  for (auto p : md) {
2317  std::string str;
2318  llvm::raw_string_ostream os(str);
2319  p->print(os, cgen_state->module_, true);
2320  os.flush();
2321  auto fields = split(str, {}, 1);
2322  if (fields.empty() || fields[0].empty()) {
2323  continue;
2324  }
2325  sorted_strings.emplace(std::stoul(fields[0].substr(1)), str);
2326  }
2327  llvm_ir += "\n";
2328  for (auto [id, text] : sorted_strings) {
2329  llvm_ir += text;
2330  llvm_ir += "\n";
2331  }
2332  }
2333 
2334  return llvm_ir;
2335 }
2336 #endif // NDEBUG
2337 
2338 } // namespace
2339 
2340 std::tuple<CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
2341 Executor::compileWorkUnit(const std::vector<InputTableInfo>& query_infos,
2342  const PlanState::DeletedColumnsMap& deleted_cols_map,
2343  const RelAlgExecutionUnit& ra_exe_unit,
2344  const CompilationOptions& co,
2345  const ExecutionOptions& eo,
2346  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
2347  const bool allow_lazy_fetch,
2348  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
2349  const size_t max_groups_buffer_entry_guess,
2350  const int8_t crt_min_byte_width,
2351  const bool has_cardinality_estimation,
2352  ColumnCacheMap& column_cache,
2353  RenderInfo* render_info) {
2354  auto timer = DEBUG_TIMER(__func__);
2355 
2357  const auto cuda_mgr = data_mgr_->getCudaMgr();
2358  if (!cuda_mgr) {
2359  throw QueryMustRunOnCpu();
2360  }
2361  }
2362 
2363 #ifndef NDEBUG
2364  static std::uint64_t counter = 0;
2365  ++counter;
2366  VLOG(1) << "CODEGEN #" << counter << ":";
2367  LOG(IR) << "CODEGEN #" << counter << ":";
2368  LOG(PTX) << "CODEGEN #" << counter << ":";
2369  LOG(ASM) << "CODEGEN #" << counter << ":";
2370 #endif
2371 
2372  nukeOldState(allow_lazy_fetch, query_infos, deleted_cols_map, &ra_exe_unit);
2373 
2374  addTransientStringLiterals(ra_exe_unit, row_set_mem_owner);
2375 
2376  GroupByAndAggregate group_by_and_aggregate(
2377  this,
2378  co.device_type,
2379  ra_exe_unit,
2380  query_infos,
2381  row_set_mem_owner,
2382  has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
2383  : std::nullopt);
2384  auto query_mem_desc =
2385  group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
2386  max_groups_buffer_entry_guess,
2387  crt_min_byte_width,
2388  render_info,
2390 
2391  if (query_mem_desc->getQueryDescriptionType() ==
2393  !has_cardinality_estimation &&
2394  (!render_info || !render_info->isPotentialInSituRender()) && !eo.just_explain) {
2395  const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
2396  throw CardinalityEstimationRequired(col_range_info.max - col_range_info.min);
2397  }
2398 
2399  const bool output_columnar = query_mem_desc->didOutputColumnar();
2400  const bool gpu_shared_mem_optimization =
2401  is_gpu_shared_mem_supported(query_mem_desc.get(),
2402  ra_exe_unit,
2403  cuda_mgr,
2404  co.device_type,
2405  cuda_mgr ? this->blockSize() : 1,
2406  cuda_mgr ? this->numBlocksPerMP() : 1);
2407  if (gpu_shared_mem_optimization) {
2408  // disable interleaved bins optimization on the GPU
2409  query_mem_desc->setHasInterleavedBinsOnGpu(false);
2410  LOG(DEBUG1) << "GPU shared memory is used for the " +
2411  query_mem_desc->queryDescTypeToString() + " query(" +
2412  std::to_string(get_shared_memory_size(gpu_shared_mem_optimization,
2413  query_mem_desc.get())) +
2414  " out of " + std::to_string(g_gpu_smem_threshold) + " bytes).";
2415  }
2416 
2417  const GpuSharedMemoryContext gpu_smem_context(
2418  get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));
2419 
2421  const size_t num_count_distinct_descs =
2422  query_mem_desc->getCountDistinctDescriptorsSize();
2423  for (size_t i = 0; i < num_count_distinct_descs; i++) {
2424  const auto& count_distinct_descriptor =
2425  query_mem_desc->getCountDistinctDescriptor(i);
2426  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::StdSet ||
2427  (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
2428  !co.hoist_literals)) {
2429  throw QueryMustRunOnCpu();
2430  }
2431  }
2432  }
2433 
2434  // Read the module template and target either CPU or GPU
2435  // by binding the stream position functions to the right implementation:
2436  // stride access for GPU, contiguous for CPU
2437  auto rt_module_copy = llvm::CloneModule(
2438  *g_rt_module.get(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
2439  auto func = llvm::dyn_cast<llvm::Function>(gv);
2440  if (!func) {
2441  return true;
2442  }
2443  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
2444  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage ||
2445  CodeGenerator::alwaysCloneRuntimeFunction(func));
2446  });
2447  if (co.device_type == ExecutorDeviceType::CPU) {
2448  if (is_udf_module_present(true)) {
2449  CodeGenerator::link_udf_module(udf_cpu_module, *rt_module_copy, cgen_state_.get());
2450  }
2451  if (is_rt_udf_module_present(true)) {
2452  CodeGenerator::link_udf_module(
2453  rt_udf_cpu_module, *rt_module_copy, cgen_state_.get());
2454  }
2455  } else {
2456  rt_module_copy->setDataLayout(get_gpu_data_layout());
2457  rt_module_copy->setTargetTriple(get_gpu_target_triple_string());
2458  if (is_udf_module_present()) {
2459  CodeGenerator::link_udf_module(udf_gpu_module, *rt_module_copy, cgen_state_.get());
2460  }
2461  if (is_rt_udf_module_present()) {
2462  CodeGenerator::link_udf_module(
2463  rt_udf_gpu_module, *rt_module_copy, cgen_state_.get());
2464  }
2465  }
2466 
2467  cgen_state_->module_ = rt_module_copy.release();
2468  AUTOMATIC_IR_METADATA(cgen_state_.get());
2469 
2470  auto agg_fnames =
2471  get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
2472 
2473  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();
2474 
2475  const bool is_group_by{query_mem_desc->isGroupBy()};
2476  auto [query_func, row_func_call] = is_group_by
2477  ? query_group_by_template(cgen_state_->module_,
2478  co.hoist_literals,
2479  *query_mem_desc,
2480  co.device_type,
2481  ra_exe_unit.scan_limit,
2482  gpu_smem_context)
2483  : query_template(cgen_state_->module_,
2484  agg_slot_count,
2485  co.hoist_literals,
2486  !!ra_exe_unit.estimator,
2487  gpu_smem_context);
2488  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
2489  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
2490  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);
2492  cgen_state_->query_func_ = query_func;
2493  cgen_state_->row_func_call_ = row_func_call;
2494  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
2495  &query_func->getEntryBlock().front());
2496 
2497  // Generate the function signature and column head fetches s.t.
2498  // double indirection isn't needed in the inner loop
2499  auto& fetch_bb = query_func->front();
2500  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
2501  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
2502  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
2503  query_func->args().begin(),
2504  fetch_ir_builder,
2505  cgen_state_->context_);
2506  CHECK_EQ(ra_exe_unit.input_col_descs.size(), col_heads.size());
2507 
2508  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
2509  is_group_by ? 0 : agg_slot_count,
2510  co.hoist_literals,
2511  cgen_state_->module_,
2512  cgen_state_->context_);
2513  CHECK(cgen_state_->row_func_);
2514  cgen_state_->row_func_bb_ =
2515  llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
2516 
2518  auto filter_func_ft =
2519  llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
2520  cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
2521  llvm::Function::ExternalLinkage,
2522  "filter_func",
2523  cgen_state_->module_);
2524  CHECK(cgen_state_->filter_func_);
2525  cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
2526  cgen_state_->context_, "entry", cgen_state_->filter_func_);
2527  }
2528 
2529  cgen_state_->current_func_ = cgen_state_->row_func_;
2530  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2531 
2532  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
2533  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
2534  const auto join_loops =
2535  buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
2536 
2537  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
2538  for (auto& simple_qual : ra_exe_unit.simple_quals) {
2539  plan_state_->addSimpleQual(simple_qual);
2540  }
2541  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
2542  if (is_not_deleted_bb) {
2543  cgen_state_->row_func_bb_ = is_not_deleted_bb;
2544  }
2545  if (!join_loops.empty()) {
2546  codegenJoinLoops(join_loops,
2547  body_execution_unit,
2548  group_by_and_aggregate,
2549  query_func,
2550  cgen_state_->row_func_bb_,
2551  *(query_mem_desc.get()),
2552  co,
2553  eo);
2554  } else {
2555  const bool can_return_error = compileBody(
2556  ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
2557  if (can_return_error || cgen_state_->needs_error_check_ || eo.with_dynamic_watchdog ||
2558  eo.allow_runtime_query_interrupt) {
2559  createErrorCheckControlFlow(query_func,
2560  eo.with_dynamic_watchdog,
2561  eo.allow_runtime_query_interrupt,
2562  co.device_type,
2563  group_by_and_aggregate.query_infos_);
2564  }
2565  }
2566  std::vector<llvm::Value*> hoisted_literals;
2567 
2568  if (co.hoist_literals) {
2569  VLOG(1) << "number of hoisted literals: "
2570  << cgen_state_->query_func_literal_loads_.size()
2571  << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
2572  << " bytes";
2573  }
2574 
2575  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
2576  // we have some hoisted literals...
2577  hoisted_literals = inlineHoistedLiterals();
2578  }
2579 
2580  // replace the row func placeholder call with the call to the actual row func
2581  std::vector<llvm::Value*> row_func_args;
2582  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumArgOperands(); ++i) {
2583  row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
2584  }
2585  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
2586  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
2587  // push hoisted literals arguments, if any
2588  row_func_args.insert(
2589  row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
2590  llvm::ReplaceInstWithInst(
2591  cgen_state_->row_func_call_,
2592  llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));
2593 
2594  // replace the filter func placeholder call with the call to the actual filter func
2595  if (cgen_state_->filter_func_) {
2596  std::vector<llvm::Value*> filter_func_args;
2597  for (auto arg_it = cgen_state_->filter_func_args_.begin();
2598  arg_it != cgen_state_->filter_func_args_.end();
2599  ++arg_it) {
2600  filter_func_args.push_back(arg_it->first);
2601  }
2602  llvm::ReplaceInstWithInst(
2603  cgen_state_->filter_func_call_,
2604  llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
2605  }
2606 
2607  // Aggregate
2608  plan_state_->init_agg_vals_ =
2609  init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);
2610 
2611  /*
2612  * If we have decided to use GPU shared memory (decision is not made here), then
2613  * we generate proper code for extra components that it needs (buffer initialization and
2614  * gpu reduction from shared memory to global memory). We then inject these functions
2615  * into the already compiled query_func (replacing the two placeholders, write_back_nop and
2616  * init_shared_mem_nop). The rest of the code should be as before (row_func, etc.).
2617  */
2618  if (gpu_smem_context.isSharedMemoryUsed()) {
2619  if (query_mem_desc->getQueryDescriptionType() ==
2621  GpuSharedMemCodeBuilder gpu_smem_code(
2622  cgen_state_->module_,
2623  cgen_state_->context_,
2624  *query_mem_desc,
2626  plan_state_->init_agg_vals_);
2627  gpu_smem_code.codegen();
2628  gpu_smem_code.injectFunctionsInto(query_func);
2629 
2630  // helper functions are used for caching purposes later
2631  cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
2632  cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
2633  LOG(IR) << gpu_smem_code.toString();
2634  }
2635  }
2636 
2637  auto multifrag_query_func = cgen_state_->module_->getFunction(
2638  "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
2639  CHECK(multifrag_query_func);
2640 
2642  insertErrorCodeChecker(
2643  multifrag_query_func, co.hoist_literals, eo.allow_runtime_query_interrupt);
2644  }
2645 
2646  bind_query(query_func,
2647  "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
2648  multifrag_query_func,
2649  cgen_state_->module_);
2650 
2651  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
2652  if (cgen_state_->filter_func_) {
2653  root_funcs.push_back(cgen_state_->filter_func_);
2654  }
2655  auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
2656  *cgen_state_->module_, root_funcs, {multifrag_query_func});
2657 
2658  // Always inline the row function and the filter function.
2659  // We don't want register spills in the inner loops.
2660  // LLVM seems to correctly free up alloca instructions
2661  // in these functions even when they are inlined.
2662  mark_function_always_inline(cgen_state_->row_func_);
2663  if (cgen_state_->filter_func_) {
2664  mark_function_always_inline(cgen_state_->filter_func_);
2665  }
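  // A minimal sketch of what a helper like mark_function_always_inline
  // presumably does (its actual implementation lives outside this file): tag
  // the function so that LLVM's AlwaysInliner pass inlines every call site.
  //
  //   void mark_function_always_inline(llvm::Function* func) {
  //     func->addFnAttr(llvm::Attribute::AlwaysInline);
  //   }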
2666 
2667 #ifndef NDEBUG
2668  // Add helpful metadata to the LLVM IR for debugging.
2669  AUTOMATIC_IR_METADATA_DONE();
2670 #endif
2671 
2672  // Serialize the important LLVM IR functions to text for SQL EXPLAIN.
2673  std::string llvm_ir;
2674  if (eo.just_explain) {
2676 #ifdef WITH_JIT_DEBUG
2677  throw std::runtime_error(
2678  "Explain optimized not available when JIT runtime debug symbols are enabled");
2679 #else
2680  // Note that we don't run the NVVM reflect pass here. Use LOG(IR) to get the
2681  // optimized IR after NVVM reflect
2682  llvm::legacy::PassManager pass_manager;
2683  optimize_ir(query_func, cgen_state_->module_, pass_manager, live_funcs, co);
2684 #endif // WITH_JIT_DEBUG
2685  }
2686  llvm_ir =
2687  serialize_llvm_object(multifrag_query_func) + serialize_llvm_object(query_func) +
2688  serialize_llvm_object(cgen_state_->row_func_) +
2689  (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2690  : "");
2691 
2692 #ifndef NDEBUG
2693  llvm_ir += serialize_llvm_metadata_footnotes(query_func, cgen_state_.get());
2694 #endif
2695  }
2696 
2697  LOG(IR) << "\n\n" << query_mem_desc->toString() << "\n";
2698  LOG(IR) << "IR for the "
2699  << (co.device_type == ExecutorDeviceType::CPU ? "CPU:\n" : "GPU:\n");
2700 #ifdef NDEBUG
2701  LOG(IR) << serialize_llvm_object(query_func)
2702  << serialize_llvm_object(cgen_state_->row_func_)
2703  << (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2704  : "")
2705  << "\nEnd of IR";
2706 #else
2707  LOG(IR) << serialize_llvm_object(cgen_state_->module_) << "\nEnd of IR";
2708 #endif
2709 
2710  // Run some basic validation checks on the LLVM IR before code is generated below.
2711  verify_function_ir(cgen_state_->row_func_);
2712  if (cgen_state_->filter_func_) {
2713  verify_function_ir(cgen_state_->filter_func_);
2714  }
2715 
2716  // Generate final native code from the LLVM IR.
2717  return std::make_tuple(
2718  CompilationResult{
2719  co.device_type == ExecutorDeviceType::CPU
2720  ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
2721  : optimizeAndCodegenGPU(query_func,
2722  multifrag_query_func,
2723  live_funcs,
2724  is_group_by || ra_exe_unit.estimator,
2725  cuda_mgr,
2726  co),
2727  cgen_state_->getLiterals(),
2728  output_columnar,
2729  llvm_ir,
2730  std::move(gpu_smem_context)},
2731  std::move(query_mem_desc));
2732 }
2733 
2734 void Executor::insertErrorCodeChecker(llvm::Function* query_func,
2735  bool hoist_literals,
2736  bool allow_runtime_query_interrupt) {
2737  auto query_stub_func_name =
2738  "query_stub" + std::string(hoist_literals ? "_hoisted_literals" : "");
2739  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
2740  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
2741  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
2742  continue;
2743  }
2744  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
2745  if (std::string(row_func_call.getCalledFunction()->getName()) ==
2746  query_stub_func_name) {
2747  auto next_inst_it = inst_it;
2748  ++next_inst_it;
2749  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
2750  auto& br_instr = bb_it->back();
2751  llvm::IRBuilder<> ir_builder(&br_instr);
2752  llvm::Value* err_lv = &*inst_it;
2753  auto error_check_bb =
2754  bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr), ".error_check");
2755  llvm::Value* error_code_arg = nullptr;
2756  auto arg_cnt = 0;
2757  for (auto arg_it = query_func->arg_begin(); arg_it != query_func->arg_end();
2758  arg_it++, ++arg_cnt) {
2759  // The multi_frag_* function has anonymous arguments, so we use the argument
2760  // offset explicitly to locate the "error_code" argument in its argument list.
2761  if (hoist_literals) {
2762  if (arg_cnt == 9) {
2763  error_code_arg = &*arg_it;
2764  break;
2765  }
2766  } else {
2767  if (arg_cnt == 8) {
2768  error_code_arg = &*arg_it;
2769  break;
2770  }
2771  }
2772  }
2773  CHECK(error_code_arg);
2774  llvm::Value* err_code = nullptr;
2775  if (allow_runtime_query_interrupt) {
2776  // decide the final error code, taking the interrupt status into account
2777  auto& check_interrupt_br_instr = bb_it->back();
2778  auto interrupt_check_bb = llvm::BasicBlock::Create(
2779  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
2780  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
2781  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
2782  cgen_state_->module_->getFunction("check_interrupt"), {});
2783  auto detected_error = interrupt_checker_ir_builder.CreateCall(
2784  cgen_state_->module_->getFunction("get_error_code"),
2785  std::vector<llvm::Value*>{error_code_arg});
2786  err_code = interrupt_checker_ir_builder.CreateSelect(
2787  detected_interrupt,
2788  cgen_state_->llInt(Executor::ERR_INTERRUPTED),
2789  detected_error);
2790  interrupt_checker_ir_builder.CreateBr(error_check_bb);
2791  llvm::ReplaceInstWithInst(&check_interrupt_br_instr,
2792  llvm::BranchInst::Create(interrupt_check_bb));
2793  ir_builder.SetInsertPoint(&br_instr);
2794  } else {
2795  // use the error code returned from row_func and skip the interrupt status check
2796  ir_builder.SetInsertPoint(&br_instr);
2797  err_code =
2798  ir_builder.CreateCall(cgen_state_->module_->getFunction("get_error_code"),
2799  std::vector<llvm::Value*>{error_code_arg});
2800  }
2801  err_lv = ir_builder.CreateICmp(
2802  llvm::ICmpInst::ICMP_NE, err_code, cgen_state_->llInt(0));
2803  auto error_bb = llvm::BasicBlock::Create(
2804  cgen_state_->context_, ".error_exit", query_func, new_bb);
2805  llvm::CallInst::Create(cgen_state_->module_->getFunction("record_error_code"),
2806  std::vector<llvm::Value*>{err_code, error_code_arg},
2807  "",
2808  error_bb);
2809  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
2810  llvm::ReplaceInstWithInst(&br_instr,
2811  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
2812  break;
2813  }
2814  }
2815  }
2816 }
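// Illustrative sketch, not part of the original source: the control flow that
// insertErrorCodeChecker() splices in after the query stub call is roughly
// equivalent to the plain C++ below. The *_stub helpers and kErrInterruptedStub
// are hypothetical stand-ins for the runtime functions get_error_code,
// check_interrupt, record_error_code and for Executor::ERR_INTERRUPTED.

#include <cstdint>

static bool check_interrupt_stub() { return false; }
static int32_t get_error_code_stub(const int32_t* error_code_arg) { return *error_code_arg; }
static void record_error_code_stub(int32_t err_code, int32_t* error_code_arg) {
  *error_code_arg = err_code;
}
static constexpr int32_t kErrInterruptedStub = -1;  // stand-in for ERR_INTERRUPTED

// Equivalent of the generated ".interrupt_check" -> ".error_check" -> ".error_exit"
// blocks spliced in after the call to query_stub[_hoisted_literals].
static void error_check_sketch(int32_t* error_code_arg,
                               bool allow_runtime_query_interrupt) {
  int32_t err_code;
  if (allow_runtime_query_interrupt) {
    // prefer the interrupt status over the error code returned by the row function
    err_code = check_interrupt_stub() ? kErrInterruptedStub
                                      : get_error_code_stub(error_code_arg);
  } else {
    err_code = get_error_code_stub(error_code_arg);
  }
  if (err_code != 0) {
    record_error_code_stub(err_code, error_code_arg);
    return;  // ".error_exit": leave the query function early
  }
  // otherwise fall through to the continuation block split off after the stub call
}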
2817 
2818 llvm::BasicBlock* Executor::codegenSkipDeletedOuterTableRow(
2819  const RelAlgExecutionUnit& ra_exe_unit,
2820  const CompilationOptions& co) {
2821  AUTOMATIC_IR_METADATA(cgen_state_.get());
2822  if (!co.filter_on_deleted_column) {
2823  return nullptr;
2824  }
2825  CHECK(!ra_exe_unit.input_descs.empty());
2826  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
2827  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
2828  return nullptr;
2829  }
2830  const auto deleted_cd =
2831  plan_state_->getDeletedColForTable(outer_input_desc.getTableId());
2832  if (!deleted_cd) {
2833  return nullptr;
2834  }
2835  CHECK(deleted_cd->columnType.is_boolean());
2836  const auto deleted_expr =
2837  makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
2838  outer_input_desc.getTableId(),
2839  deleted_cd->columnId,
2840  outer_input_desc.getNestLevel());
2841  CodeGenerator code_generator(this);
2842  const auto is_deleted =
2843  code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
2844  const auto is_deleted_bb = llvm::BasicBlock::Create(
2845  cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
2846  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
2847  cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
2848  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
2849  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
2850  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2851  cgen_state_->ir_builder_.SetInsertPoint(bb);
2852  return bb;
2853 }
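// Illustrative sketch, not part of the original source: the per-row effect of the
// "is_deleted" / "is_not_deleted" blocks generated above. The parameter stands in
// for the codegenned value of the table's hidden delete column.

#include <cstdint>

static int32_t row_func_prologue_sketch(bool row_is_deleted) {
  if (row_is_deleted) {
    return 0;  // "is_deleted": skip the row entirely, reporting no error
  }
  // "is_not_deleted": the rest of the row function (filters, aggregation) follows
  return 0;
}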
2854 
2855 bool Executor::compileBody(const RelAlgExecutionUnit& ra_exe_unit,
2856  GroupByAndAggregate& group_by_and_aggregate,
2857  const QueryMemoryDescriptor& query_mem_desc,
2858  const CompilationOptions& co,
2859  const GpuSharedMemoryContext& gpu_smem_context) {
2860  AUTOMATIC_IR_METADATA(cgen_state_.get());
2861 
2862  // Switch the code generation into a separate filter function if enabled.
2863  // Note that accesses to function arguments are still codegenned from the
2864  // row function's arguments, then later automatically forwarded and
2865  // remapped into filter function arguments by redeclareFilterFunction().
2866  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
2867  llvm::Value* loop_done{nullptr};
2868  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
2869  if (cgen_state_->filter_func_) {
2870  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2871  auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
2872  cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
2873  row_func_entry_bb->begin());
2874  loop_done = cgen_state_->ir_builder_.CreateAlloca(
2875  get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
2876  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2877  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
2878  }
2879  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
2880  cgen_state_->current_func_ = cgen_state_->filter_func_;
2881  fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
2882  }
2883 
2884  // generate the code for the filter
2885  std::vector<Analyzer::Expr*> primary_quals;
2886  std::vector<Analyzer::Expr*> deferred_quals;
2887  bool short_circuited = CodeGenerator::prioritizeQuals(
2888  ra_exe_unit, primary_quals, deferred_quals, plan_state_->hoisted_filters_);
2889  if (short_circuited) {
2890  VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
2891  << "short-circuited and deferred " << std::to_string(deferred_quals.size())
2892  << " quals";
2893  }
2894  llvm::Value* filter_lv = cgen_state_->llBool(true);
2895  CodeGenerator code_generator(this);
2896  for (auto expr : primary_quals) {
2897  // Generate the filter for primary quals
2898  auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
2899  filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
2900  }
2901  CHECK(filter_lv->getType()->isIntegerTy(1));
2902  llvm::BasicBlock* sc_false{nullptr};
2903  if (!deferred_quals.empty()) {
2904  auto sc_true = llvm::BasicBlock::Create(
2905  cgen_state_->context_, "sc_true", cgen_state_->current_func_);
2906  sc_false = llvm::BasicBlock::Create(
2907  cgen_state_->context_, "sc_false", cgen_state_->current_func_);
2908  cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
2909  cgen_state_->ir_builder_.SetInsertPoint(sc_false);
2910  if (ra_exe_unit.join_quals.empty()) {
2911  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
2912  }
2913  cgen_state_->ir_builder_.SetInsertPoint(sc_true);
2914  filter_lv = cgen_state_->llBool(true);
2915  }
2916  for (auto expr : deferred_quals) {
2917  filter_lv = cgen_state_->ir_builder_.CreateAnd(
2918  filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
2919  }
2920 
2921  CHECK(filter_lv->getType()->isIntegerTy(1));
2922  auto ret = group_by_and_aggregate.codegen(
2923  filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);
2924 
2925  // Switch the code generation back to the row function if a filter
2926  // function was enabled.
2927  if (cgen_state_->filter_func_) {
2928  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2929  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
2930  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2931  }
2932 
2933  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2934  cgen_state_->current_func_ = cgen_state_->row_func_;
2935  cgen_state_->filter_func_call_ =
2936  cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});
2937 
2938  // Create the real filter function declaration after the placeholder call
2939  // has been emitted.
2940  redeclareFilterFunction();
2941 
2942  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2943  auto loop_done_true = llvm::BasicBlock::Create(
2944  cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
2945  auto loop_done_false = llvm::BasicBlock::Create(
2946  cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
2947  auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(loop_done);
2948  cgen_state_->ir_builder_.CreateCondBr(
2949  loop_done_flag, loop_done_true, loop_done_false);
2950  cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
2951  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
2952  cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
2953  } else {
2954  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
2955  }
2956  }
2957  return ret;
2958 }
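// Illustrative sketch, not part of the original source: the runtime effect of the
// sc_true / sc_false short-circuit generated above, ignoring the join-quals case.
// Each qualifier is modeled as a hypothetical callable returning its boolean
// result for the current row.

#include <functional>
#include <vector>

static bool filter_short_circuit_sketch(
    const std::vector<std::function<bool()>>& primary_quals,
    const std::vector<std::function<bool()>>& deferred_quals) {
  bool filter = true;
  for (const auto& qual : primary_quals) {
    const bool qual_result = qual();  // always evaluated, like CreateAnd in the IR
    filter = filter && qual_result;
  }
  if (!deferred_quals.empty()) {
    if (!filter) {
      return false;  // sc_false: the deferred (expensive) quals are never evaluated
    }
    filter = true;  // sc_true: re-seed and evaluate the deferred quals
    for (const auto& qual : deferred_quals) {
      const bool qual_result = qual();
      filter = filter && qual_result;
    }
  }
  return filter;
}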
2959 
2960 std::unique_ptr<llvm::Module> runtime_module_shallow_copy(CgenState* cgen_state) {
2961  return llvm::CloneModule(
2962  *g_rt_module.get(), cgen_state->vmap_, [](const llvm::GlobalValue* gv) {
2963  auto func = llvm::dyn_cast<llvm::Function>(gv);
2964  if (!func) {
2965  return true;
2966  }
2967  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
2968  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage);
2969  });
2970 }
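// Illustrative sketch, not part of the original source: llvm::CloneModule keeps
// only a declaration for every function whose definition the ShouldCloneDefinition
// callback declines to clone, which is what makes the copy above "shallow". A
// predicate that always returns false therefore yields a module of pure
// declarations; the real copy keeps bodies only for private/internal helpers and
// resolves everything else against the already-compiled runtime module.

#include <cassert>

#include <llvm/IR/Function.h>
#include <llvm/IR/Module.h>
#include <llvm/Transforms/Utils/Cloning.h>
#include <llvm/Transforms/Utils/ValueMapper.h>

static void clone_module_semantics_sketch(const llvm::Module& runtime_module) {
  llvm::ValueToValueMapTy vmap;
  auto declarations_only = llvm::CloneModule(
      runtime_module, vmap, [](const llvm::GlobalValue*) { return false; });
  for (const auto& func : *declarations_only) {
    assert(func.isDeclaration());  // every function body was dropped by the callback
    (void)func;
  }
}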
2971 
2972 std::vector<llvm::Value*> generate_column_heads_load(const int num_columns,
2973  llvm::Value* byte_stream_arg,
2974  llvm::IRBuilder<>& ir_builder,
2975  llvm::LLVMContext& ctx) {
2976  CHECK(byte_stream_arg);
2977  const auto max_col_local_id = num_columns - 1;
2978 
2979  std::vector<llvm::Value*> col_heads;
2980  for (int col_id = 0; col_id <= max_col_local_id; ++col_id) {
2981  col_heads.emplace_back(ir_builder.CreateLoad(ir_builder.CreateGEP(
2982  byte_stream_arg, llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), col_id))));
2983  }
2984  return col_heads;
2985 }
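// Illustrative sketch, not part of the original source: at runtime the GEP + load
// sequence emitted above simply reads the per-column base pointers out of the
// fragment's byte_stream array, i.e. col_heads[i] = byte_stream[i]. The types here
// are hypothetical simplifications.

#include <cstdint>
#include <vector>

static std::vector<const int8_t*> column_heads_sketch(const int8_t* const* byte_stream,
                                                      int num_columns) {
  std::vector<const int8_t*> col_heads;
  col_heads.reserve(num_columns);
  for (int col_id = 0; col_id < num_columns; ++col_id) {
    col_heads.push_back(byte_stream[col_id]);  // one base pointer per input column
  }
  return col_heads;
}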
2986 