OmniSciDB  f17484ade4
NativeCodegen.cpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "QueryEngine/Execute.h"
18 
19 #if LLVM_VERSION_MAJOR < 9
20 static_assert(false, "LLVM Version >= 9 is required.");
21 #endif
22 
23 #include <llvm/Analysis/ScopedNoAliasAA.h>
24 #include <llvm/Analysis/TypeBasedAliasAnalysis.h>
25 #include <llvm/Bitcode/BitcodeReader.h>
26 #include <llvm/Bitcode/BitcodeWriter.h>
27 #include <llvm/ExecutionEngine/MCJIT.h>
28 #include <llvm/IR/Attributes.h>
29 #include <llvm/IR/GlobalValue.h>
30 #include <llvm/IR/InstIterator.h>
31 #include <llvm/IR/IntrinsicInst.h>
32 #include <llvm/IR/Intrinsics.h>
33 #include <llvm/IR/LegacyPassManager.h>
34 #include <llvm/IR/Verifier.h>
35 #include <llvm/IRReader/IRReader.h>
36 #if 14 <= LLVM_VERSION_MAJOR
37 #include <llvm/MC/TargetRegistry.h>
38 #else
39 #include <llvm/Support/TargetRegistry.h>
40 #endif
41 #include <llvm/Support/Casting.h>
42 #include <llvm/Support/FileSystem.h>
43 #include <llvm/Support/FormattedStream.h>
44 #include <llvm/Support/MemoryBuffer.h>
45 #include <llvm/Support/SourceMgr.h>
46 #include <llvm/Support/TargetSelect.h>
47 #include <llvm/Support/raw_os_ostream.h>
48 #include <llvm/Support/raw_ostream.h>
49 #include <llvm/Transforms/IPO.h>
50 #include <llvm/Transforms/IPO/AlwaysInliner.h>
51 #include <llvm/Transforms/IPO/InferFunctionAttrs.h>
52 #include <llvm/Transforms/IPO/PassManagerBuilder.h>
53 #include <llvm/Transforms/InstCombine/InstCombine.h>
54 #include <llvm/Transforms/Instrumentation.h>
55 #include <llvm/Transforms/Scalar.h>
56 #include <llvm/Transforms/Scalar/GVN.h>
57 #include <llvm/Transforms/Scalar/InstSimplifyPass.h>
58 #include <llvm/Transforms/Utils.h>
59 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
60 #include <llvm/Transforms/Utils/Cloning.h>
61 
62 #if LLVM_VERSION_MAJOR >= 11
63 #include <llvm/Support/Host.h>
64 #endif
65 
66 #include "CudaMgr/CudaMgr.h"
78 #include "Shared/MathUtils.h"
79 #include "StreamingTopN.h"
80 
82 
83 #ifdef ENABLE_GEOS
84 
85 #include <llvm/Support/DynamicLibrary.h>
86 
87 // from Geospatial/GeosValidation.cpp
88 extern std::unique_ptr<std::string> g_libgeos_so_filename;
89 
90 static llvm::sys::DynamicLibrary geos_dynamic_library;
91 static std::mutex geos_init_mutex;
92 
93 namespace {
94 
95 void load_geos_dynamic_library() {
96  std::lock_guard<std::mutex> guard(geos_init_mutex);
97 
98  if (!geos_dynamic_library.isValid()) {
99  if (!g_libgeos_so_filename || g_libgeos_so_filename->empty()) {
100  LOG(WARNING) << "Misconfigured GEOS library file name, trying 'libgeos_c.so'";
101  g_libgeos_so_filename.reset(new std::string("libgeos_c.so"));
102  }
103  auto filename = *g_libgeos_so_filename;
104  std::string error_message;
105  geos_dynamic_library =
106  llvm::sys::DynamicLibrary::getPermanentLibrary(filename.c_str(), &error_message);
107  if (!geos_dynamic_library.isValid()) {
108  LOG(ERROR) << "Failed to load GEOS library '" + filename + "'";
109  std::string exception_message = "Failed to load GEOS library: " + error_message;
110  throw std::runtime_error(exception_message);
111  } else {
112  LOG(INFO) << "Loaded GEOS library '" + filename + "'";
113  }
114  }
115 }
116 
117 } // namespace
118 #endif
119 
120 namespace {
121 
122 void throw_parseIR_error(const llvm::SMDiagnostic& parse_error,
123  std::string src = "",
124  const bool is_gpu = false) {
125  std::string excname = (is_gpu ? "NVVM IR ParseError: " : "LLVM IR ParseError: ");
126  llvm::raw_string_ostream ss(excname);
127  parse_error.print(src.c_str(), ss, false, false);
128  throw ParseIRError(ss.str());
129 }
130 
131 /* SHOW_DEFINED(<llvm::Module instance>) prints the function names
132  that are defined in the given LLVM Module instance.
133 
134  SHOW_FUNCTIONS(<llvm::Module instance>) prints the function names
135  of all used functions in the given LLVM Module
136  instance. Declarations are marked with `[decl]` as a name suffix.
137 
138  Useful for debugging.
139 */
140 
141 #define SHOW_DEFINED(MODULE) \
142  { \
143  std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
144  ::show_defined(MODULE); \
145  }
146 
147 #define SHOW_FUNCTIONS(MODULE) \
148  { \
149  std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
150  ::show_functions(MODULE); \
151  }
152 
153 template <typename T = void>
154 void show_defined(llvm::Module& llvm_module) {
155  std::cout << "defines: ";
156  for (auto& f : llvm_module.getFunctionList()) {
157  if (!f.isDeclaration()) {
158  std::cout << f.getName().str() << ", ";
159  }
160  }
161  std::cout << std::endl;
162 }
163 
164 template <typename T = void>
165 void show_defined(llvm::Module* llvm_module) {
166  if (llvm_module == nullptr) {
167  std::cout << "is null" << std::endl;
168  } else {
169  show_defined(*llvm_module);
170  }
171 }
172 
173 template <typename T = void>
174 void show_defined(std::unique_ptr<llvm::Module>& llvm_module) {
175  show_defined(llvm_module.get());
176 }
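// A hypothetical debugging snippet (not part of any compiled code path) showing how
// the macros above are meant to be used; `llvm_module` stands for any module under
// inspection and `show_functions` is the companion helper described in the comment
// above:
//
//   std::unique_ptr<llvm::Module> llvm_module = ...;
//   SHOW_DEFINED(llvm_module);    // prints "<func>#<line>: llvm_module defines: ..."
//   SHOW_FUNCTIONS(llvm_module);  // also lists declarations, suffixed with [decl]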
177 
178 /*
179  scan_function_calls(module, defined, undefined, ignored) computes
180  defined and undefined sets of function names:
181 
182  - defined functions are those that are defined in the given module
183 
184  - undefined functions are those that are called by defined functions
185  but that are not defined in the given module
186 
187  - ignored functions are functions that may be undefined but will not
188  be listed in the set of undefined functions.
189 
190  Useful for debugging.
191 */
192 template <typename T = void>
193 void scan_function_calls(llvm::Function& F,
194  std::unordered_set<std::string>& defined,
195  std::unordered_set<std::string>& undefined,
196  const std::unordered_set<std::string>& ignored) {
197  for (llvm::inst_iterator I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
198  if (auto* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
199  auto* F2 = CI->getCalledFunction();
200  if (F2 != nullptr) {
201  auto F2name = F2->getName().str();
202  if (F2->isDeclaration()) {
203  if (F2name.rfind("__", 0) !=
204  0 // assume symbols with double underscore are defined
205  && F2name.rfind("llvm.", 0) !=
206  0 // TODO: this may give false positive for NVVM intrinsics
207  && ignored.find(F2name) == ignored.end() // not in ignored list
208  ) {
209  undefined.emplace(F2name);
210  }
211  } else {
212  if (defined.find(F2name) == defined.end()) {
213  defined.emplace(F2name);
214  scan_function_calls<T>(*F2, defined, undefined, ignored);
215  }
216  }
217  }
218  }
219  }
220 }
221 
222 template <typename T = void>
223 void scan_function_calls(llvm::Module& llvm_module,
224  std::unordered_set<std::string>& defined,
225  std::unordered_set<std::string>& undefined,
226  const std::unordered_set<std::string>& ignored) {
227  for (auto& F : llvm_module) {
228  if (!F.isDeclaration()) {
229  scan_function_calls(F, defined, undefined, ignored);
230  }
231  }
232 }
233 
234 template <typename T = void>
235 std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>>
236 scan_function_calls(llvm::Module& llvm_module,
237  const std::unordered_set<std::string>& ignored = {}) {
238  std::unordered_set<std::string> defined, undefined;
239  scan_function_calls(llvm_module, defined, undefined, ignored);
240  return std::make_tuple(defined, undefined);
241 }
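// A minimal usage sketch for the scanner above (hypothetical call site; `M` stands
// for any populated llvm::Module). Useful when chasing unresolved symbols after
// linking UDF or runtime modules:
//
//   auto [defined, undefined] = scan_function_calls(M);
//   for (const auto& name : undefined) {
//     std::cout << "undefined function referenced: " << name << std::endl;
//   }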
242 
243 #if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
244 void eliminate_dead_self_recursive_funcs(
245  llvm::Module& M,
246  const std::unordered_set<llvm::Function*>& live_funcs) {
247  std::vector<llvm::Function*> dead_funcs;
248  for (auto& F : M) {
249  bool bAlive = false;
250  if (live_funcs.count(&F)) {
251  continue;
252  }
253  for (auto U : F.users()) {
254  auto* C = llvm::dyn_cast<const llvm::CallInst>(U);
255  if (!C || C->getParent()->getParent() != &F) {
256  bAlive = true;
257  break;
258  }
259  }
260  if (!bAlive) {
261  dead_funcs.push_back(&F);
262  }
263  }
264  for (auto pFn : dead_funcs) {
265  pFn->eraseFromParent();
266  }
267 }
268 
269 #ifdef HAVE_CUDA
270 
271 // check if linking with libdevice is required
272 // libdevice functions have a __nv_* prefix
273 bool check_module_requires_libdevice(llvm::Module* llvm_module) {
274  auto timer = DEBUG_TIMER(__func__);
275  for (llvm::Function& F : *llvm_module) {
276  if (F.hasName() && F.getName().startswith("__nv_")) {
277  LOG(INFO) << "Module requires linking with libdevice: " << std::string(F.getName());
278  return true;
279  }
280  }
281  LOG(DEBUG1) << "module does not require linking against libdevice";
282  return false;
283 }
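// For reference, a module trips the check above whenever its IR references a
// libdevice symbol, e.g. a declaration such as (illustrative example):
//
//   declare double @__nv_exp(double)
//
// Only the "__nv_" name prefix matters here; the function body is supplied later by
// linking libdevice.10.bc.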
284 
285 // Adds the missing intrinsics declarations to the given module
286 void add_intrinsics_to_module(llvm::Module* llvm_module) {
287  for (llvm::Function& F : *llvm_module) {
288  for (llvm::Instruction& I : instructions(F)) {
289  if (llvm::IntrinsicInst* ii = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
290  if (llvm::Intrinsic::isOverloaded(ii->getIntrinsicID())) {
291  llvm::Type* Tys[] = {ii->getFunctionType()->getReturnType()};
292  llvm::Function& decl_fn =
293  *llvm::Intrinsic::getDeclaration(llvm_module, ii->getIntrinsicID(), Tys);
294  ii->setCalledFunction(&decl_fn);
295  } else {
296  // inserts the declaration into the module if not present
297  llvm::Intrinsic::getDeclaration(llvm_module, ii->getIntrinsicID());
298  }
299  }
300  }
301  }
302 }
303 
304 #endif
305 
306 void optimize_ir(llvm::Function* query_func,
307  llvm::Module* llvm_module,
308  llvm::legacy::PassManager& pass_manager,
309  const std::unordered_set<llvm::Function*>& live_funcs,
310  const bool is_gpu_smem_used,
311  const CompilationOptions& co) {
312  auto timer = DEBUG_TIMER(__func__);
313  // the always inliner legacy pass must always run first
314  pass_manager.add(llvm::createVerifierPass());
315  pass_manager.add(llvm::createAlwaysInlinerLegacyPass());
316 
317  pass_manager.add(new AnnotateInternalFunctionsPass());
318 
319  pass_manager.add(llvm::createSROAPass());
320  // mem ssa drops unused load and store instructions, e.g. passing variables directly
321  // where possible
322  pass_manager.add(
323  llvm::createEarlyCSEPass(/*enable_mem_ssa=*/true)); // Catch trivial redundancies
324 
325  if (!is_gpu_smem_used) {
326  // thread jumps can change the execution order around SMEM sections guarded by
327  // `__syncthreads()`, which results in race conditions. For now, disable jump
328  // threading for shared memory queries. In the future, consider handling shared
329  // memory aggregations with a separate kernel launch
330  pass_manager.add(llvm::createJumpThreadingPass()); // Thread jumps.
331  }
332  pass_manager.add(llvm::createCFGSimplificationPass());
333 
334  // remove load/stores in PHIs if instructions can be accessed directly post thread jumps
335  pass_manager.add(llvm::createNewGVNPass());
336 
337  pass_manager.add(llvm::createDeadStoreEliminationPass());
338  pass_manager.add(llvm::createLICMPass());
339 
340  pass_manager.add(llvm::createInstructionCombiningPass());
341 
342  // module passes
343  pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
344  pass_manager.add(llvm::createGlobalOptimizerPass());
345 
346  pass_manager.add(llvm::createCFGSimplificationPass()); // cleanup after everything
347 
348  pass_manager.run(*llvm_module);
349 
350  eliminate_dead_self_recursive_funcs(*llvm_module, live_funcs);
351 }
352 #endif
353 
354 } // namespace
355 
357 
358 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine)
359  : execution_engine_(execution_engine) {}
360 
361 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine,
362  const CompilationOptions& co)
363  : execution_engine_(execution_engine) {
364  if (execution_engine_) {
365  if (co.register_intel_jit_listener) {
366 #ifdef ENABLE_INTEL_JIT_LISTENER
367  intel_jit_listener_.reset(llvm::JITEventListener::createIntelJITEventListener());
369  execution_engine_->RegisterJITEventListener(intel_jit_listener_.get());
370  LOG(INFO) << "Registered IntelJITEventListener";
371 #else
372  LOG(WARNING) << "This build is not Intel JIT Listener enabled. Ignoring Intel JIT "
373  "listener configuration parameter.";
374 #endif // ENABLE_INTEL_JIT_LISTENER
375  }
376  }
377 }
378 
379 ExecutionEngineWrapper& ExecutionEngineWrapper::operator=(
380  llvm::ExecutionEngine* execution_engine) {
381  execution_engine_.reset(execution_engine);
382  intel_jit_listener_ = nullptr;
383  return *this;
384 }
385 
386 void verify_function_ir(const llvm::Function* func) {
387  std::stringstream err_ss;
388  llvm::raw_os_ostream err_os(err_ss);
389  err_os << "\n-----\n";
390  if (llvm::verifyFunction(*func, &err_os)) {
391  err_os << "\n-----\n";
392  func->print(err_os, nullptr);
393  err_os << "\n-----\n";
394  LOG(FATAL) << err_ss.str();
395  }
396 }
397 
398 namespace {
399 
400 std::string assemblyForCPU(ExecutionEngineWrapper& execution_engine,
401  llvm::Module* llvm_module) {
402  llvm::legacy::PassManager pass_manager;
403  auto cpu_target_machine = execution_engine->getTargetMachine();
404  CHECK(cpu_target_machine);
405  llvm::SmallString<256> code_str;
406  llvm::raw_svector_ostream os(code_str);
407 #if LLVM_VERSION_MAJOR >= 10
408  cpu_target_machine->addPassesToEmitFile(
409  pass_manager, os, nullptr, llvm::CGFT_AssemblyFile);
410 #else
411  cpu_target_machine->addPassesToEmitFile(
412  pass_manager, os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
413 #endif
414  pass_manager.run(*llvm_module);
415  return "Assembly for the CPU:\n" + std::string(code_str.str()) + "\nEnd of assembly";
416 }
417 
418 ExecutionEngineWrapper create_execution_engine(llvm::Module* llvm_module,
419  llvm::EngineBuilder& eb,
420  const CompilationOptions& co) {
421  auto timer = DEBUG_TIMER(__func__);
422  ExecutionEngineWrapper execution_engine(eb.create(), co);
423  CHECK(execution_engine.get());
424  // Force the module data layout to match the layout for the selected target
425  llvm_module->setDataLayout(execution_engine->getDataLayout());
426 
427  LOG(ASM) << assemblyForCPU(execution_engine, llvm_module);
428 
429  execution_engine->finalizeObject();
430  return execution_engine;
431 }
432 
433 } // namespace
434 
436 
437 ExecutionEngineWrapper CodeGenerator::generateNativeCPUCode(
438  llvm::Function* func,
439  const std::unordered_set<llvm::Function*>& live_funcs,
440  const CompilationOptions& co) {
441  auto timer = DEBUG_TIMER(__func__);
442  llvm::Module* llvm_module = func->getParent();
443  CHECK(llvm_module);
444  // run optimizations
445 #ifndef WITH_JIT_DEBUG
446  llvm::legacy::PassManager pass_manager;
447  optimize_ir(
448  func, llvm_module, pass_manager, live_funcs, /*is_gpu_smem_used=*/false, co);
449 #endif // WITH_JIT_DEBUG
450 
451  // The following lock avoids a data race in two places:
452  // 1) in initialization of the CPU backend targets
453  // 2) in llvm::sys::DynamicLibrary::getPermanentLibrary and
454  // GDBJITRegistrationListener::notifyObjectLoaded while creating a
455  // new ExecutionEngine instance in the child call create_execution_engine.
456 
457  // Todo: Initialize backend CPU (and perhaps GPU?) targets at startup
458  // instead of for every compilation, and see if we can reduce
459  // the scope of the below lock
460 
461  std::lock_guard<std::mutex> lock(initialize_cpu_backend_mutex_);
462  auto init_err = llvm::InitializeNativeTarget();
463  CHECK(!init_err);
464 
465  llvm::InitializeAllTargetMCs();
466  llvm::InitializeNativeTargetAsmPrinter();
467  llvm::InitializeNativeTargetAsmParser();
468 
469  std::string err_str;
470  std::unique_ptr<llvm::Module> owner(llvm_module);
471  CHECK(owner);
472  llvm::EngineBuilder eb(std::move(owner));
473  eb.setErrorStr(&err_str);
474  eb.setEngineKind(llvm::EngineKind::JIT);
475  llvm::TargetOptions to;
476  to.EnableFastISel = true;
477  eb.setTargetOptions(to);
478  if (co.opt_level == ExecutorOptLevel::ReductionJIT) {
479  eb.setOptLevel(llvm::CodeGenOpt::None);
480  }
481 
482  return create_execution_engine(llvm_module, eb, co);
483 }
484 
485 std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenCPU(
486  llvm::Function* query_func,
487  llvm::Function* multifrag_query_func,
488  const std::unordered_set<llvm::Function*>& live_funcs,
489  const CompilationOptions& co) {
490  CodeCacheKey key{serialize_llvm_object(query_func),
491  serialize_llvm_object(cgen_state_->row_func_)};
492 
493  llvm::Module* M = query_func->getParent();
494  auto* flag = llvm::mdconst::extract_or_null<llvm::ConstantInt>(
495  M->getModuleFlag("manage_memory_buffer"));
496  if (flag and flag->getZExtValue() == 1 and M->getFunction("allocate_varlen_buffer") and
497  M->getFunction("register_buffer_with_executor_rsm")) {
498  LOG(INFO) << "including executor addr to cache key\n";
499  key.push_back(std::to_string(reinterpret_cast<int64_t>(this)));
500  }
501  if (cgen_state_->filter_func_) {
502  key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
503  }
504  for (const auto helper : cgen_state_->helper_functions_) {
505  key.push_back(serialize_llvm_object(helper));
506  }
507  auto cached_code = QueryEngine::getInstance()->cpu_code_accessor->get_value(key);
508  if (cached_code) {
509  return cached_code;
510  }
511  if (cgen_state_->needs_geos_) {
512 #ifdef ENABLE_GEOS
513  auto llvm_module = multifrag_query_func->getParent();
514  load_geos_dynamic_library();
515 
516  // Read geos runtime module and bind GEOS API function references to GEOS library
517  auto rt_geos_module_copy = llvm::CloneModule(
518  *get_geos_module(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
519  auto func = llvm::dyn_cast<llvm::Function>(gv);
520  if (!func) {
521  return true;
522  }
523  switch (func->getLinkage()) {
524  case llvm::GlobalValue::LinkageTypes::InternalLinkage:
525  case llvm::GlobalValue::LinkageTypes::PrivateLinkage:
526  case llvm::GlobalValue::LinkageTypes::ExternalLinkage:
527  case llvm::GlobalValue::LinkageTypes::LinkOnceODRLinkage:
528  return true;
529  default:
530  return false;
531  }
532  });
533  CodeGenerator::link_udf_module(rt_geos_module_copy,
534  *llvm_module,
535  cgen_state_.get(),
536  llvm::Linker::Flags::LinkOnlyNeeded);
537 #else
538  throw std::runtime_error("GEOS is disabled in this build");
539 #endif
540  }
541 
542  auto execution_engine =
543  CodeGenerator::generateNativeCPUCode(query_func, live_funcs, co);
544  auto cpu_compilation_context =
545  std::make_shared<CpuCompilationContext>(std::move(execution_engine));
546  cpu_compilation_context->setFunctionPointer(multifrag_query_func);
547  QueryEngine::getInstance()->cpu_code_accessor->put(key, cpu_compilation_context);
548  return std::dynamic_pointer_cast<CompilationContext>(cpu_compilation_context);
549 }
550 
551 void CodeGenerator::link_udf_module(const std::unique_ptr<llvm::Module>& udf_module,
552  llvm::Module& llvm_module,
553  CgenState* cgen_state,
554  llvm::Linker::Flags flags) {
555  auto timer = DEBUG_TIMER(__func__);
556  // throw a runtime error if the target module contains functions
557  // with the same names as those in the UDF module.
558  for (auto& f : *udf_module) {
559  auto func = llvm_module.getFunction(f.getName());
560  if (!(func == nullptr) && !f.isDeclaration() && flags == llvm::Linker::Flags::None) {
561  LOG(ERROR) << " Attempt to overwrite " << f.getName().str() << " in "
562  << llvm_module.getModuleIdentifier() << " from `"
563  << udf_module->getModuleIdentifier() << "`" << std::endl;
564  throw std::runtime_error(
565  "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
566  "function ***");
567  } else {
568  VLOG(1) << " Adding " << f.getName().str() << " to "
569  << llvm_module.getModuleIdentifier() << " from `"
570  << udf_module->getModuleIdentifier() << "`" << std::endl;
571  }
572  }
573 
574  auto udf_module_copy = llvm::CloneModule(*udf_module, cgen_state->vmap_);
575 
576  udf_module_copy->setDataLayout(llvm_module.getDataLayout());
577  udf_module_copy->setTargetTriple(llvm_module.getTargetTriple());
578 
579  // Initialize linker with module for RuntimeFunctions.bc
580  llvm::Linker ld(llvm_module);
581  bool link_error = false;
582 
583  link_error = ld.linkInModule(std::move(udf_module_copy), flags);
584 
585  if (link_error) {
586  throw std::runtime_error("link_udf_module: *** error linking module ***");
587  }
588 }
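// A hedged usage sketch (names below are illustrative, not from this file): callers
// such as the GEOS path above pass LinkOnlyNeeded so that only referenced symbols are
// pulled in, while the default Flags::None refuses to overwrite runtime functions, as
// enforced by the check above.
//
//   std::unique_ptr<llvm::Module> udf_module = /* parsed UDF bitcode */;
//   CodeGenerator::link_udf_module(udf_module,
//                                  *target_module,
//                                  cgen_state,
//                                  llvm::Linker::Flags::LinkOnlyNeeded);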
589 
590 namespace {
591 
592 std::string cpp_to_llvm_name(const std::string& s) {
593  if (s == "int8_t") {
594  return "i8";
595  }
596  if (s == "int16_t") {
597  return "i16";
598  }
599  if (s == "int32_t") {
600  return "i32";
601  }
602  if (s == "int64_t") {
603  return "i64";
604  }
605  CHECK(s == "float" || s == "double");
606  return s;
607 }
608 
609 std::string gen_array_any_all_sigs() {
610  std::string result;
611  for (const std::string any_or_all : {"any", "all"}) {
612  for (const std::string elem_type :
613  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
614  for (const std::string needle_type :
615  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
616  for (const std::string op_name : {"eq", "ne", "lt", "le", "gt", "ge"}) {
617  result += ("declare i1 @array_" + any_or_all + "_" + op_name + "_" + elem_type +
618  "_" + needle_type + "(i8*, i64, " + cpp_to_llvm_name(needle_type) +
619  ", " + cpp_to_llvm_name(elem_type) + ");\n");
620  }
621  }
622  }
623  }
624  return result;
625 }
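// One concrete declaration emitted by the generator above, for an "any/gt" probe of
// an int32_t array with a double needle:
//
//   declare i1 @array_any_gt_int32_t_double(i8*, i64, double, i32);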
626 
627 std::string gen_translate_null_key_sigs() {
628  std::string result;
629  for (const std::string key_type : {"int8_t", "int16_t", "int32_t", "int64_t"}) {
630  const auto key_llvm_type = cpp_to_llvm_name(key_type);
631  result += "declare i64 @translate_null_key_" + key_type + "(" + key_llvm_type + ", " +
632  key_llvm_type + ", i64);\n";
633  }
634  return result;
635 }
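// Example of a declaration generated above for a 32-bit key type:
//
//   declare i64 @translate_null_key_int32_t(i32, i32, i64);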
636 
637 const std::string cuda_rt_decls =
638  R"( declare void @llvm.dbg.declare(metadata, metadata, metadata) declare void @llvm.dbg.value(metadata, metadata, metadata) declare double @llvm.fmuladd.f64(double, double, double) declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind declare i64 @get_thread_index(); declare i64 @get_block_index(); declare i32 @pos_start_impl(i32*); declare i32 @group_buff_idx_impl(); declare i32 @pos_step_impl(); declare i8 @thread_warp_idx(i8); declare i64* @init_shared_mem(i64*, i32); declare i64* @init_shared_mem_nop(i64*, i32); declare i64* @declare_dynamic_shared_memory(); declare void @write_back_nop(i64*, i64*, i32); declare void @write_back_non_grouped_agg(i64*, i64*, i32); declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8); declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32); declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32); declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32); declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32); declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32); declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32); declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64); declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64); declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64); declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64); declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64); declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double); declare i64 @get_bucket_key_for_range_double(i8*, i64, double); declare i32 @get_num_buckets_for_bounds(i8*, i32, double, double); declare i64 @get_candidate_rows(i32*, i32, i8*, i32, double, double, i32, i64, i64*, i64, i64, i64); declare i64 @agg_count_shared(i64*, i64); declare i64 @agg_count_skip_val_shared(i64*, i64, i64); declare i32 @agg_count_int32_shared(i32*, i32); declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32); declare i64 @agg_count_double_shared(i64*, double); declare i64 @agg_count_double_skip_val_shared(i64*, double, double); declare i32 @agg_count_float_shared(i32*, float); declare i32 @agg_count_float_skip_val_shared(i32*, float, float); declare i64 @agg_count_if_shared(i64*, i64); declare i64 @agg_count_if_skip_val_shared(i64*, i64, i64); declare i32 @agg_count_if_int32_shared(i32*, i32); declare i32 @agg_count_if_int32_skip_val_shared(i32*, i32, i32); declare i64 @agg_sum_shared(i64*, i64); declare i64 @agg_sum_skip_val_shared(i64*, i64, i64); declare i32 @agg_sum_int32_shared(i32*, i32); declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32); declare void @agg_sum_double_shared(i64*, double); declare void @agg_sum_double_skip_val_shared(i64*, double, double); declare void @agg_sum_float_shared(i32*, float); declare void @agg_sum_float_skip_val_shared(i32*, float, float); declare i64 @agg_sum_if_shared(i64*, i64, i8); declare i64 @agg_sum_if_skip_val_shared(i64*, i64, i64, i8); declare i32 @agg_sum_if_int32_shared(i32*, i32, i8); declare i32 @agg_sum_if_int32_skip_val_shared(i32*, i32, i32, i8); declare void @agg_sum_if_double_shared(i64*, double, i8); declare void @agg_sum_if_double_skip_val_shared(i64*, double, double, i8); declare void @agg_sum_if_float_shared(i32*, float, i8); declare void 
@agg_sum_if_float_skip_val_shared(i32*, float, float, i8); declare void @agg_max_shared(i64*, i64); declare void @agg_max_skip_val_shared(i64*, i64, i64); declare void @agg_max_int32_shared(i32*, i32); declare void @agg_max_int32_skip_val_shared(i32*, i32, i32); declare void @agg_max_int16_shared(i16*, i16); declare void @agg_max_int16_skip_val_shared(i16*, i16, i16); declare void @agg_max_int8_shared(i8*, i8); declare void @agg_max_int8_skip_val_shared(i8*, i8, i8); declare void @agg_max_double_shared(i64*, double); declare void @agg_max_double_skip_val_shared(i64*, double, double); declare void @agg_max_float_shared(i32*, float); declare void @agg_max_float_skip_val_shared(i32*, float, float); declare void @agg_min_shared(i64*, i64); declare void @agg_min_skip_val_shared(i64*, i64, i64); declare void @agg_min_int32_shared(i32*, i32); declare void @agg_min_int32_skip_val_shared(i32*, i32, i32); declare void @agg_min_int16_shared(i16*, i16); declare void @agg_min_int16_skip_val_shared(i16*, i16, i16); declare void @agg_min_int8_shared(i8*, i8); declare void @agg_min_int8_skip_val_shared(i8*, i8, i8); declare void @agg_min_double_shared(i64*, double); declare void @agg_min_double_skip_val_shared(i64*, double, double); declare void @agg_min_float_shared(i32*, float); declare void @agg_min_float_skip_val_shared(i32*, float, float); declare void @agg_id_shared(i64*, i64); declare i8* @agg_id_varlen_shared(i8*, i64, i8*, i64); declare void @agg_id_int32_shared(i32*, i32); declare void @agg_id_int16_shared(i16*, i16); declare void @agg_id_int8_shared(i8*, i8); declare void @agg_id_double_shared(i64*, double); declare void @agg_id_double_shared_slow(i64*, double*); declare void @agg_id_float_shared(i32*, float); declare i32 @checked_single_agg_id_shared(i64*, i64, i64); declare i32 @checked_single_agg_id_double_shared(i64*, double, double); declare i32 @checked_single_agg_id_double_shared_slow(i64*, double*, double); declare i32 @checked_single_agg_id_float_shared(i32*, float, float); declare i1 @slotEmptyKeyCAS(i64*, i64, i64); declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32); declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16); declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8); declare i64 @datetrunc_century(i64); declare i64 @datetrunc_day(i64); declare i64 @datetrunc_decade(i64); declare i64 @datetrunc_hour(i64); declare i64 @datetrunc_millennium(i64); declare i64 @datetrunc_minute(i64); declare i64 @datetrunc_month(i64); declare i64 @datetrunc_quarter(i64); declare i64 @datetrunc_quarterday(i64); declare i64 @datetrunc_week_monday(i64); declare i64 @datetrunc_week_sunday(i64); declare i64 @datetrunc_week_saturday(i64); declare i64 @datetrunc_year(i64); declare i64 @extract_epoch(i64); declare i64 @extract_dateepoch(i64); declare i64 @extract_quarterday(i64); declare i64 @extract_hour(i64); declare i64 @extract_minute(i64); declare i64 @extract_second(i64); declare i64 @extract_millisecond(i64); declare i64 @extract_microsecond(i64); declare i64 @extract_nanosecond(i64); declare i64 @extract_dow(i64); declare i64 @extract_isodow(i64); declare i64 @extract_day(i64); declare i64 @extract_week_monday(i64); declare i64 @extract_week_sunday(i64); declare i64 @extract_week_saturday(i64); declare i64 @extract_day_of_year(i64); declare i64 @extract_month(i64); declare i64 @extract_quarter(i64); declare i64 @extract_year(i64); declare i64 @ExtractTimeFromHPTimestamp(i64,i64); declare i64 @ExtractTimeFromHPTimestampNullable(i64,i64,i64); declare i64 @ExtractTimeFromLPTimestamp(i64); declare i64 
@ExtractTimeFromLPTimestampNullable(i64,i64); declare i64 @DateTruncateHighPrecisionToDate(i64, i64); declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64); declare i64 @DateDiff(i32, i64, i64); declare i64 @DateDiffNullable(i32, i64, i64, i64); declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i32); declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i32, i64); declare i64 @DateAdd(i32, i64, i64); declare i64 @DateAddNullable(i32, i64, i64, i64); declare i64 @DateAddHighPrecision(i32, i64, i64, i32); declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i32, i64); declare {i8*,i64} @string_decode(i8*, i64); declare i32 @array_size(i8*, i64, i32); declare i32 @array_size_nullable(i8*, i64, i32, i32); declare i32 @array_size_1_nullable(i8*, i64, i32); declare i32 @fast_fixlen_array_size(i8*, i32); declare i1 @array_is_null(i8*, i64); declare i1 @point_coord_array_is_null(i8*, i64); declare i8* @array_buff(i8*, i64); declare i8* @fast_fixlen_array_buff(i8*, i64); declare i64 @determine_fixed_array_len(i8*, i64); declare i8 @array_at_int8_t(i8*, i64, i32); declare i16 @array_at_int16_t(i8*, i64, i32); declare i32 @array_at_int32_t(i8*, i64, i32); declare i64 @array_at_int64_t(i8*, i64, i32); declare float @array_at_float(i8*, i64, i32); declare double @array_at_double(i8*, i64, i32); declare i8 @varlen_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_array_at_int64_t(i8*, i64, i32); declare float @varlen_array_at_float(i8*, i64, i32); declare double @varlen_array_at_double(i8*, i64, i32); declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32); declare float @varlen_notnull_array_at_float(i8*, i64, i32); declare double @varlen_notnull_array_at_double(i8*, i64, i32); declare i8 @array_at_int8_t_checked(i8*, i64, i64, i8); declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16); declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32); declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64); declare float @array_at_float_checked(i8*, i64, i64, float); declare double @array_at_double_checked(i8*, i64, i64, double); declare i32 @char_length(i8*, i32); declare i32 @char_length_nullable(i8*, i32, i32); declare i32 @char_length_encoded(i8*, i32); declare i32 @char_length_encoded_nullable(i8*, i32, i32); declare i32 @key_for_string_encoded(i32); declare i1 @sample_ratio(double, i64); declare double @width_bucket(double, double, double, double, i32); declare double @width_bucket_reverse(double, double, double, double, i32); declare double @width_bucket_nullable(double, double, double, double, i32, double); declare double @width_bucket_reversed_nullable(double, double, double, double, i32, double); declare double @width_bucket_no_oob_check(double, double, double); declare double @width_bucket_reverse_no_oob_check(double, double, double); declare double @width_bucket_expr(double, i1, double, double, i32); declare double @width_bucket_expr_nullable(double, i1, double, double, i32, double); declare double @width_bucket_expr_no_oob_check(double, i1, double, double, i32); declare i1 @string_like(i8*, i32, i8*, i32, i8); declare i1 @string_ilike(i8*, i32, i8*, i32, i8); declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8); declare i8 @string_ilike_nullable(i8*, i32, i8*, 
i32, i8, i8); declare i1 @string_like_simple(i8*, i32, i8*, i32); declare i1 @string_ilike_simple(i8*, i32, i8*, i32); declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8); declare i1 @string_lt(i8*, i32, i8*, i32); declare i1 @string_le(i8*, i32, i8*, i32); declare i1 @string_gt(i8*, i32, i8*, i32); declare i1 @string_ge(i8*, i32, i8*, i32); declare i1 @string_eq(i8*, i32, i8*, i32); declare i1 @string_ne(i8*, i32, i8*, i32); declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8); declare i1 @regexp_like(i8*, i32, i8*, i32, i8); declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8); declare void @linear_probabilistic_count(i8*, i32, i8*, i32); declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, i64); declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64); declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64); declare void @record_error_code(i32, i32*); declare i32 @get_error_code(i32*); declare i1 @dynamic_watchdog(); declare i1 @check_interrupt(); declare void @force_sync(); declare void @sync_warp(); declare void @sync_warp_protected(i64, i64); declare void @sync_threadblock(); declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32); declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64); declare i64* @get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, float, float); declare i64* @get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double); declare double @decompress_x_coord_geoint(i32); declare double @decompress_y_coord_geoint(i32); declare i32 @compress_x_coord_geoint(double); declare i32 @compress_y_coord_geoint(double); declare i64 @fixed_width_date_encode(i64, i32, i64); declare i64 @fixed_width_date_decode(i64, i32, i64); )" + gen_array_any_all_sigs() +
639  gen_translate_null_key_sigs();
640 
641 #ifdef HAVE_CUDA
642 
643 namespace {
644 bool check_any_operand_is_stacksave_intrinsic(llvm::Instruction& inst) {
645  for (auto op_it = inst.op_begin(); op_it != inst.op_end(); op_it++) {
646  if (const llvm::IntrinsicInst* inst2 = llvm::dyn_cast<llvm::IntrinsicInst>(*op_it)) {
647  if (inst2->getIntrinsicID() == llvm::Intrinsic::stacksave) {
648  return true;
649  }
650  }
651  }
652  return false;
653 }
654 } // namespace
655 
656 std::string extension_function_decls(const std::unordered_set<std::string>& udf_decls) {
657  const auto decls =
658  ExtensionFunctionsWhitelist::getLLVMDeclarations(udf_decls, /*is_gpu=*/true);
659  return boost::algorithm::join(decls, "\n");
660 }
661 
662 void legalize_nvvm_ir(llvm::Function* query_func) {
663  // optimizations might add attributes to the function
664  // and NVPTX doesn't understand all of them; play it
665  // safe and clear all attributes
666  clear_function_attributes(query_func);
667  verify_function_ir(query_func);
668 
669  std::vector<llvm::Instruction*> stackrestore_intrinsics;
670  std::vector<llvm::Instruction*> stacksave_intrinsics;
671  std::vector<llvm::Instruction*> lifetime;
672  for (auto& BB : *query_func) {
673  for (llvm::Instruction& I : BB) {
674  if (llvm::dyn_cast<llvm::PHINode>(&I)) {
675  if (check_any_operand_is_stacksave_intrinsic(I)) {
676  // AFAIK, the only case where we have to remove a non-intrinsic inst is a PHI node
677  // whose operands include the llvm::stacksave intrinsic
678  stacksave_intrinsics.push_back(&I);
679  VLOG(2) << "Remove PHI node having llvm::stacksave intrinsic as its operand";
680  }
681  } else if (const llvm::IntrinsicInst* II =
682  llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
683  if (II->getIntrinsicID() == llvm::Intrinsic::stacksave) {
684  stacksave_intrinsics.push_back(&I);
685  } else if (II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
686  stackrestore_intrinsics.push_back(&I);
687  } else if (II->getIntrinsicID() == llvm::Intrinsic::lifetime_start ||
688  II->getIntrinsicID() == llvm::Intrinsic::lifetime_end) {
689  lifetime.push_back(&I);
690  }
691  }
692  }
693  }
694 
695  // stacksave and stackrestore intrinsics appear together, and
696  // stackrestore uses the stacksave result as its argument,
697  // so it should be removed first.
698  for (auto& II : stackrestore_intrinsics) {
699  II->eraseFromParent();
700  }
701  for (auto& II : stacksave_intrinsics) {
702  II->eraseFromParent();
703  }
704  // Remove lifetime intrinsics as well. NVPTX doesn't like them
705  for (auto& II : lifetime) {
706  II->eraseFromParent();
707  }
708 }
709 #endif // HAVE_CUDA
710 
711 } // namespace
712 
713 llvm::StringRef get_gpu_target_triple_string() {
714  return llvm::StringRef("nvptx64-nvidia-cuda");
715 }
716 
717 llvm::StringRef get_gpu_data_layout() {
718  return llvm::StringRef(
719  "e-p:64:64:64-i1:8:8-i8:8:8-"
720  "i16:16:16-i32:32:32-i64:64:64-"
721  "f32:32:32-f64:64:64-v16:16:16-"
722  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
723 }
724 
725 std::map<std::string, std::string> get_device_parameters(bool cpu_only) {
726  std::map<std::string, std::string> result;
727 
728  result.insert(std::make_pair("cpu_name", llvm::sys::getHostCPUName()));
729  result.insert(std::make_pair("cpu_triple", llvm::sys::getProcessTriple()));
730  result.insert(
731  std::make_pair("cpu_cores", std::to_string(llvm::sys::getHostNumPhysicalCores())));
732  result.insert(std::make_pair("cpu_threads", std::to_string(cpu_threads())));
733 
734  // https://en.cppreference.com/w/cpp/language/types
735  std::string sizeof_types;
736  sizeof_types += "bool:" + std::to_string(sizeof(bool)) + ";";
737  sizeof_types += "size_t:" + std::to_string(sizeof(size_t)) + ";";
738  sizeof_types += "ssize_t:" + std::to_string(sizeof(ssize_t)) + ";";
739  sizeof_types += "char:" + std::to_string(sizeof(char)) + ";";
740  sizeof_types += "uchar:" + std::to_string(sizeof(unsigned char)) + ";";
741  sizeof_types += "short:" + std::to_string(sizeof(short)) + ";";
742  sizeof_types += "ushort:" + std::to_string(sizeof(unsigned short int)) + ";";
743  sizeof_types += "int:" + std::to_string(sizeof(int)) + ";";
744  sizeof_types += "uint:" + std::to_string(sizeof(unsigned int)) + ";";
745  sizeof_types += "long:" + std::to_string(sizeof(long int)) + ";";
746  sizeof_types += "ulong:" + std::to_string(sizeof(unsigned long int)) + ";";
747  sizeof_types += "longlong:" + std::to_string(sizeof(long long int)) + ";";
748  sizeof_types += "ulonglong:" + std::to_string(sizeof(unsigned long long int)) + ";";
749  sizeof_types += "float:" + std::to_string(sizeof(float)) + ";";
750  sizeof_types += "double:" + std::to_string(sizeof(double)) + ";";
751  sizeof_types += "longdouble:" + std::to_string(sizeof(long double)) + ";";
752  sizeof_types += "voidptr:" + std::to_string(sizeof(void*)) + ";";
753 
754  result.insert(std::make_pair("type_sizeof", sizeof_types));
755 
756  std::string null_values;
757  null_values += "boolean1:" + std::to_string(serialized_null_value<bool>()) + ";";
758  null_values += "boolean8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
759  null_values += "int8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
760  null_values += "int16:" + std::to_string(serialized_null_value<int16_t>()) + ";";
761  null_values += "int32:" + std::to_string(serialized_null_value<int32_t>()) + ";";
762  null_values += "int64:" + std::to_string(serialized_null_value<int64_t>()) + ";";
763  null_values += "uint8:" + std::to_string(serialized_null_value<uint8_t>()) + ";";
764  null_values += "uint16:" + std::to_string(serialized_null_value<uint16_t>()) + ";";
765  null_values += "uint32:" + std::to_string(serialized_null_value<uint32_t>()) + ";";
766  null_values += "uint64:" + std::to_string(serialized_null_value<uint64_t>()) + ";";
767  null_values += "float32:" + std::to_string(serialized_null_value<float>()) + ";";
768  null_values += "float64:" + std::to_string(serialized_null_value<double>()) + ";";
769  null_values +=
770  "Array<boolean8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
771  null_values +=
772  "Array<int8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
773  null_values +=
774  "Array<int16>:" + std::to_string(serialized_null_value<int16_t, true>()) + ";";
775  null_values +=
776  "Array<int32>:" + std::to_string(serialized_null_value<int32_t, true>()) + ";";
777  null_values +=
778  "Array<int64>:" + std::to_string(serialized_null_value<int64_t, true>()) + ";";
779  null_values +=
780  "Array<float32>:" + std::to_string(serialized_null_value<float, true>()) + ";";
781  null_values +=
782  "Array<float64>:" + std::to_string(serialized_null_value<double, true>()) + ";";
783 
784  result.insert(std::make_pair("null_values", null_values));
785 
786  llvm::StringMap<bool> cpu_features;
787  if (llvm::sys::getHostCPUFeatures(cpu_features)) {
788  std::string features_str = "";
789  for (auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
790  features_str += (it->getValue() ? " +" : " -");
791  features_str += it->getKey().str();
792  }
793  result.insert(std::make_pair("cpu_features", features_str));
794  }
795 
796  result.insert(std::make_pair("llvm_version",
797  std::to_string(LLVM_VERSION_MAJOR) + "." +
798  std::to_string(LLVM_VERSION_MINOR) + "." +
799  std::to_string(LLVM_VERSION_PATCH)));
800 
801 #ifdef HAVE_CUDA
802  if (!cpu_only) {
803  int device_count = 0;
804  checkCudaErrors(cuDeviceGetCount(&device_count));
805  if (device_count) {
806  CUdevice device{};
807  char device_name[256];
808  int major = 0, minor = 0;
809  int driver_version;
810  checkCudaErrors(cuDeviceGet(&device, 0)); // assuming homogeneous multi-GPU system
811  checkCudaErrors(cuDeviceGetName(device_name, 256, device));
812  checkCudaErrors(cuDeviceGetAttribute(
813  &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
814  checkCudaErrors(cuDeviceGetAttribute(
815  &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
816  checkCudaErrors(cuDriverGetVersion(&driver_version));
817 
818  result.insert(std::make_pair("gpu_name", device_name));
819  result.insert(std::make_pair("gpu_count", std::to_string(device_count)));
820  result.insert(std::make_pair("gpu_compute_capability",
821  std::to_string(major) + "." + std::to_string(minor)));
822  result.insert(std::make_pair("gpu_triple", get_gpu_target_triple_string()));
823  result.insert(std::make_pair("gpu_datalayout", get_gpu_data_layout()));
824  result.insert(std::make_pair("gpu_driver",
825  "CUDA " + std::to_string(driver_version / 1000) + "." +
826  std::to_string((driver_version % 1000) / 10)));
827 
828  auto rt_libdevice_path = get_cuda_libdevice_dir() + "/libdevice.10.bc";
829  result.insert(
830  std::make_pair("gpu_has_libdevice",
831  std::to_string(boost::filesystem::exists(rt_libdevice_path))));
832  }
833  }
834 #endif
835 
836  return result;
837 }
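// A minimal usage sketch (hypothetical caller) that dumps the collected parameters;
// keys include "cpu_name", "type_sizeof", "null_values" and, with CUDA, "gpu_name":
//
//   for (const auto& [key, value] : get_device_parameters(/*cpu_only=*/true)) {
//     VLOG(1) << key << ": " << value;
//   }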
838 
839 namespace {
840 
841 #ifdef HAVE_CUDA
842 std::unordered_set<llvm::Function*> findAliveRuntimeFuncs(
843  llvm::Module& llvm_module,
844  const std::vector<llvm::Function*>& roots) {
845  std::queue<llvm::Function*> queue;
846  std::unordered_set<llvm::Function*> visited;
847  for (llvm::Function* F : roots) {
848  queue.push(F);
849  }
850 
851  while (!queue.empty()) {
852  llvm::Function* F = queue.front();
853  queue.pop();
854  if (visited.find(F) != visited.end()) {
855  continue;
856  }
857  visited.insert(F);
858 
859  for (llvm::inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
860  if (llvm::CallInst* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
861  if (CI->isInlineAsm()) { // libdevice calls inline assembly code
862  continue;
863  }
864  llvm::Function* called = CI->getCalledFunction();
865  if (!called || visited.find(called) != visited.end()) {
866  continue;
867  }
868  queue.push(called);
869  }
870  }
871  }
872  return visited;
873 }
874 #endif
875 
876 } // namespace
877 
878 void CodeGenerator::linkModuleWithLibdevice(
879  Executor* executor,
880  llvm::Module& llvm_module,
881  llvm::PassManagerBuilder& pass_manager_builder,
882  const GPUTarget& gpu_target) {
883 #ifdef HAVE_CUDA
884  auto timer = DEBUG_TIMER(__func__);
885 
886  if (!executor->has_libdevice_module()) {
887  // raise error
888  throw std::runtime_error(
889  "libdevice library is not available but required by the UDF module");
890  }
891 
892  // Save the functions defined in the module
893  std::vector<llvm::Function*> roots;
894  for (llvm::Function& fn : llvm_module) {
895  if (!fn.isDeclaration()) {
896  roots.emplace_back(&fn);
897  }
898  }
899 
900  // Bind libdevice to the current module
901  CodeGenerator::link_udf_module(executor->get_libdevice_module(),
902  llvm_module,
903  gpu_target.cgen_state,
904  llvm::Linker::Flags::OverrideFromSrc);
905 
906  std::unordered_set<llvm::Function*> live_funcs =
907  findAliveRuntimeFuncs(llvm_module, roots);
908 
909  std::vector<llvm::Function*> funcs_to_delete;
910  for (llvm::Function& fn : llvm_module) {
911  if (!live_funcs.count(&fn)) {
912  // deleting the function here would invalidate the iterator
913  funcs_to_delete.emplace_back(&fn);
914  }
915  }
916 
917  for (llvm::Function* f : funcs_to_delete) {
918  f->eraseFromParent();
919  }
920 
921  // activate nvvm-reflect-ftz flag on the module
922 #if LLVM_VERSION_MAJOR >= 11
923  llvm::LLVMContext& ctx = llvm_module.getContext();
924  llvm_module.setModuleFlag(llvm::Module::Override,
925  "nvvm-reflect-ftz",
926  llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
927  llvm::Type::getInt32Ty(ctx), uint32_t(1))));
928 #else
929  llvm_module.addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", uint32_t(1));
930 #endif
931  for (llvm::Function& fn : llvm_module) {
932  fn.addFnAttr("nvptx-f32ftz", "true");
933  }
934 
935  // add nvvm reflect pass replacing any NVVM conditionals with constants
936  gpu_target.nvptx_target_machine->adjustPassManager(pass_manager_builder);
937  llvm::legacy::FunctionPassManager FPM(&llvm_module);
938  pass_manager_builder.populateFunctionPassManager(FPM);
939 
940  // Run the NVVMReflectPass here rather than inside optimize_ir
941  FPM.doInitialization();
942  for (auto& F : llvm_module) {
943  FPM.run(F);
944  }
945  FPM.doFinalization();
946 #endif
947 }
948 
949 std::shared_ptr<GpuCompilationContext> CodeGenerator::generateNativeGPUCode(
950  Executor* executor,
951  llvm::Function* func,
952  llvm::Function* wrapper_func,
953  const std::unordered_set<llvm::Function*>& live_funcs,
954  const bool is_gpu_smem_used,
955  const CompilationOptions& co,
956  const GPUTarget& gpu_target) {
957 #ifdef HAVE_CUDA
958  auto timer = DEBUG_TIMER(__func__);
959  auto llvm_module = func->getParent();
960  /*
961  `func` is one of the following generated functions:
962  - `call_table_function(i8** %input_col_buffers, i64*
963  %input_row_count, i64** %output_buffers, i64* %output_row_count)`
964  that wraps the user-defined table function.
965  - `multifrag_query`
966  - `multifrag_query_hoisted_literals`
967  - ...
968 
969  `wrapper_func` is table_func_kernel(i32*, i8**, i64*, i64**,
970  i64*) that wraps `call_table_function`.
971 
972  `llvm_module` is from `build/QueryEngine/RuntimeFunctions.bc` and it
973  contains `func` and `wrapper_func`. `llvm_module` should also contain
974  the definitions of user-defined table functions.
975 
976  `live_funcs` contains table_func_kernel and call_table_function
977 
978  `gpu_target.cgen_state->module_` appears to be the same as `llvm_module`
979  */
980  CHECK(gpu_target.cgen_state->module_ == llvm_module);
981  CHECK(func->getParent() == wrapper_func->getParent());
982  llvm_module->setDataLayout(
983  "e-p:64:64:64-i1:8:8-i8:8:8-"
984  "i16:16:16-i32:32:32-i64:64:64-"
985  "f32:32:32-f64:64:64-v16:16:16-"
986  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
987  llvm_module->setTargetTriple("nvptx64-nvidia-cuda");
988  CHECK(gpu_target.nvptx_target_machine);
989  llvm::PassManagerBuilder pass_manager_builder = llvm::PassManagerBuilder();
990 
991  pass_manager_builder.OptLevel = 0;
992  llvm::legacy::PassManager module_pass_manager;
993  pass_manager_builder.populateModulePassManager(module_pass_manager);
994 
995  bool requires_libdevice = check_module_requires_libdevice(llvm_module);
996 
997  if (requires_libdevice) {
998  linkModuleWithLibdevice(executor, *llvm_module, pass_manager_builder, gpu_target);
999  }
1000 
1001  // run optimizations
1002  optimize_ir(func, llvm_module, module_pass_manager, live_funcs, is_gpu_smem_used, co);
1003  legalize_nvvm_ir(func);
1004 
1005  std::stringstream ss;
1006  llvm::raw_os_ostream os(ss);
1007 
1008  llvm::LLVMContext& ctx = llvm_module->getContext();
1009  // Get "nvvm.annotations" metadata node
1010  llvm::NamedMDNode* md = llvm_module->getOrInsertNamedMetadata("nvvm.annotations");
1011 
1012  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
1013  llvm::MDString::get(ctx, "kernel"),
1014  llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
1015  llvm::Type::getInt32Ty(ctx), 1))};
1016 
1017  // Append metadata to nvvm.annotations
1018  md->addOperand(llvm::MDNode::get(ctx, md_vals));
1019 
1020  std::unordered_set<llvm::Function*> roots{wrapper_func, func};
1021  if (gpu_target.row_func_not_inlined) {
1022  clear_function_attributes(gpu_target.cgen_state->row_func_);
1023  roots.insert(gpu_target.cgen_state->row_func_);
1024  if (gpu_target.cgen_state->filter_func_) {
1025  roots.insert(gpu_target.cgen_state->filter_func_);
1026  }
1027  }
1028 
1029  // prevent helper functions from being removed
1030  for (auto f : gpu_target.cgen_state->helper_functions_) {
1031  roots.insert(f);
1032  }
1033 
1034  if (requires_libdevice) {
1035  for (llvm::Function& F : *llvm_module) {
1036  // Some libdevice functions call other functions that start with the "__internal_"
1037  // prefix, for example:
1038  // __internal_trig_reduction_slowpathd
1039  // __internal_accurate_pow
1040  // __internal_lgamma_pos
1041  // Those functions have a "noinline" attribute which prevents the optimizer from
1042  // inlining them into the body of @query_func
1043  if (F.hasName() && F.getName().startswith("__internal") && !F.isDeclaration()) {
1044  roots.insert(&F);
1045  }
1046  legalize_nvvm_ir(&F);
1047  }
1048  }
1049 
1050  // Prevent the udf function(s) from being removed the way the runtime functions are
1051  std::unordered_set<std::string> udf_declarations;
1052 
1053  if (executor->has_udf_module(/*is_gpu=*/true)) {
1054  for (auto& f : executor->get_udf_module(/*is_gpu=*/true)->getFunctionList()) {
1055  llvm::Function* udf_function = llvm_module->getFunction(f.getName());
1056 
1057  if (udf_function) {
1058  legalize_nvvm_ir(udf_function);
1059  roots.insert(udf_function);
1060 
1061  // If we have a udf that declares an external function,
1062  // note it so we can avoid duplicate declarations
1063  if (f.isDeclaration()) {
1064  udf_declarations.insert(f.getName().str());
1065  }
1066  }
1067  }
1068  }
1069 
1070  if (executor->has_rt_udf_module(/*is_gpu=*/true)) {
1071  for (auto& f : executor->get_rt_udf_module(/*is_gpu=*/true)->getFunctionList()) {
1072  llvm::Function* udf_function = llvm_module->getFunction(f.getName());
1073  if (udf_function) {
1074  legalize_nvvm_ir(udf_function);
1075  roots.insert(udf_function);
1076 
1077  // If we have a udf that declares an external function,
1078  // note it so we can avoid duplicate declarations
1079  if (f.isDeclaration()) {
1080  udf_declarations.insert(f.getName().str());
1081  }
1082  }
1083  }
1084  }
1085 
1086  std::vector<llvm::Function*> rt_funcs;
1087  for (auto& Fn : *llvm_module) {
1088  if (roots.count(&Fn)) {
1089  continue;
1090  }
1091  rt_funcs.push_back(&Fn);
1092  }
1093  for (auto& pFn : rt_funcs) {
1094  pFn->removeFromParent();
1095  }
1096 
1097  if (requires_libdevice) {
1098  add_intrinsics_to_module(llvm_module);
1099  }
1100 
1101  if (!llvm_module->getModuleFlag("Debug Info Version")) {
1102  // Fixes QE-705
1103  llvm_module->addModuleFlag(
1104  llvm::Module::Error, "Debug Info Version", llvm::DEBUG_METADATA_VERSION);
1105  }
1106 
1107  llvm_module->print(os, nullptr);
1108  os.flush();
1109 
1110  for (auto& pFn : rt_funcs) {
1111  llvm_module->getFunctionList().push_back(pFn);
1112  }
1113  llvm_module->eraseNamedMetadata(md);
1114 
1115  auto cuda_llir = ss.str() + cuda_rt_decls + extension_function_decls(udf_declarations);
1116  std::string ptx;
1117  try {
1118  ptx = generatePTX(
1119  cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state->context_);
1120  } catch (ParseIRError& e) {
1121  LOG(WARNING) << "Failed to generate PTX: " << e.what()
1122  << ". Switching to CPU execution target.";
1123  throw QueryMustRunOnCpu();
1124  }
1125  LOG(PTX) << "PTX for the GPU:\n" << ptx << "\nEnd of PTX";
1126 
1127  auto cubin_result = ptx_to_cubin(ptx, gpu_target.cuda_mgr);
1128  auto& option_keys = cubin_result.option_keys;
1129  auto& option_values = cubin_result.option_values;
1130  auto cubin = cubin_result.cubin;
1131  auto link_state = cubin_result.link_state;
1132  const auto num_options = option_keys.size();
1133 
1134  auto func_name = wrapper_func->getName().str();
1135  auto gpu_compilation_context = std::make_shared<GpuCompilationContext>();
1136  for (int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
1137  ++device_id) {
1138  gpu_compilation_context->addDeviceCode(
1139  std::make_unique<GpuDeviceCompilationContext>(cubin,
1140  cubin_result.cubin_size,
1141  func_name,
1142  device_id,
1143  gpu_target.cuda_mgr,
1144  num_options,
1145  &option_keys[0],
1146  &option_values[0]));
1147  }
1148 
1149  checkCudaErrors(cuLinkDestroy(link_state));
1150  try {
1151  QueryEngine::getInstance()->addGpuKernelSize(cubin_result.cubin_size);
1152  } catch (std::runtime_error const& e) {
1153  if (strcmp(e.what(), "QueryEngine instance hasn't been created")) {
1154  LOG(WARNING) << "QueryEngine::getInstance() failed: " << e.what();
1155  }
1156  } catch (...) {
1157  LOG(WARNING) << "Failed to get QueryEngine instance";
1158  throw;
1159  }
1160  return gpu_compilation_context;
1161 #else
1162  return {};
1163 #endif
1164 }
1165 
1166 std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenGPU(
1167  llvm::Function* query_func,
1168  llvm::Function* multifrag_query_func,
1169  std::unordered_set<llvm::Function*>& live_funcs,
1170  const bool no_inline,
1171  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
1172  const bool is_gpu_smem_used,
1173  const CompilationOptions& co) {
1174 #ifdef HAVE_CUDA
1175  auto timer = DEBUG_TIMER(__func__);
1176 
1177  CHECK(cuda_mgr);
1178  CodeCacheKey key{serialize_llvm_object(query_func),
1179  serialize_llvm_object(cgen_state_->row_func_)};
1180  if (cgen_state_->filter_func_) {
1181  key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
1182  }
1183  for (const auto helper : cgen_state_->helper_functions_) {
1184  key.push_back(serialize_llvm_object(helper));
1185  }
1186  auto cached_code = QueryEngine::getInstance()->gpu_code_accessor->get_value(key);
1187  if (cached_code) {
1188  return cached_code;
1189  }
1190  bool row_func_not_inlined = false;
1191  if (no_inline) {
1192  for (auto it = llvm::inst_begin(cgen_state_->row_func_),
1193  e = llvm::inst_end(cgen_state_->row_func_);
1194  it != e;
1195  ++it) {
1196  if (llvm::isa<llvm::CallInst>(*it)) {
1197  auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
1198  auto const func_name = CodegenUtil::getCalledFunctionName(get_gv_call);
1199  if (func_name &&
1200  (*func_name == "array_size" || *func_name == "linear_probabilistic_count")) {
1201  mark_function_never_inline(cgen_state_->row_func_);
1202  row_func_not_inlined = true;
1203  break;
1204  }
1205  }
1206  }
1207  }
1208 
1209  initializeNVPTXBackend();
1210  CodeGenerator::GPUTarget gpu_target{
1211  nvptx_target_machine_.get(), cuda_mgr, cgen_state_.get(), row_func_not_inlined};
1212  std::shared_ptr<GpuCompilationContext> compilation_context;
1213 
1214  try {
1215  compilation_context = CodeGenerator::generateNativeGPUCode(this,
1216  query_func,
1217  multifrag_query_func,
1218  live_funcs,
1219  is_gpu_smem_used,
1220  co,
1221  gpu_target);
1222  } catch (CudaMgr_Namespace::CudaErrorException& cuda_error) {
1223  if (cuda_error.getStatus() == CUDA_ERROR_OUT_OF_MEMORY) {
1224  // Thrown if memory not able to be allocated on gpu
1225  // Retry once after evicting portion of code cache
1226  auto& code_cache_accessor = QueryEngine::getInstance()->gpu_code_accessor;
1227  auto const num_entries_to_evict =
1228  code_cache_accessor->computeNumEntriesToEvict(g_fraction_code_cache_to_evict);
1229  auto evicted_kernels_size =
1230  code_cache_accessor->getSumSizeEvicted(num_entries_to_evict);
1231  LOG(WARNING) << "Failed to allocate GPU memory for generated code. Evicting "
1232  << num_entries_to_evict << " (" << evicted_kernels_size
1233  << " bytes) cached GPU code and re-trying.";
1234  try {
1235  QueryEngine::getInstance()->subGpuKernelSize(evicted_kernels_size);
1236  } catch (std::runtime_error const& e) {
1237  if (strcmp(e.what(), "QueryEngine instance hasn't been created")) {
1238  LOG(WARNING) << "QueryEngine::getInstance() failed: " << e.what();
1239  }
1240  } catch (...) {
1241  LOG(WARNING) << "Failed to get QueryEngine instance";
1242  }
1243  code_cache_accessor->evictEntries(num_entries_to_evict);
1244  compilation_context = CodeGenerator::generateNativeGPUCode(this,
1245  query_func,
1246  multifrag_query_func,
1247  live_funcs,
1248  is_gpu_smem_used,
1249  co,
1250  gpu_target);
1251  } else {
1252  throw;
1253  }
1254  }
1255  QueryEngine::getInstance()->gpu_code_accessor->put(key, compilation_context);
1256  return std::dynamic_pointer_cast<CompilationContext>(compilation_context);
1257 #else
1258  return nullptr;
1259 #endif
1260 }
1261 
1262 std::string CodeGenerator::generatePTX(const std::string& cuda_llir,
1263  llvm::TargetMachine* nvptx_target_machine,
1264  llvm::LLVMContext& context) {
1265  auto timer = DEBUG_TIMER(__func__);
1266  auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir, "", false);
1267 
1268  llvm::SMDiagnostic parse_error;
1269 
1270  auto llvm_module = llvm::parseIR(mem_buff->getMemBufferRef(), parse_error, context);
1271  if (!llvm_module) {
1272  LOG(IR) << "CodeGenerator::generatePTX:NVVM IR:\n" << cuda_llir << "\nEnd of NVVM IR";
1273  throw_parseIR_error(parse_error, "generatePTX", /* is_gpu= */ true);
1274  }
1275 
1276  llvm::SmallString<256> code_str;
1277  llvm::raw_svector_ostream formatted_os(code_str);
1278  CHECK(nvptx_target_machine);
1279  {
1280  llvm::legacy::PassManager ptxgen_pm;
1281  llvm_module->setDataLayout(nvptx_target_machine->createDataLayout());
1282 
1283 #if LLVM_VERSION_MAJOR >= 10
1284  nvptx_target_machine->addPassesToEmitFile(
1285  ptxgen_pm, formatted_os, nullptr, llvm::CGFT_AssemblyFile);
1286 #else
1287  nvptx_target_machine->addPassesToEmitFile(
1288  ptxgen_pm, formatted_os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
1289 #endif
1290  ptxgen_pm.run(*llvm_module);
1291  }
1292 
1293 #if LLVM_VERSION_MAJOR >= 11
1294  return std::string(code_str);
1295 #else
1296  return code_str.str();
1297 #endif
1298 }
1299 
1301 
1302 std::unique_ptr<llvm::TargetMachine> CodeGenerator::initializeNVPTXBackend(
1303  const CudaMgr_Namespace::NvidiaDeviceArch arch) {
1304  auto timer = DEBUG_TIMER(__func__);
1305 
1306  std::lock_guard<std::mutex> lock(initialize_nvptx_mutex_);
1307 
1308  llvm::InitializeAllTargets();
1309  llvm::InitializeAllTargetMCs();
1310  llvm::InitializeAllAsmPrinters();
1311  std::string err;
1312  auto target = llvm::TargetRegistry::lookupTarget("nvptx64", err);
1313  if (!target) {
1314  LOG(FATAL) << err;
1315  }
1316  return std::unique_ptr<llvm::TargetMachine>(
1317  target->createTargetMachine("nvptx64-nvidia-cuda",
1318  CudaMgr_Namespace::CudaMgr::deviceArchToSM(arch),
1319  "",
1320  llvm::TargetOptions(),
1321  llvm::Reloc::Static));
1322 }
1323 
1324 std::string Executor::generatePTX(const std::string& cuda_llir) const {
1325  return CodeGenerator::generatePTX(
1326  cuda_llir, nvptx_target_machine_.get(), cgen_state_->context_);
1327 }
1328 
1329 void Executor::initializeNVPTXBackend() const {
1330  if (nvptx_target_machine_) {
1331  return;
1332  }
1333  const auto arch = cudaMgr()->getDeviceArch();
1334  nvptx_target_machine_ = CodeGenerator::initializeNVPTXBackend(arch);
1335 }
1336 
1337 // A small number of runtime functions don't get through CgenState::emitCall. List them
1338 // explicitly here and always clone their implementation from the runtime module.
1339 constexpr std::array<std::string_view, 18> TARGET_RUNTIME_FUNCTIONS_FOR_MODULE_CLONING{
1340  {"query_stub_hoisted_literals",
1341  "multifrag_query_hoisted_literals",
1342  "query_stub",
1343  "multifrag_query",
1344  "fixed_width_int_decode",
1345  "fixed_width_unsigned_decode",
1346  "diff_fixed_width_int_decode",
1347  "fixed_width_double_decode",
1348  "fixed_width_float_decode",
1349  "fixed_width_small_date_decode",
1350  "record_error_code",
1351  "get_error_code",
1352  "pos_start_impl",
1353  "pos_step_impl",
1354  "group_buff_idx_impl",
1355  "init_shared_mem",
1356  "init_shared_mem_nop",
1357  "write_back_nop"}};
1358 bool CodeGenerator::alwaysCloneRuntimeFunction(const llvm::Function* func) {
1359  auto const candidate_func_name = func->getName().str();
1360  return std::any_of(TARGET_RUNTIME_FUNCTIONS_FOR_MODULE_CLONING.begin(),
1361  TARGET_RUNTIME_FUNCTIONS_FOR_MODULE_CLONING.end(),
1362  [candidate_func_name](std::string_view func_name) {
1363  return candidate_func_name == func_name;
1364  });
1365 }
1366 
1367 std::unique_ptr<llvm::Module> read_llvm_module_from_bc_file(
1368  const std::string& bc_filename,
1369  llvm::LLVMContext& context) {
1370  llvm::SMDiagnostic err;
1371 
1372  auto buffer_or_error = llvm::MemoryBuffer::getFile(bc_filename);
1373  CHECK(!buffer_or_error.getError()) << "bc_filename=" << bc_filename;
1374  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1375 
1376  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1377  CHECK(!owner.takeError());
1378  CHECK(owner->get());
1379  return std::move(owner.get());
1380 }
1381 
1382 std::unique_ptr<llvm::Module> read_llvm_module_from_ir_file(
1383  const std::string& udf_ir_filename,
1384  llvm::LLVMContext& ctx,
1385  bool is_gpu = false) {
1386  llvm::SMDiagnostic parse_error;
1387 
1388  llvm::StringRef file_name_arg(udf_ir_filename);
1389 
1390  auto owner = llvm::parseIRFile(file_name_arg, parse_error, ctx);
1391  if (!owner) {
1392  throw_parseIR_error(parse_error, udf_ir_filename, is_gpu);
1393  }
1394 
1395  if (is_gpu) {
1396  llvm::Triple gpu_triple(owner->getTargetTriple());
1397  if (!gpu_triple.isNVPTX()) {
1398  LOG(WARNING)
1399  << "Expected triple nvptx64-nvidia-cuda for NVVM IR of loadtime UDFs but got "
1400  << gpu_triple.str() << ". Disabling the NVVM IR module.";
1401  return std::unique_ptr<llvm::Module>();
1402  }
1403  }
1404  return owner;
1405 }
1406 
1407 std::unique_ptr<llvm::Module> read_llvm_module_from_ir_string(
1408  const std::string& udf_ir_string,
1409  llvm::LLVMContext& ctx,
1410  bool is_gpu = false) {
1411  llvm::SMDiagnostic parse_error;
1412 
1413  auto buf = std::make_unique<llvm::MemoryBufferRef>(udf_ir_string,
1414  "Runtime UDF/UDTF LLVM/NVVM IR");
1415 
1416  auto owner = llvm::parseIR(*buf, parse_error, ctx);
1417  if (!owner) {
1418  LOG(IR) << "read_llvm_module_from_ir_string:\n"
1419  << udf_ir_string << "\nEnd of LLVM/NVVM IR";
1420  throw_parseIR_error(parse_error, "", /* is_gpu= */ is_gpu);
1421  }
1422 
1423  if (is_gpu) {
1424  llvm::Triple gpu_triple(owner->getTargetTriple());
1425  if (!gpu_triple.isNVPTX()) {
1426  LOG(IR) << "read_llvm_module_from_ir_string:\n"
1427  << udf_ir_string << "\nEnd of NVVM IR";
1428  LOG(WARNING) << "Expected triple nvptx64-nvidia-cuda for NVVM IR but got "
1429  << gpu_triple.str()
1430  << ". Executing runtime UDF/UDTFs on GPU will be disabled.";
1431  return std::unique_ptr<llvm::Module>();
1432 
1433  }
1434  }
1435  return owner;
1436 }
1437 
1438 namespace {
1439 
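// Replaces the position-function placeholder emitted by the query template:
// for example, a call to pos_start() inside query_func is rewritten in place
// to pos_start_impl(row_index_resume) when use_resume_param is true, or to
// <pos_fn_name>_impl() with no arguments otherwise (as done for
// "group_buff_idx" and "pos_step" in compileWorkUnit below).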
1440 void bind_pos_placeholders(const std::string& pos_fn_name,
1441  const bool use_resume_param,
1442  llvm::Function* query_func,
1443  llvm::Module* llvm_module) {
1444  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
1445  ++it) {
1446  if (!llvm::isa<llvm::CallInst>(*it)) {
1447  continue;
1448  }
1449  auto& pos_call = llvm::cast<llvm::CallInst>(*it);
1450  auto const func_name = CodegenUtil::getCalledFunctionName(pos_call);
1451  if (func_name && *func_name == pos_fn_name) {
1452  if (use_resume_param) {
1453  auto* const row_index_resume = get_arg_by_name(query_func, "row_index_resume");
1454  llvm::ReplaceInstWithInst(
1455  &pos_call,
1456  llvm::CallInst::Create(llvm_module->getFunction(pos_fn_name + "_impl"),
1457  row_index_resume));
1458  } else {
1459  llvm::ReplaceInstWithInst(
1460  &pos_call,
1461  llvm::CallInst::Create(llvm_module->getFunction(pos_fn_name + "_impl")));
1462  }
1463  break;
1464  }
1465  }
1466 }
1467 
1468 void set_row_func_argnames(llvm::Function* row_func,
1469  const size_t in_col_count,
1470  const size_t agg_col_count,
1471  const bool hoist_literals) {
1472  auto arg_it = row_func->arg_begin();
1473 
1474  if (agg_col_count) {
1475  for (size_t i = 0; i < agg_col_count; ++i) {
1476  arg_it->setName("out");
1477  ++arg_it;
1478  }
1479  } else {
1480  arg_it->setName("group_by_buff");
1481  ++arg_it;
1482  arg_it->setName("varlen_output_buff");
1483  ++arg_it;
1484  arg_it->setName("crt_matched");
1485  ++arg_it;
1486  arg_it->setName("total_matched");
1487  ++arg_it;
1488  arg_it->setName("old_total_matched");
1489  ++arg_it;
1490  arg_it->setName("max_matched");
1491  ++arg_it;
1492  }
1493 
1494  arg_it->setName("agg_init_val");
1495  ++arg_it;
1496 
1497  arg_it->setName("pos");
1498  ++arg_it;
1499 
1500  arg_it->setName("frag_row_off");
1501  ++arg_it;
1502 
1503  arg_it->setName("num_rows_per_scan");
1504  ++arg_it;
1505 
1506  if (hoist_literals) {
1507  arg_it->setName("literals");
1508  ++arg_it;
1509  }
1510 
1511  for (size_t i = 0; i < in_col_count; ++i) {
1512  arg_it->setName("col_buf" + std::to_string(i));
1513  ++arg_it;
1514  }
1515 
1516  arg_it->setName("join_hash_tables");
1517  ++arg_it;
1518  arg_it->setName("row_func_mgr");
1519 }
1520 
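// create_row_function (below) builds the declaration that the code generator
// later fills in. As a rough, illustrative sketch (not generated verbatim),
// with in_col_count = 2, agg_col_count = 1 and hoist_literals = true the
// resulting signature is approximately:
//
//   i32 @row_func(i64* %out, i64* %agg_init_val, i64 %pos, i64* %frag_row_off,
//                 i64* %num_rows_per_scan, i8* %literals, i8* %col_buf0,
//                 i8* %col_buf1, i64* %join_hash_tables, i8* %row_func_mgr)
//
// The argument names come from set_row_func_argnames above and exist for
// debugging purposes only.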
1521 llvm::Function* create_row_function(const size_t in_col_count,
1522  const size_t agg_col_count,
1523  const bool hoist_literals,
1524  llvm::Module* llvm_module,
1525  llvm::LLVMContext& context) {
1526  std::vector<llvm::Type*> row_process_arg_types;
1527 
1528  if (agg_col_count) {
1529  // output (aggregate) arguments
1530  for (size_t i = 0; i < agg_col_count; ++i) {
1531  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1532  }
1533  } else {
1534  // group by buffer
1535  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1536  // varlen output buffer
1537  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1538  // current match count
1539  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1540  // total match count passed from the caller
1541  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1542  // old total match count returned to the caller
1543  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1544  // max matched (total number of slots in the output buffer)
1545  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1546  }
1547 
1548  // aggregate init values
1549  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1550 
1551  // position argument
1552  row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));
1553 
1554  // fragment row offset argument
1555  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1556 
1557  // number of rows for each scan
1558  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1559 
1560  // literals buffer argument
1561  if (hoist_literals) {
1562  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
1563  }
1564 
1565  // column buffer arguments
1566  for (size_t i = 0; i < in_col_count; ++i) {
1567  row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
1568  }
1569 
1570  // join hash table argument
1571  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1572 
1573  // row function manager
1574  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
1575 
1576  // generate the function
1577  auto ft =
1578  llvm::FunctionType::get(get_int_type(32, context), row_process_arg_types, false);
1579 
1580  auto row_func = llvm::Function::Create(
1581  ft, llvm::Function::ExternalLinkage, "row_func", llvm_module);
1583  // set the row function argument names; for debugging purposes only
1584  set_row_func_argnames(row_func, in_col_count, agg_col_count, hoist_literals);
1585 
1586  return row_func;
1587 }
1588 
1589 // Iterate through multifrag_query_func, replacing calls to query_fname with query_func.
1590 void bind_query(llvm::Function* query_func,
1591  const std::string& query_fname,
1592  llvm::Function* multifrag_query_func,
1593  llvm::Module* llvm_module) {
1594  std::vector<llvm::CallInst*> query_stubs;
1595  for (auto it = llvm::inst_begin(multifrag_query_func),
1596  e = llvm::inst_end(multifrag_query_func);
1597  it != e;
1598  ++it) {
1599  if (!llvm::isa<llvm::CallInst>(*it)) {
1600  continue;
1601  }
1602  auto& query_call = llvm::cast<llvm::CallInst>(*it);
1603  auto const call_func_name = CodegenUtil::getCalledFunctionName(query_call);
1604  if (call_func_name && *call_func_name == query_fname) {
1605  query_stubs.push_back(&query_call);
1606  }
1607  }
1608  for (auto& S : query_stubs) {
1609  std::vector<llvm::Value*> args;
1610  for (size_t i = 0; i < S->getNumOperands() - 1; ++i) {
1611  args.push_back(S->getArgOperand(i));
1612  }
1613  llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args, ""));
1614  }
1615 }
1616 
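// get_agg_fnames (below) maps each target expression to the runtime aggregate
// function(s) implementing it. A few illustrative expansions, following the
// cases in the switch:
//   AVG(int_col)             -> {"agg_sum", "agg_count"}
//   AVG(double_col)          -> {"agg_sum_double", "agg_count_double"}
//   COUNT(DISTINCT x)        -> {"agg_count_distinct"}
//   SAMPLE(none-encoded text)-> {"agg_id", "agg_id"}  // varlen targets get a second slot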
1617 std::vector<std::string> get_agg_fnames(const std::vector<Analyzer::Expr*>& target_exprs,
1618  const bool is_group_by) {
1619  std::vector<std::string> result;
1620  for (size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
1621  ++target_idx, ++agg_col_idx) {
1622  const auto target_expr = target_exprs[target_idx];
1623  CHECK(target_expr);
1624  const auto target_type_info = target_expr->get_type_info();
1625  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
1626  const bool is_varlen =
1627  (target_type_info.is_string() &&
1628  target_type_info.get_compression() == kENCODING_NONE) ||
1629  target_type_info.is_array(); // TODO: should it use is_varlen_array() ?
1630  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
1631  result.emplace_back(target_type_info.is_fp() ? "agg_id_double" : "agg_id");
1632  if (is_varlen) {
1633  result.emplace_back("agg_id");
1634  }
1635  if (target_type_info.is_geometry()) {
1636  result.emplace_back("agg_id");
1637  for (auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
1638  result.emplace_back("agg_id");
1639  }
1640  }
1641  continue;
1642  }
1643  const auto agg_type = agg_expr->get_aggtype();
1644  SQLTypeInfo agg_type_info;
1645  switch (agg_type) {
1646  case kCOUNT:
1647  case kCOUNT_IF:
1648  agg_type_info = target_type_info;
1649  break;
1650  default:
1651  agg_type_info = agg_expr->get_arg()->get_type_info();
1652  break;
1653  }
1654  switch (agg_type) {
1655  case kAVG: {
1656  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1657  !agg_type_info.is_fp()) {
1658  throw std::runtime_error("AVG is only valid on integer and floating point");
1659  }
1660  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1661  ? "agg_sum"
1662  : "agg_sum_double");
1663  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1664  ? "agg_count"
1665  : "agg_count_double");
1666  break;
1667  }
1668  case kMIN: {
1669  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1670  agg_type_info.is_geometry()) {
1671  throw std::runtime_error(
1672  "MIN on strings, arrays or geospatial types not supported yet");
1673  }
1674  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1675  ? "agg_min"
1676  : "agg_min_double");
1677  break;
1678  }
1679  case kMAX: {
1680  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1681  agg_type_info.is_geometry()) {
1682  throw std::runtime_error(
1683  "MAX on strings, arrays or geospatial types not supported yet");
1684  }
1685  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1686  ? "agg_max"
1687  : "agg_max_double");
1688  break;
1689  }
1690  case kSUM:
1691  case kSUM_IF: {
1692  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1693  !agg_type_info.is_fp()) {
1694  throw std::runtime_error(
1695  "SUM and SUM_IF is only valid on integer and floating point");
1696  }
1697  std::string func_name = (agg_type_info.is_integer() || agg_type_info.is_time())
1698  ? "agg_sum"
1699  : "agg_sum_double";
1700  if (agg_type == kSUM_IF) {
1701  func_name += "_if";
1702  }
1703  result.emplace_back(func_name);
1704  break;
1705  }
1706  case kCOUNT:
1707  result.emplace_back(agg_expr->get_is_distinct() ? "agg_count_distinct"
1708  : "agg_count");
1709  break;
1710  case kCOUNT_IF:
1711  result.emplace_back("agg_count_if");
1712  break;
1713  case kSINGLE_VALUE: {
1714  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1715  break;
1716  }
1717  case kSAMPLE: {
1718  // Note that varlen SAMPLE arguments are handled separately above
1719  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1720  break;
1721  }
1722  case kAPPROX_COUNT_DISTINCT:
1723  result.emplace_back("agg_approximate_count_distinct");
1724  break;
1725  case kAPPROX_QUANTILE:
1726  result.emplace_back("agg_approx_quantile");
1727  break;
1728  case kMODE:
1729  result.emplace_back("agg_mode_func");
1730  break;
1731  default:
1732  UNREACHABLE() << "Unsupported agg_type: " << agg_type;
1733  }
1734  }
1735  return result;
1736 }
1737 
1738 } // namespace
1739 
1740 void Executor::addUdfIrToModule(const std::string& udf_ir_filename,
1741  const bool is_cuda_ir) {
1742  Executor::extension_module_sources[is_cuda_ir
1743  ? Executor::ExtModuleKinds::udf_gpu_module
1744  : Executor::ExtModuleKinds::udf_cpu_module] =
1745  udf_ir_filename;
1746 }
1747 
1748 std::unordered_set<llvm::Function*> CodeGenerator::markDeadRuntimeFuncs(
1749  llvm::Module& llvm_module,
1750  const std::vector<llvm::Function*>& roots,
1751  const std::vector<llvm::Function*>& leaves) {
1752  auto timer = DEBUG_TIMER(__func__);
1753  std::unordered_set<llvm::Function*> live_funcs;
1754  live_funcs.insert(roots.begin(), roots.end());
1755  live_funcs.insert(leaves.begin(), leaves.end());
1756 
1757  if (auto F = llvm_module.getFunction("init_shared_mem_nop")) {
1758  live_funcs.insert(F);
1759  }
1760  if (auto F = llvm_module.getFunction("write_back_nop")) {
1761  live_funcs.insert(F);
1762  }
1764  for (const llvm::Function* F : roots) {
1765  for (const llvm::BasicBlock& BB : *F) {
1766  for (const llvm::Instruction& I : BB) {
1767  if (const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
1768  live_funcs.insert(CI->getCalledFunction());
1769  }
1770  }
1771  }
1772  }
1773 
1774  for (llvm::Function& F : llvm_module) {
1775  if (!live_funcs.count(&F) && !F.isDeclaration()) {
1776  F.setLinkage(llvm::GlobalValue::InternalLinkage);
1777  }
1778  }
1779 
1780  return live_funcs;
1781 }
1782 
1783 namespace {
1784 // searches for a particular variable within a specific basic block (or all if bb_name is
1785 // empty)
1786 template <typename InstType>
1787 llvm::Value* find_variable_in_basic_block(llvm::Function* func,
1788  std::string bb_name,
1789  std::string variable_name) {
1790  llvm::Value* result = nullptr;
1791  if (func == nullptr || variable_name.empty()) {
1792  return result;
1793  }
1794  bool is_found = false;
1795  for (auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
1796  if (!bb_name.empty() && bb_it->getName() != bb_name) {
1797  continue;
1798  }
1799  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
1800  if (llvm::isa<InstType>(*inst_it)) {
1801  if (inst_it->getName() == variable_name) {
1802  result = &*inst_it;
1803  is_found = true;
1804  break;
1805  }
1806  }
1807  }
1808  }
1809  return result;
1810 }
1811 }; // namespace
1812 
1813 void Executor::createErrorCheckControlFlow(
1814  llvm::Function* query_func,
1815  bool run_with_dynamic_watchdog,
1816  bool run_with_allowing_runtime_interrupt,
1817  const std::vector<JoinLoop>& join_loops,
1818  ExecutorDeviceType device_type,
1819  const std::vector<InputTableInfo>& input_table_infos) {
1820  AUTOMATIC_IR_METADATA(cgen_state_.get());
1821 
1822  // check whether the row processing was successful; currently, it can
1823  // fail by running out of group by buffer slots
1824 
1825  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
1826  // when both the dynamic watchdog and the runtime interrupt are enabled,
1827  // we use the dynamic watchdog
1828  run_with_allowing_runtime_interrupt = false;
1829  }
1830 
1831  {
1832  // disable injecting query interrupt checker if the session info is invalid
1833  heavyai::shared_lock<heavyai::shared_mutex> session_read_lock(
1834  executor_session_mutex_);
1835  if (current_query_session_.empty()) {
1836  run_with_allowing_runtime_interrupt = false;
1837  }
1838  }
1839 
1840  llvm::Value* row_count = nullptr;
1841  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1842  device_type == ExecutorDeviceType::GPU) {
1843  row_count =
1844  find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
1845  }
1846 
1847  bool done_splitting = false;
1848  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1849  ++bb_it) {
1850  llvm::Value* pos = nullptr;
1851  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1852  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1853  llvm::isa<llvm::PHINode>(*inst_it)) {
1854  if (inst_it->getName() == "pos") {
1855  pos = &*inst_it;
1856  }
1857  continue;
1858  }
1859  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1860  continue;
1861  }
1862  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
1863  auto const row_func_name = CodegenUtil::getCalledFunctionName(row_func_call);
1864  if (row_func_name && *row_func_name == "row_process") {
1865  auto next_inst_it = inst_it;
1866  ++next_inst_it;
1867  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1868  auto& br_instr = bb_it->back();
1869  llvm::IRBuilder<> ir_builder(&br_instr);
1870  llvm::Value* err_lv = &*inst_it;
1871  llvm::Value* err_lv_returned_from_row_func = nullptr;
1872  if (run_with_dynamic_watchdog) {
1873  CHECK(pos);
1874  llvm::Value* call_watchdog_lv = nullptr;
1875  if (device_type == ExecutorDeviceType::GPU) {
1876  // In order to make sure all threads within a block see the same barrier,
1877  // only blocks in which none of the threads have crossed the critical edge
1878  // go through the dynamic watchdog computation
1879  CHECK(row_count);
1880  auto crit_edge_rem =
1881  (blockSize() & (blockSize() - 1))
1882  ? ir_builder.CreateSRem(
1883  row_count,
1884  cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1885  : ir_builder.CreateAnd(
1886  row_count,
1887  cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1888  auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1889  crit_edge_threshold->setName("crit_edge_threshold");
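// Illustration (assumed values): with a power-of-two blockSize() of 256 and
// row_count = 1000, crit_edge_rem = 1000 & 255 = 232 and crit_edge_threshold
// = 768, i.e. the largest multiple of the block size not exceeding row_count;
// only threads with pos < 768 take the watchdog branch, so all threads of a
// participating block branch together.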
1890 
1891  // only those threads where pos < crit_edge_threshold go through dynamic
1892  // watchdog call
1893  call_watchdog_lv =
1894  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
1895  } else {
1896  // CPU path: run watchdog for every 64th row
1897  auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1898  call_watchdog_lv = ir_builder.CreateICmp(
1899  llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1900  }
1901  CHECK(call_watchdog_lv);
1902  auto error_check_bb = bb_it->splitBasicBlock(
1903  llvm::BasicBlock::iterator(br_instr), ".error_check");
1904  auto& watchdog_br_instr = bb_it->back();
1905 
1906  auto watchdog_check_bb = llvm::BasicBlock::Create(
1907  cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
1908  llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1909  auto detected_timeout = watchdog_ir_builder.CreateCall(
1910  cgen_state_->module_->getFunction("dynamic_watchdog"), {});
1911  auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1912  detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
1913  watchdog_ir_builder.CreateBr(error_check_bb);
1914 
1915  llvm::ReplaceInstWithInst(
1916  &watchdog_br_instr,
1917  llvm::BranchInst::Create(
1918  watchdog_check_bb, error_check_bb, call_watchdog_lv));
1919  ir_builder.SetInsertPoint(&br_instr);
1920  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1921 
1922  unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1923  unified_err_lv->addIncoming(err_lv, &*bb_it);
1924  err_lv = unified_err_lv;
1925  } else if (run_with_allowing_runtime_interrupt) {
1926  CHECK(pos);
1927  llvm::Value* call_check_interrupt_lv{nullptr};
1928  llvm::Value* interrupt_err_lv{nullptr};
1929  llvm::BasicBlock* error_check_bb{nullptr};
1930  llvm::BasicBlock* interrupt_check_bb{nullptr};
1931  llvm::Instruction* check_interrupt_br_instr{nullptr};
1932 
1933  auto has_loop_join = std::any_of(
1934  join_loops.begin(), join_loops.end(), [](const JoinLoop& join_loop) {
1935  return join_loop.isNestedLoopJoin();
1936  });
1937  auto codegen_interrupt_checker = [&]() {
1938  error_check_bb = bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr),
1939  ".error_check");
1940  check_interrupt_br_instr = &bb_it->back();
1941 
1942  interrupt_check_bb = llvm::BasicBlock::Create(
1943  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
1944  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
1945  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
1946  cgen_state_->module_->getFunction("check_interrupt"), {});
1947  interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
1948  detected_interrupt,
1949  cgen_state_->llInt(Executor::ERR_INTERRUPTED),
1950  err_lv);
1951  interrupt_checker_ir_builder.CreateBr(error_check_bb);
1952  };
1953  if (has_loop_join) {
1954  codegen_interrupt_checker();
1955  CHECK(interrupt_check_bb);
1956  CHECK(check_interrupt_br_instr);
1957  llvm::ReplaceInstWithInst(check_interrupt_br_instr,
1958  llvm::BranchInst::Create(interrupt_check_bb));
1959  ir_builder.SetInsertPoint(&br_instr);
1960  err_lv = interrupt_err_lv;
1961  } else {
1962  if (device_type == ExecutorDeviceType::GPU) {
1963  // approximate how many times the %pos variable is increased,
1964  // i.e., the number of iterations
1965  // here we calculate the # bit shift from the grid/block/fragment sizes:
1966  // with a fixed stride (i.e., every 64th increment) some CUDA threads
1967  // could never enter the interrupt checking block depending on the
1968  // fragment size --> a thread may process fewer than 64 rows if the outer
1969  // table is not sufficiently large, and so could never be interrupted
1970  int32_t num_shift_by_gridDim = shared::getExpOfTwo(gridSize());
1971  int32_t num_shift_by_blockDim = shared::getExpOfTwo(blockSize());
1972  int64_t total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
1973  uint64_t interrupt_checking_freq = 32;
1974  auto freq_control_knob = g_running_query_interrupt_freq;
1975  CHECK_GT(freq_control_knob, 0);
1976  CHECK_LE(freq_control_knob, 1.0);
1977  if (!input_table_infos.empty()) {
1978  const auto& outer_table_info = *input_table_infos.begin();
1979  auto num_outer_table_tuples =
1980  outer_table_info.info.getFragmentNumTuplesUpperBound();
1981  if (num_outer_table_tuples > 0) {
1982  // gridSize * blockSize --> pos_step (idx of the next row per thread)
1983  // we additionally multiply pos_step by two since the number of
1984  // dispatched blocks is double the gridSize
1985  // # tuples (of fragment) / pos_step --> maximum # increments (K)
1986  // we also scale K by (1 - freq_control_knob) to control the frequency:
1987  // to check the interrupt status more frequently, make K
1988  // smaller
1989  auto max_inc = uint64_t(
1990  floor(num_outer_table_tuples / (gridSize() * blockSize() * 2)));
1991  if (max_inc < 2) {
1992  // too small `max_inc`, so this correction is necessary to make
1993  // `interrupt_checking_freq` be valid (i.e., larger than zero)
1994  max_inc = 2;
1995  }
1996  auto calibrated_inc =
1997  uint64_t(floor(max_inc * (1 - freq_control_knob)));
1998  interrupt_checking_freq =
1999  uint64_t(pow(2, shared::getExpOfTwo(calibrated_inc)));
2000  // cover the case where interrupt_checking_freq > K:
2001  // some threads could then never branch to the interrupt checker,
2002  // so manually use a frequency smaller than, but close to, max_inc
2003  if (interrupt_checking_freq > max_inc) {
2004  interrupt_checking_freq = max_inc / 2;
2005  }
2006  if (interrupt_checking_freq < 8) {
2007  // such a small freq would check the interrupt status too frequently,
2008  // so clamp it to a reasonable minimum value
2009  interrupt_checking_freq = 8;
2010  }
2011  }
2012  }
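// Worked example (illustrative values; assumes shared::getExpOfTwo(x) is
// floor(log2(x))): with gridSize() = 64, blockSize() = 256, 10,000,000 tuples
// in the outer fragment and freq_control_knob = 0.1:
//   pos_step       = 64 * 256 * 2           = 32768
//   max_inc        = floor(1e7 / 32768)     = 305
//   calibrated_inc = floor(305 * (1 - 0.1)) = 274
//   checking freq  = 2^getExpOfTwo(274)     = 256
// which lies within [8, max_inc] and is therefore used as-is.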
2013  VLOG(1) << "Set the running query interrupt checking frequency: "
2014  << interrupt_checking_freq;
2015  // check the interrupt flag for every interrupt_checking_freq-th iteration
2016  llvm::Value* pos_shifted_per_iteration =
2017  ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
2018  auto interrupt_predicate = ir_builder.CreateAnd(pos_shifted_per_iteration,
2019  interrupt_checking_freq);
2020  call_check_interrupt_lv =
2021  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
2022  interrupt_predicate,
2023  cgen_state_->llInt(int64_t(0LL)));
2024  } else {
2025  // CPU path: run interrupt checker for every 64th row
2026  auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
2027  call_check_interrupt_lv =
2028  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
2029  interrupt_predicate,
2030  cgen_state_->llInt(int64_t(0LL)));
2031  }
2032  codegen_interrupt_checker();
2033  CHECK(call_check_interrupt_lv);
2034  CHECK(interrupt_err_lv);
2035  CHECK(interrupt_check_bb);
2036  CHECK(error_check_bb);
2037  CHECK(check_interrupt_br_instr);
2038  llvm::ReplaceInstWithInst(
2039  check_interrupt_br_instr,
2040  llvm::BranchInst::Create(
2041  interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
2042  ir_builder.SetInsertPoint(&br_instr);
2043  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
2044 
2045  unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
2046  unified_err_lv->addIncoming(err_lv, &*bb_it);
2047  err_lv = unified_err_lv;
2048  }
2049  }
2050  if (!err_lv_returned_from_row_func) {
2051  err_lv_returned_from_row_func = err_lv;
2052  }
2053  if (device_type == ExecutorDeviceType::GPU && g_enable_dynamic_watchdog) {
2054  // let kernel execution finish as expected, regardless of the observed error,
2055  // unless it is from the dynamic watchdog where all threads within that block
2056  // return together.
2057  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
2058  err_lv,
2059  cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
2060  } else {
2061  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
2062  err_lv,
2063  cgen_state_->llInt(static_cast<int32_t>(0)));
2064  }
2065  auto error_bb = llvm::BasicBlock::Create(
2066  cgen_state_->context_, ".error_exit", query_func, new_bb);
2067  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
2068  llvm::CallInst::Create(
2069  cgen_state_->module_->getFunction("record_error_code"),
2070  std::vector<llvm::Value*>{err_lv_returned_from_row_func, error_code_arg},
2071  "",
2072  error_bb);
2073  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
2074  llvm::ReplaceInstWithInst(&br_instr,
2075  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
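// At this point the original block has been split: it ends with a conditional
// branch on err_lv either to .error_exit, which records the row function's
// error code and returns, or to new_bb, which continues with the instructions
// that followed the row_process call.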
2076  done_splitting = true;
2077  break;
2078  }
2079  }
2080  }
2081  CHECK(done_splitting);
2082 }
2083 
2084 void Executor::AutoTrackBuffersInRuntimeIR() {
2085  llvm::Module* M = cgen_state_->module_;
2086  if (M->getFunction("allocate_varlen_buffer") == nullptr) {
2087  return;
2088  }
2089 
2090  // read metadata
2091  bool should_track = false;
2092  auto* flag = M->getModuleFlag("manage_memory_buffer");
2093  if (auto* cnt = llvm::mdconst::extract_or_null<llvm::ConstantInt>(flag)) {
2094  if (cnt->getZExtValue() == 1) {
2095  should_track = true;
2096  }
2097  }
2098 
2099  if (!should_track) {
2100  // metadata is not present
2101  return;
2102  }
2103 
2104  LOG(INFO) << "Found 'manage_memory_buffer' metadata.";
2105  llvm::SmallVector<llvm::CallInst*, 4> calls_to_analyze;
2106 
2107  for (llvm::Function& F : *M) {
2108  for (llvm::BasicBlock& BB : F) {
2109  for (llvm::Instruction& I : BB) {
2110  if (llvm::CallInst* CI = llvm::dyn_cast<llvm::CallInst>(&I)) {
2111  // Keep track of calls to "allocate_varlen_buffer" for later processing
2112  auto const called_func_name = CodegenUtil::getCalledFunctionName(*CI);
2113  if (called_func_name && *called_func_name == "allocate_varlen_buffer") {
2114  calls_to_analyze.push_back(CI);
2115  }
2116  }
2117  }
2118  }
2119  }
2120 
2121  // for each call to "allocate_varlen_buffer", check if there's a corresponding
2122  // call to "register_buffer_with_executor_rsm". If not, add a call to it
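// Illustration: after this pass, a bare allocation such as
//   %buf = call i8* @allocate_varlen_buffer(...)
// is followed by
//   call void @register_buffer_with_executor_rsm(i64 <this executor>, i8* %buf)
// unless a registration call already appears among the allocation's users.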
2123  llvm::IRBuilder<> Builder(cgen_state_->context_);
2124  auto i64 = get_int_type(64, cgen_state_->context_);
2125  auto i8p = get_int_ptr_type(8, cgen_state_->context_);
2126  auto void_ = llvm::Type::getVoidTy(cgen_state_->context_);
2127  llvm::FunctionType* fnty = llvm::FunctionType::get(void_, {i64, i8p}, false);
2128  llvm::FunctionCallee register_buffer_fn =
2129  M->getOrInsertFunction("register_buffer_with_executor_rsm", fnty, {});
2130 
2131  int64_t executor_addr = reinterpret_cast<int64_t>(this);
2132  for (llvm::CallInst* CI : calls_to_analyze) {
2133  bool found = false;
2134  // for each user of the allocation, check whether it is a CallInst
2135  // calling "register_buffer_with_executor_rsm";
2136  // if no such instruction exists, add one registering the buffer
2137  for (llvm::User* U : CI->users()) {
2138  if (llvm::CallInst* call = llvm::dyn_cast<llvm::CallInst>(U)) {
2139  auto const func_name = CodegenUtil::getCalledFunctionName(*call);
2140  if (func_name && *func_name == "register_buffer_with_executor_rsm") {
2141  found = true;
2142  break;
2143  }
2144  }
2145  }
2146  if (!found) {
2147  Builder.SetInsertPoint(CI->getNextNode());
2148  Builder.CreateCall(register_buffer_fn,
2149  {ll_int(executor_addr, cgen_state_->context_), CI});
2150  }
2151  }
2152 }
2153 
2154 std::vector<llvm::Value*> Executor::inlineHoistedLiterals() {
2155  AUTOMATIC_IR_METADATA(cgen_state_.get());
2156 
2157  std::vector<llvm::Value*> hoisted_literals;
2158 
2159  // row_func_ is using literals whose defs have been hoisted up to the query_func_,
2160  // extend row_func_ signature to include extra args to pass these literal values.
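// Illustrative sketch (argument names invented for the example): if row_func_
// originally is
//   i32 @row_func(i64* %out, ..., i64* %join_hash_tables, i8* %row_func_mgr)
// and the query function hoisted two literal loads of types i64 and i32, the
// rewritten function becomes
//   i32 @row_func_hoisted_literals(i64* %out, ..., i8* %row_func_mgr,
//                                  i64 %arg_lit_a, i32 %arg_lit_b)
// and the __placeholder__literal_* instructions in the body are replaced by
// the new trailing arguments.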
2161  std::vector<llvm::Type*> row_process_arg_types;
2162 
2163  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
2164  E = cgen_state_->row_func_->arg_end();
2165  I != E;
2166  ++I) {
2167  row_process_arg_types.push_back(I->getType());
2168  }
2169 
2170  for (auto& element : cgen_state_->query_func_literal_loads_) {
2171  for (auto value : element.second) {
2172  row_process_arg_types.push_back(value->getType());
2173  }
2174  }
2175 
2176  auto ft = llvm::FunctionType::get(
2177  get_int_type(32, cgen_state_->context_), row_process_arg_types, false);
2178  auto row_func_with_hoisted_literals =
2179  llvm::Function::Create(ft,
2180  llvm::Function::ExternalLinkage,
2181  "row_func_hoisted_literals",
2182  cgen_state_->row_func_->getParent());
2183 
2184  auto row_func_arg_it = row_func_with_hoisted_literals->arg_begin();
2185  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
2186  E = cgen_state_->row_func_->arg_end();
2187  I != E;
2188  ++I) {
2189  if (I->hasName()) {
2190  row_func_arg_it->setName(I->getName());
2191  }
2192  ++row_func_arg_it;
2193  }
2194 
2195  decltype(row_func_with_hoisted_literals) filter_func_with_hoisted_literals{nullptr};
2196  decltype(row_func_arg_it) filter_func_arg_it{nullptr};
2197  if (cgen_state_->filter_func_) {
2198  // filter_func_ is using literals whose defs have been hoisted up to the row_func_,
2199  // extend filter_func_ signature to include extra args to pass these literal values.
2200  std::vector<llvm::Type*> filter_func_arg_types;
2201 
2202  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2203  E = cgen_state_->filter_func_->arg_end();
2204  I != E;
2205  ++I) {
2206  filter_func_arg_types.push_back(I->getType());
2207  }
2208 
2209  for (auto& element : cgen_state_->query_func_literal_loads_) {
2210  for (auto value : element.second) {
2211  filter_func_arg_types.push_back(value->getType());
2212  }
2213  }
2214 
2215  auto ft2 = llvm::FunctionType::get(
2216  get_int_type(32, cgen_state_->context_), filter_func_arg_types, false);
2217  filter_func_with_hoisted_literals =
2218  llvm::Function::Create(ft2,
2219  llvm::Function::ExternalLinkage,
2220  "filter_func_hoisted_literals",
2221  cgen_state_->filter_func_->getParent());
2222 
2223  filter_func_arg_it = filter_func_with_hoisted_literals->arg_begin();
2224  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2225  E = cgen_state_->filter_func_->arg_end();
2226  I != E;
2227  ++I) {
2228  if (I->hasName()) {
2229  filter_func_arg_it->setName(I->getName());
2230  }
2231  ++filter_func_arg_it;
2232  }
2233  }
2234 
2235  std::unordered_map<int, std::vector<llvm::Value*>>
2236  query_func_literal_loads_function_arguments,
2237  query_func_literal_loads_function_arguments2;
2238 
2239  for (auto& element : cgen_state_->query_func_literal_loads_) {
2240  std::vector<llvm::Value*> argument_values, argument_values2;
2241 
2242  for (auto value : element.second) {
2243  hoisted_literals.push_back(value);
2244  argument_values.push_back(&*row_func_arg_it);
2245  if (cgen_state_->filter_func_) {
2246  argument_values2.push_back(&*filter_func_arg_it);
2247  cgen_state_->filter_func_args_[&*row_func_arg_it] = &*filter_func_arg_it;
2248  }
2249  if (value->hasName()) {
2250  row_func_arg_it->setName("arg_" + value->getName());
2251  if (cgen_state_->filter_func_) {
2252  filter_func_arg_it->getContext();
2253  filter_func_arg_it->setName("arg_" + value->getName());
2254  }
2255  }
2256  ++row_func_arg_it;
2257  ++filter_func_arg_it;
2258  }
2259 
2260  query_func_literal_loads_function_arguments[element.first] = argument_values;
2261  query_func_literal_loads_function_arguments2[element.first] = argument_values2;
2262  }
2263 
2264  // copy the row_func function body over
2265  // see
2266  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
2267  row_func_with_hoisted_literals->getBasicBlockList().splice(
2268  row_func_with_hoisted_literals->begin(),
2269  cgen_state_->row_func_->getBasicBlockList());
2270 
2271  // also replace row_func arguments with the arguments from row_func_hoisted_literals
2272  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
2273  E = cgen_state_->row_func_->arg_end(),
2274  I2 = row_func_with_hoisted_literals->arg_begin();
2275  I != E;
2276  ++I) {
2277  I->replaceAllUsesWith(&*I2);
2278  I2->takeName(&*I);
2279  cgen_state_->filter_func_args_.replace(&*I, &*I2);
2280  ++I2;
2281  }
2282 
2283  cgen_state_->row_func_ = row_func_with_hoisted_literals;
2284 
2285  // and finally replace literal placeholders
2286  std::vector<llvm::Instruction*> placeholders;
2287  std::string prefix("__placeholder__literal_");
2288  for (auto it = llvm::inst_begin(row_func_with_hoisted_literals),
2289  e = llvm::inst_end(row_func_with_hoisted_literals);
2290  it != e;
2291  ++it) {
2292  if (it->hasName() && it->getName().startswith(prefix)) {
2293  auto offset_and_index_entry =
2294  cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
2295  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2296 
2297  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2298  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2299 
2300  it->replaceAllUsesWith(
2301  query_func_literal_loads_function_arguments[lit_off][lit_idx]);
2302  placeholders.push_back(&*it);
2303  }
2304  }
2305  for (auto placeholder : placeholders) {
2306  placeholder->removeFromParent();
2307  }
2308 
2309  if (cgen_state_->filter_func_) {
2310  // copy the filter_func function body over
2311  // see
2312  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
2313  filter_func_with_hoisted_literals->getBasicBlockList().splice(
2314  filter_func_with_hoisted_literals->begin(),
2315  cgen_state_->filter_func_->getBasicBlockList());
2316 
2317  // also replace filter_func arguments with the arguments from
2318  // filter_func_hoisted_literals
2319  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2320  E = cgen_state_->filter_func_->arg_end(),
2321  I2 = filter_func_with_hoisted_literals->arg_begin();
2322  I != E;
2323  ++I) {
2324  I->replaceAllUsesWith(&*I2);
2325  I2->takeName(&*I);
2326  ++I2;
2327  }
2328 
2329  cgen_state_->filter_func_ = filter_func_with_hoisted_literals;
2330 
2331  // and finally replace literal placeholders
2332  std::vector<llvm::Instruction*> placeholders;
2333  std::string prefix("__placeholder__literal_");
2334  for (auto it = llvm::inst_begin(filter_func_with_hoisted_literals),
2335  e = llvm::inst_end(filter_func_with_hoisted_literals);
2336  it != e;
2337  ++it) {
2338  if (it->hasName() && it->getName().startswith(prefix)) {
2339  auto offset_and_index_entry = cgen_state_->row_func_hoisted_literals_.find(
2340  llvm::dyn_cast<llvm::Value>(&*it));
2341  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2342 
2343  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2344  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2345 
2346  it->replaceAllUsesWith(
2347  query_func_literal_loads_function_arguments2[lit_off][lit_idx]);
2348  placeholders.push_back(&*it);
2349  }
2350  }
2351  for (auto placeholder : placeholders) {
2352  placeholder->removeFromParent();
2353  }
2354  }
2355 
2356  return hoisted_literals;
2357 }
2358 
2359 namespace {
2360 
2361 size_t get_shared_memory_size(const bool shared_mem_used,
2362  const QueryMemoryDescriptor* query_mem_desc_ptr) {
2363  return shared_mem_used
2364  ? (query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount())
2365  : 0;
2366 }
2367 
2368 bool has_count_expr(RelAlgExecutionUnit const& ra_exe_unit) {
2369  for (auto const expr : ra_exe_unit.target_exprs) {
2370  if (auto const agg_expr = dynamic_cast<Analyzer::AggExpr*>(expr)) {
2371  if (shared::is_any<SQLAgg::kCOUNT, SQLAgg::kCOUNT_IF>(agg_expr->get_aggtype())) {
2372  return true;
2373  }
2374  }
2375  }
2376  return false;
2377 }
2378 
2379 class CaseExprDetector : public ScalarExprVisitor<bool> {
2380  public:
2381  CaseExprDetector() : detect_case_expr_(false) {}
2382 
2383  bool detectCaseExpr(const Analyzer::Expr* expr) const {
2384  visit(expr);
2385  return detect_case_expr_;
2386  }
2387 
2388  protected:
2389  bool visitCaseExpr(const Analyzer::CaseExpr*) const override {
2390  detect_case_expr_ = true;
2391  return true;
2392  }
2393 
2394  private:
2395  mutable bool detect_case_expr_;
2396 };
2397 
2398 bool has_case_expr_within_groupby_expr(RelAlgExecutionUnit const& ra_exe_unit) {
2399  if (ra_exe_unit.groupby_exprs.empty() || !ra_exe_unit.groupby_exprs.front()) {
2400  return false;
2401  }
2402  CaseExprDetector detector;
2403  for (auto expr : ra_exe_unit.groupby_exprs) {
2404  if (detector.detectCaseExpr(expr.get())) {
2405  return true;
2406  }
2407  }
2408  return false;
2409 }
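// Example of what the detector flags: a query shaped like
//   SELECT COUNT(*) FROM t JOIN s ON ... GROUP BY CASE WHEN x > 0 THEN 1 ELSE 0 END;
// has a CASE expression inside a group-by key, which (combined with the other
// conditions checked in is_gpu_shared_mem_supported below) disables the
// shared-memory path on Pascal devices.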
2410 
2411 bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor* query_mem_desc_ptr,
2412  const RelAlgExecutionUnit& ra_exe_unit,
2413  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
2414  const ExecutorDeviceType device_type,
2415  const unsigned cuda_blocksize,
2416  const unsigned num_blocks_per_mp) {
2417  if (device_type == ExecutorDeviceType::CPU) {
2418  return false;
2419  }
2420  if (query_mem_desc_ptr->didOutputColumnar()) {
2421  return false;
2422  }
2423  CHECK(query_mem_desc_ptr);
2424  CHECK(cuda_mgr);
2425  /*
2426  * We only use shared memory strategy if GPU hardware provides native shared
2427  * memory atomics support. From CUDA Toolkit documentation:
2428  * https://docs.nvidia.com/cuda/pascal-tuning-guide/index.html#atomic-ops "Like
2429  * Maxwell, Pascal [and Volta] provides native shared memory atomic operations
2430  * for 32-bit integer arithmetic, along with native 32 or 64-bit compare-and-swap
2431  * (CAS)."
2432  *
2433  */
2434  if (!cuda_mgr->isArchMaxwellOrLaterForAll()) {
2435  return false;
2436  }
2437  if (cuda_mgr->isArchPascal() && !ra_exe_unit.join_quals.empty() &&
2438  has_count_expr(ra_exe_unit) && has_case_expr_within_groupby_expr(ra_exe_unit)) {
2439  return false;
2440  }
2441 
2442  if (query_mem_desc_ptr->getQueryDescriptionType() ==
2443  QueryDescriptionType::NonGroupedAggregate &&
2444  g_enable_smem_non_grouped_agg &&
2445  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty()) {
2446  // TODO: relax this, if necessary
2447  if (cuda_blocksize < query_mem_desc_ptr->getEntryCount()) {
2448  return false;
2449  }
2450  // skip shared memory usage when dealing with 1) variable length targets,
2451  // 2) aggregates other than COUNT
2452  const auto target_infos =
2453  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
2454  std::unordered_set<SQLAgg> supported_aggs{kCOUNT, kCOUNT_IF};
2455  if (std::find_if(target_infos.begin(),
2456  target_infos.end(),
2457  [&supported_aggs](const TargetInfo& ti) {
2458  if (ti.sql_type.is_varlen() ||
2459  !supported_aggs.count(ti.agg_kind)) {
2460  return true;
2461  } else {
2462  return false;
2463  }
2464  }) == target_infos.end()) {
2465  return true;
2466  }
2467  }
2468  if (query_mem_desc_ptr->getQueryDescriptionType() ==
2479  if (cuda_blocksize < query_mem_desc_ptr->getEntryCount()) {
2480  return false;
2481  }
2482 
2483  // Fundamentally, we should use shared memory whenever the output buffer
2484  // is small enough so that we can fit it in the shared memory and yet expect
2485  // good occupancy.
2486  // For now, we allow keyless, row-wise layout, and only for perfect hash
2487  // group by operations.
2488  if (query_mem_desc_ptr->hasKeylessHash() &&
2489  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty() &&
2490  !query_mem_desc_ptr->useStreamingTopN()) {
2491  const size_t shared_memory_threshold_bytes = std::min(
2492  g_gpu_smem_threshold == 0 ? SIZE_MAX : g_gpu_smem_threshold,
2493  cuda_mgr->getMinSharedMemoryPerBlockForAllDevices() / num_blocks_per_mp);
2494  const auto output_buffer_size =
2495  query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount();
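// Illustration (assumed numbers): if g_gpu_smem_threshold is left at 0 the
// first operand becomes SIZE_MAX, so the hardware value dominates; e.g.
// 49152 bytes of shared memory per block with num_blocks_per_mp = 2 yields a
// threshold of 24576 bytes, and a keyless buffer with getRowSize() = 16 and
// getEntryCount() = 1024 (16384 bytes) still qualifies.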
2496  if (output_buffer_size > shared_memory_threshold_bytes) {
2497  return false;
2498  }
2499 
2500  // skip shared memory usage when dealing with 1) variable length targets, 2)
2501  // non-basic aggregates (COUNT, SUM, MIN, MAX, AVG)
2502  // TODO: relax this if necessary
2503  const auto target_infos =
2504  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
2505  std::unordered_set<SQLAgg> supported_aggs{kCOUNT, kCOUNT_IF};
2506  if (g_enable_smem_grouped_non_count_agg) {
2507  supported_aggs = {kCOUNT, kCOUNT_IF, kMIN, kMAX, kSUM, kSUM_IF, kAVG};
2508  }
2509  if (std::find_if(target_infos.begin(),
2510  target_infos.end(),
2511  [&supported_aggs](const TargetInfo& ti) {
2512  if (ti.sql_type.is_varlen() ||
2513  !supported_aggs.count(ti.agg_kind)) {
2514  return true;
2515  } else {
2516  return false;
2517  }
2518  }) == target_infos.end()) {
2519  return true;
2520  }
2521  }
2522  }
2523  return false;
2524 }
2525 
2526 #ifndef NDEBUG
2527 std::string serialize_llvm_metadata_footnotes(llvm::Function* query_func,
2528  CgenState* cgen_state) {
2529  std::string llvm_ir;
2530  std::unordered_set<llvm::MDNode*> md;
2531 
2532  // Loop over all instructions in the query function.
2533  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
2534  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2535  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2536  instr_it->getAllMetadata(imd);
2537  for (auto [kind, node] : imd) {
2538  md.insert(node);
2539  }
2540  }
2541  }
2542 
2543  // Loop over all instructions in the row function.
2544  for (auto bb_it = cgen_state->row_func_->begin(); bb_it != cgen_state->row_func_->end();
2545  ++bb_it) {
2546  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2547  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2548  instr_it->getAllMetadata(imd);
2549  for (auto [kind, node] : imd) {
2550  md.insert(node);
2551  }
2552  }
2553  }
2554 
2555  // Loop over all instructions in the filter function.
2556  if (cgen_state->filter_func_) {
2557  for (auto bb_it = cgen_state->filter_func_->begin();
2558  bb_it != cgen_state->filter_func_->end();
2559  ++bb_it) {
2560  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2561  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2562  instr_it->getAllMetadata(imd);
2563  for (auto [kind, node] : imd) {
2564  md.insert(node);
2565  }
2566  }
2567  }
2568  }
2569 
2570  // Sort the metadata by canonical number and convert to text.
2571  if (!md.empty()) {
2572  std::map<size_t, std::string> sorted_strings;
2573  for (auto p : md) {
2574  std::string str;
2575  llvm::raw_string_ostream os(str);
2576  p->print(os, cgen_state->module_, true);
2577  os.flush();
2578  auto fields = split(str, {}, 1);
2579  if (fields.empty() || fields[0].empty()) {
2580  continue;
2581  }
2582  sorted_strings.emplace(std::stoul(fields[0].substr(1)), str);
2583  }
2584  llvm_ir += "\n";
2585  for (auto [id, text] : sorted_strings) {
2586  llvm_ir += text;
2587  llvm_ir += "\n";
2588  }
2589  }
2590 
2591  return llvm_ir;
2592 }
2593 #endif // NDEBUG
2594 } // namespace
2595 
2596 std::tuple<CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
2597 Executor::compileWorkUnit(const std::vector<InputTableInfo>& query_infos,
2598  const PlanState::DeletedColumnsMap& deleted_cols_map,
2599  const RelAlgExecutionUnit& ra_exe_unit,
2600  const CompilationOptions& co,
2601  const ExecutionOptions& eo,
2602  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
2603  const bool allow_lazy_fetch,
2604  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
2605  const size_t max_groups_buffer_entry_guess,
2606  const int8_t crt_min_byte_width,
2607  const bool has_cardinality_estimation,
2608  ColumnCacheMap& column_cache,
2609  RenderInfo* render_info) {
2610  auto timer = DEBUG_TIMER(__func__);
2611 
2612  if (co.device_type == ExecutorDeviceType::GPU) {
2613  if (!cuda_mgr) {
2614  throw QueryMustRunOnCpu();
2615  }
2616  }
2617 
2618 #ifndef NDEBUG
2619  static std::uint64_t counter = 0;
2620  ++counter;
2621  VLOG(1) << "CODEGEN #" << counter << ":";
2622  LOG(IR) << "CODEGEN #" << counter << ":";
2623  LOG(PTX) << "CODEGEN #" << counter << ":";
2624  LOG(ASM) << "CODEGEN #" << counter << ":";
2625 #endif
2626 
2627  // cgenstate_manager uses the RAII pattern to manage the lifetime of
2628  // CgenState instances.
2629  Executor::CgenStateManager cgenstate_manager(*this,
2630  allow_lazy_fetch,
2631  query_infos,
2632  deleted_cols_map,
2633  &ra_exe_unit); // locks compilation_mutex
2634  addTransientStringLiterals(ra_exe_unit, row_set_mem_owner);
2635 
2636  GroupByAndAggregate group_by_and_aggregate(
2637  this,
2638  co.device_type,
2639  ra_exe_unit,
2640  query_infos,
2641  row_set_mem_owner,
2642  has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
2643  : std::nullopt);
2644  auto query_mem_desc =
2645  group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
2646  max_groups_buffer_entry_guess,
2647  crt_min_byte_width,
2648  render_info,
2649  eo.output_columnar_hint);
2650 
2651  if (query_mem_desc->getQueryDescriptionType() ==
2652  QueryDescriptionType::GroupByBaselineHash &&
2653  !has_cardinality_estimation && (!render_info || !render_info->isInSitu()) &&
2654  !eo.just_explain) {
2655  const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
2656  throw CardinalityEstimationRequired(col_range_info.max - col_range_info.min);
2657  }
2658 
2659  const bool output_columnar = query_mem_desc->didOutputColumnar();
2660  const bool gpu_shared_mem_optimization =
2661  is_gpu_shared_mem_supported(query_mem_desc.get(),
2662  ra_exe_unit,
2663  cuda_mgr,
2664  co.device_type,
2665  cuda_mgr ? this->blockSize() : 1,
2666  cuda_mgr ? this->numBlocksPerMP() : 1);
2667  if (gpu_shared_mem_optimization) {
2668  // disable interleaved bins optimization on the GPU
2669  query_mem_desc->setHasInterleavedBinsOnGpu(false);
2670  LOG(DEBUG1) << "GPU shared memory is used for the " +
2671  query_mem_desc->queryDescTypeToString() + " query(" +
2672  std::to_string(get_shared_memory_size(gpu_shared_mem_optimization,
2673  query_mem_desc.get())) +
2674  " out of " + std::to_string(g_gpu_smem_threshold) + " bytes).";
2675  }
2676 
2677  const GpuSharedMemoryContext gpu_smem_context(
2678  get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));
2679 
2681  const size_t num_count_distinct_descs =
2682  query_mem_desc->getCountDistinctDescriptorsSize();
2683  for (size_t i = 0; i < num_count_distinct_descs; i++) {
2684  const auto& count_distinct_descriptor =
2685  query_mem_desc->getCountDistinctDescriptor(i);
2686  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::UnorderedSet ||
2687  (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
2688  !co.hoist_literals)) {
2689  throw QueryMustRunOnCpu();
2690  }
2691  }
2692 
2693  // we currently do not support varlen projection based on baseline groupby when
2694  // 1) the target table is multi-fragmented and 2) multiple GPUs are involved in
2695  // query processing; in this case, we punt the query to CPU to avoid a server crash
2696  for (const auto expr : ra_exe_unit.target_exprs) {
2697  if (auto gby_expr = dynamic_cast<Analyzer::AggExpr*>(expr)) {
2698  bool has_multiple_gpus = cuda_mgr ? cuda_mgr->getDeviceCount() > 1 : false;
2699  if (gby_expr->get_aggtype() == SQLAgg::kSAMPLE && has_multiple_gpus &&
2700  !g_leaf_count) {
2701  std::set<const Analyzer::ColumnVar*,
2702  bool (*)(const Analyzer::ColumnVar*, const Analyzer::ColumnVar*)>
2703  colvar_set(Analyzer::ColumnVar::colvar_comp);
2704  gby_expr->collect_column_var(colvar_set, true);
2705  for (const auto cv : colvar_set) {
2706  if (cv->get_type_info().is_varlen()) {
2707  const auto tbl_key = cv->getTableKey();
2708  std::for_each(query_infos.begin(),
2709  query_infos.end(),
2710  [&tbl_key](const InputTableInfo& input_table_info) {
2711  if (input_table_info.table_key == tbl_key &&
2712  input_table_info.info.fragments.size() > 1) {
2713  throw QueryMustRunOnCpu();
2714  }
2715  });
2716  }
2717  }
2718  }
2719  }
2720  }
2721  }
2722 
2723  // Read the module template and target either CPU or GPU
2724  // by binding the stream position functions to the right implementation:
2725  // stride access for GPU, contiguous for CPU
2726  CHECK(cgen_state_->module_ == nullptr);
2727  cgen_state_->set_module_shallow_copy(get_rt_module(), /*always_clone=*/true);
2728 
2729  auto is_gpu = co.device_type == ExecutorDeviceType::GPU;
2730  if (is_gpu) {
2731  cgen_state_->module_->setDataLayout(get_gpu_data_layout());
2732  cgen_state_->module_->setTargetTriple(get_gpu_target_triple_string());
2733  }
2734  if (has_udf_module(/*is_gpu=*/is_gpu)) {
2735  CodeGenerator::link_udf_module(
2736  get_udf_module(/*is_gpu=*/is_gpu), *cgen_state_->module_, cgen_state_.get());
2737  }
2738  if (has_rt_udf_module(/*is_gpu=*/is_gpu)) {
2739  CodeGenerator::link_udf_module(
2740  get_rt_udf_module(/*is_gpu=*/is_gpu), *cgen_state_->module_, cgen_state_.get());
2741  }
2742 
2743  AUTOMATIC_IR_METADATA(cgen_state_.get());
2744 
2745  auto agg_fnames =
2746  get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
2747 
2748  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();
2749 
2750  const bool is_group_by{query_mem_desc->isGroupBy()};
2751  auto [query_func, row_func_call] = is_group_by
2752  ? query_group_by_template(cgen_state_->module_,
2753  co.hoist_literals,
2754  *query_mem_desc,
2755  co.device_type,
2756  ra_exe_unit.scan_limit,
2757  gpu_smem_context)
2758  : query_template(cgen_state_->module_,
2759  agg_slot_count,
2760  co.hoist_literals,
2761  !!ra_exe_unit.estimator,
2762  gpu_smem_context);
2763  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
2764  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
2765  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);
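 // Illustrative sketch (annotation, not part of the original source): the placeholders
 // bound above conceptually turn the row loop into a grid-stride loop on GPU and a
 // contiguous scan on CPU. Assuming hypothetical pos_start/pos_step values, the loop
 // behaves roughly like:
 //
 //   // GPU: pos_start ~ blockIdx.x * blockDim.x + threadIdx.x
 //   //      pos_step  ~ blockDim.x * gridDim.x
 //   // CPU: pos_start ~ 0, pos_step ~ 1
 //   for (int64_t pos = pos_start; pos < row_count; pos += pos_step) {
 //     row_func(/* one input row at position pos */);
 //   }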
2766 
2767  cgen_state_->query_func_ = query_func;
2768  cgen_state_->row_func_call_ = row_func_call;
2769  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
2770  &query_func->getEntryBlock().front());
2771 
2772  // Generate the function signature and column head fetches s.t.
2773  // double indirection isn't needed in the inner loop
2774  auto& fetch_bb = query_func->front();
2775  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
2776  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
2777  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
2778  get_arg_by_name(query_func, "byte_stream"),
2779  fetch_ir_builder,
2780  cgen_state_->context_);
2781  CHECK_EQ(ra_exe_unit.input_col_descs.size(), col_heads.size());
2782 
2783  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
2784  is_group_by ? 0 : agg_slot_count,
2785  co.hoist_literals,
2786  cgen_state_->module_,
2787  cgen_state_->context_);
2788  CHECK(cgen_state_->row_func_);
2789  cgen_state_->row_func_bb_ =
2790  llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
2791 
2792  if (g_enable_filter_function) {
2793  auto filter_func_ft =
2794  llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
2795  cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
2796  llvm::Function::ExternalLinkage,
2797  "filter_func",
2798  cgen_state_->module_);
2799  CHECK(cgen_state_->filter_func_);
2800  cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
2801  cgen_state_->context_, "entry", cgen_state_->filter_func_);
2802  }
2803 
2804  cgen_state_->current_func_ = cgen_state_->row_func_;
2805  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2806 
2807  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
2808  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
2809  const auto join_loops =
2810  buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
2811 
2812  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
2813  for (auto& simple_qual : ra_exe_unit.simple_quals) {
2814  plan_state_->addSimpleQual(simple_qual);
2815  }
2816  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
2817  if (is_not_deleted_bb) {
2818  cgen_state_->row_func_bb_ = is_not_deleted_bb;
2819  }
2820  if (!join_loops.empty()) {
2821  codegenJoinLoops(join_loops,
2822  body_execution_unit,
2823  group_by_and_aggregate,
2824  query_func,
2825  cgen_state_->row_func_bb_,
2826  *(query_mem_desc.get()),
2827  co,
2828  eo);
2829  } else {
2830  const bool can_return_error = compileBody(
2831  ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
2832  if (can_return_error || cgen_state_->needs_error_check_ || eo.with_dynamic_watchdog ||
2833  eo.allow_runtime_query_interrupt) {
2834  createErrorCheckControlFlow(query_func,
2835  eo.with_dynamic_watchdog,
2836  eo.allow_runtime_query_interrupt,
2837  join_loops,
2838  co.device_type,
2839  group_by_and_aggregate.query_infos_);
2840  }
2841  }
2842  std::vector<llvm::Value*> hoisted_literals;
2843 
2844  if (co.hoist_literals) {
2845  VLOG(1) << "number of hoisted literals: "
2846  << cgen_state_->query_func_literal_loads_.size()
2847  << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
2848  << " bytes";
2849  }
2850 
2851  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
2852  // we have some hoisted literals...
2853  hoisted_literals = inlineHoistedLiterals();
2854  }
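 // Illustrative sketch (annotation, not part of the original source): hoisting keeps
 // literal values out of the generated IR so the compiled code can be reused when only
 // the constants change; the literals are instead read from a buffer argument.
 // Roughly, assuming a filter `x = 42`:
 //
 //   // without hoisting:  %cmp = icmp eq i32 %x, 42
 //   // with hoisting:     %lit = load i32, i32* %literal_slot
 //   //                    %cmp = icmp eq i32 %x, %lit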
2855 
2856  // replace the row func placeholder call with the call to the actual row func
2857  std::vector<llvm::Value*> row_func_args;
2858  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumOperands() - 1; ++i) {
2859  row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
2860  }
2861  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
2862  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
2863  row_func_args.push_back(get_arg_by_name(query_func, "row_func_mgr"));
2864  // push hoisted literals arguments, if any
2865  row_func_args.insert(
2866  row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
2867  llvm::ReplaceInstWithInst(
2868  cgen_state_->row_func_call_,
2869  llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));
2870 
2871  // replace the filter func placeholder call with the call to the actual filter func
2872  if (cgen_state_->filter_func_) {
2873  std::vector<llvm::Value*> filter_func_args;
2874  for (auto arg_it = cgen_state_->filter_func_args_.begin();
2875  arg_it != cgen_state_->filter_func_args_.end();
2876  ++arg_it) {
2877  filter_func_args.push_back(arg_it->first);
2878  }
2879  llvm::ReplaceInstWithInst(
2880  cgen_state_->filter_func_call_,
2881  llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
2882  }
2883 
2884  // Aggregate
2885  plan_state_->init_agg_vals_ =
2886  init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);
2887 
2888  /*
2889  * If we have decided to use GPU shared memory (that decision is not made here), we
2890  * generate the extra components it needs: buffer initialization and a GPU reduction
2891  * from shared memory to global memory. We then inject these functions into the
2892  * already generated query_func, replacing the two placeholders write_back_nop and
2893  * init_smem_nop. The rest of the code remains as before (row_func, etc.).
2894  */
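 // Illustrative sketch (annotation, not part of the original source): the query
 // template carries no-op placeholder calls that injectFunctionsInto() below swaps for
 // the generated shared-memory routines; the replacement names here are hypothetical:
 //
 //   call void @init_smem_nop(...)   -->  call void @<generated init function>(...)
 //   call void @write_back_nop(...)  -->  call void @<generated reduction function>(...)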
2895  if (gpu_smem_context.isSharedMemoryUsed()) {
2896  if (query_mem_desc->getQueryDescriptionType() ==
2897  QueryDescriptionType::GroupByPerfectHash) {
2898  GpuSharedMemCodeBuilder gpu_smem_code(
2899  cgen_state_->module_,
2900  cgen_state_->context_,
2901  *query_mem_desc,
2902  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc),
2903  plan_state_->init_agg_vals_,
2904  executor_id_);
2905  gpu_smem_code.codegen();
2906  gpu_smem_code.injectFunctionsInto(query_func);
2907 
2908  // helper functions are used for caching purposes later
2909  cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
2910  cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
2911  LOG(IR) << gpu_smem_code.toString();
2912  }
2913  }
2914 
2915  auto multifrag_query_func = cgen_state_->module_->getFunction(
2916  "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
2917  CHECK(multifrag_query_func);
2918 
2919  if (eo.allow_runtime_query_interrupt) {
2920  insertErrorCodeChecker(multifrag_query_func,
2921  get_index_by_name(query_func, "error_code"),
2922  co.hoist_literals,
2923  eo.allow_runtime_query_interrupt);
2924  }
2925 
2926  bind_query(query_func,
2927  "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
2928  multifrag_query_func,
2929  cgen_state_->module_);
2930 
2931  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
2932  if (cgen_state_->filter_func_) {
2933  root_funcs.push_back(cgen_state_->filter_func_);
2934  }
2935  auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
2936  *cgen_state_->module_, root_funcs, {multifrag_query_func});
2937 
2938  // Always inline the row function and the filter function.
2939  // We don't want register spills in the inner loops.
2940  // LLVM seems to correctly free up alloca instructions
2941  // in these functions even when they are inlined.
2942  mark_function_always_inline(cgen_state_->row_func_);
2943  if (cgen_state_->filter_func_) {
2944  mark_function_always_inline(cgen_state_->filter_func_);
2945  }
2946 
2947 #ifndef NDEBUG
2948  // Add helpful metadata to the LLVM IR for debugging.
2949  AUTOMATIC_IR_METADATA_DONE();
2950 #endif
2951 
2952  auto const device_str = co.device_type == ExecutorDeviceType::CPU ? "CPU:\n" : "GPU:\n";
2953  // Serialize the important LLVM IR functions to text for SQL EXPLAIN.
2954  std::string llvm_ir =
2955  serialize_llvm_object(multifrag_query_func) + serialize_llvm_object(query_func) +
2956  serialize_llvm_object(cgen_state_->row_func_) +
2957  (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_) : "");
2958  VLOG(3) << "Unoptimized IR for the " << device_str << "\n" << llvm_ir << "\nEnd of IR";
2959  if (co.explain_type == ExecutorExplainType::Optimized) {
2960 #ifdef WITH_JIT_DEBUG
2961  throw std::runtime_error(
2962  "Explain optimized not available when JIT runtime debug symbols are enabled");
2963 #else
2964  // Note that we don't run the NVVM reflect pass here. Use LOG(IR) to get the
2965  // optimized IR after NVVM reflect
2966  llvm::legacy::PassManager pass_manager;
2967  optimize_ir(query_func,
2968  cgen_state_->module_,
2969  pass_manager,
2970  live_funcs,
2971  gpu_smem_context.isSharedMemoryUsed(),
2972  co);
2973 #endif // WITH_JIT_DEBUG
2974  llvm_ir =
2975  serialize_llvm_object(multifrag_query_func) + serialize_llvm_object(query_func) +
2976  serialize_llvm_object(cgen_state_->row_func_) +
2977  (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2978  : "");
2979 #ifndef NDEBUG
2980  llvm_ir += serialize_llvm_metadata_footnotes(query_func, cgen_state_.get());
2981 #endif
2982  }
2983  LOG(IR) << "\n\n" << query_mem_desc->toString() << "\n";
2984  LOG(IR) << "IR for the " << device_str;
2985 #ifdef NDEBUG
2986  LOG(IR) << serialize_llvm_object(query_func)
2987  << serialize_llvm_object(cgen_state_->row_func_)
2988  << (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2989  : "")
2990  << "\nEnd of IR";
2991 #else
2992  LOG(IR) << serialize_llvm_object(cgen_state_->module_) << "\nEnd of IR";
2993 #endif
2994  // Insert calls to "register_buffer_with_executor_rsm" for allocations made in
2995  // runtime functions (i.e. from RBC) that do not already have them.
2996  AutoTrackBuffersInRuntimeIR();
2997 
2998  // Run some basic validation checks on the LLVM IR before code is generated below.
2999  verify_function_ir(cgen_state_->row_func_);
3000  if (cgen_state_->filter_func_) {
3001  verify_function_ir(cgen_state_->filter_func_);
3002  }
3003 
3004  // Generate final native code from the LLVM IR.
3005  return std::make_tuple(
3006  CompilationResult{
3007  co.device_type == ExecutorDeviceType::CPU
3008  ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
3009  : optimizeAndCodegenGPU(query_func,
3010  multifrag_query_func,
3011  live_funcs,
3012  is_group_by || ra_exe_unit.estimator,
3013  cuda_mgr,
3014  gpu_smem_context.isSharedMemoryUsed(),
3015  co),
3016  cgen_state_->getLiterals(),
3017  output_columnar,
3018  llvm_ir,
3019  std::move(gpu_smem_context)},
3020  std::move(query_mem_desc));
3021 }
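 // Orientation note (annotation, not part of the original source): the tuple returned
 // above packages the CompilationResult (generated native code, hoisted literals, the
 // output-columnar flag, the serialized IR used for EXPLAIN, and the GPU shared-memory
 // context) together with the finalized QueryMemoryDescriptor.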
3022 
3023 void Executor::insertErrorCodeChecker(llvm::Function* query_func,
3024  unsigned const error_code_idx,
3025  bool hoist_literals,
3026  bool allow_runtime_query_interrupt) {
3027  auto query_stub_func_name =
3028  "query_stub" + std::string(hoist_literals ? "_hoisted_literals" : "");
3029  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
3030  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
3031  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
3032  continue;
3033  }
3034  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
3035  auto const row_func_name = CodegenUtil::getCalledFunctionName(row_func_call);
3036  if (row_func_name && *row_func_name == query_stub_func_name) {
3037  auto next_inst_it = inst_it;
3038  ++next_inst_it;
3039  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
3040  auto& br_instr = bb_it->back();
3041  llvm::IRBuilder<> ir_builder(&br_instr);
3042  llvm::Value* err_lv = &*inst_it;
3043  auto error_check_bb =
3044  bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr), ".error_check");
3045  // query_func does not have parameter names assigned.
3046  llvm::Value* const error_code_arg = get_arg_by_index(query_func, error_code_idx);
3047  CHECK(error_code_arg) << error_code_idx << '/' << query_func->arg_size();
3048  llvm::Value* err_code = nullptr;
3049  if (allow_runtime_query_interrupt) {
3050  // decide the final error code, taking the interrupt status into account
3051  auto& check_interrupt_br_instr = bb_it->back();
3052  auto interrupt_check_bb = llvm::BasicBlock::Create(
3053  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
3054  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
3055  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
3056  cgen_state_->module_->getFunction("check_interrupt"), {});
3057  auto detected_error = interrupt_checker_ir_builder.CreateCall(
3058  cgen_state_->module_->getFunction("get_error_code"),
3059  std::vector<llvm::Value*>{error_code_arg});
3060  err_code = interrupt_checker_ir_builder.CreateSelect(
3061  detected_interrupt,
3062  cgen_state_->llInt(Executor::ERR_INTERRUPTED),
3063  detected_error);
3064  interrupt_checker_ir_builder.CreateBr(error_check_bb);
3065  llvm::ReplaceInstWithInst(&check_interrupt_br_instr,
3066  llvm::BranchInst::Create(interrupt_check_bb));
3067  ir_builder.SetInsertPoint(&br_instr);
3068  } else {
3069  // use the error code returned from row_func and skip the interrupt status check
3070  ir_builder.SetInsertPoint(&br_instr);
3071  err_code =
3072  ir_builder.CreateCall(cgen_state_->module_->getFunction("get_error_code"),
3073  std::vector<llvm::Value*>{error_code_arg});
3074  }
3075  err_lv = ir_builder.CreateICmp(
3076  llvm::ICmpInst::ICMP_NE, err_code, cgen_state_->llInt(0));
3077  auto error_bb = llvm::BasicBlock::Create(
3078  cgen_state_->context_, ".error_exit", query_func, new_bb);
3079  llvm::CallInst::Create(cgen_state_->module_->getFunction("record_error_code"),
3080  std::vector<llvm::Value*>{err_code, error_code_arg},
3081  "",
3082  error_bb);
3083  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
3084  llvm::ReplaceInstWithInst(&br_instr,
3085  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
3086  break;
3087  }
3088  }
3089  }
3090 }
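 // Illustrative sketch (annotation, not part of the original source): after the block
 // split above, control flow around the query_stub call looks roughly like this; the
 // interrupt-aware variant additionally selects ERR_INTERRUPTED when check_interrupt()
 // fires:
 //
 //   %ret  = call i32 @query_stub_hoisted_literals(...)
 //   %code = call i32 @get_error_code(i32* %error_code_arg)
 //   %bad  = icmp ne i32 %code, 0
 //   br i1 %bad, label %.error_exit, label %continue
 // .error_exit:
 //   call void @record_error_code(i32 %code, i32* %error_code_arg)
 //   ret void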
3091 
3092 llvm::BasicBlock* Executor::codegenSkipDeletedOuterTableRow(
3093  const RelAlgExecutionUnit& ra_exe_unit,
3094  const CompilationOptions& co) {
3095  AUTOMATIC_IR_METADATA(cgen_state_.get());
3096  if (!co.filter_on_deleted_column) {
3097  return nullptr;
3098  }
3099  CHECK(!ra_exe_unit.input_descs.empty());
3100  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
3101  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
3102  return nullptr;
3103  }
3104  const auto& table_key = outer_input_desc.getTableKey();
3105  const auto deleted_cd = plan_state_->getDeletedColForTable(table_key);
3106  if (!deleted_cd) {
3107  return nullptr;
3108  }
3109  CHECK(deleted_cd->columnType.is_boolean());
3110  const auto deleted_expr =
3111  makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
3112  shared::ColumnKey{table_key, deleted_cd->columnId},
3113  outer_input_desc.getNestLevel());
3114  CodeGenerator code_generator(this);
3115  const auto is_deleted =
3116  code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
3117  const auto is_deleted_bb = llvm::BasicBlock::Create(
3118  cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
3119  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
3120  cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
3121  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
3122  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
3123  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
3124  cgen_state_->ir_builder_.SetInsertPoint(bb);
3125  return bb;
3126 }
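 // Illustrative sketch (annotation, not part of the original source): for an outer
 // table with a delete column, the prologue generated above is morally equivalent to:
 //
 //   if (is_deleted_column_value(row)) {
 //     return 0;  // skip this outer-table row entirely
 //   }
 //   // ... codegen for the row continues in the "is_not_deleted" block ...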
3127 
3128 bool Executor::compileBody(const RelAlgExecutionUnit& ra_exe_unit,
3129  GroupByAndAggregate& group_by_and_aggregate,
3130  QueryMemoryDescriptor& query_mem_desc,
3131  const CompilationOptions& co,
3132  const GpuSharedMemoryContext& gpu_smem_context) {
3133  AUTOMATIC_IR_METADATA(cgen_state_.get());
3134 
3135  // Switch the code generation into a separate filter function if enabled.
3136  // Note that accesses to function arguments are still codegenned from the
3137  // row function's arguments, then later automatically forwarded and
3138  // remapped into filter function arguments by redeclareFilterFunction().
3139  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
3140  llvm::Value* loop_done{nullptr};
3141  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
3142  if (cgen_state_->filter_func_) {
3143  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3144  auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
3145  cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
3146  row_func_entry_bb->begin());
3147  loop_done = cgen_state_->ir_builder_.CreateAlloca(
3148  get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
3149  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
3150  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
3151  }
3152  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
3153  cgen_state_->current_func_ = cgen_state_->filter_func_;
3154  fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
3155  }
3156 
3157  // generate the code for the filter
3158  std::vector<Analyzer::Expr*> primary_quals;
3159  std::vector<Analyzer::Expr*> deferred_quals;
3160  bool short_circuited = CodeGenerator::prioritizeQuals(
3161  ra_exe_unit, primary_quals, deferred_quals, plan_state_->hoisted_filters_);
3162  if (short_circuited) {
3163  VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
3164  << "short-circuited and deferred " << std::to_string(deferred_quals.size())
3165  << " quals";
3166  }
3167  llvm::Value* filter_lv = cgen_state_->llBool(true);
3168  CodeGenerator code_generator(this);
3169  for (auto expr : primary_quals) {
3170  // Generate the filter for primary quals
3171  auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
3172  filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
3173  }
3174  CHECK(filter_lv->getType()->isIntegerTy(1));
3175  llvm::BasicBlock* sc_false{nullptr};
3176  if (!deferred_quals.empty()) {
3177  auto sc_true = llvm::BasicBlock::Create(
3178  cgen_state_->context_, "sc_true", cgen_state_->current_func_);
3179  sc_false = llvm::BasicBlock::Create(
3180  cgen_state_->context_, "sc_false", cgen_state_->current_func_);
3181  cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
3182  cgen_state_->ir_builder_.SetInsertPoint(sc_false);
3183  if (ra_exe_unit.join_quals.empty()) {
3184  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
3185  }
3186  cgen_state_->ir_builder_.SetInsertPoint(sc_true);
3187  filter_lv = cgen_state_->llBool(true);
3188  }
3189  for (auto expr : deferred_quals) {
3190  filter_lv = cgen_state_->ir_builder_.CreateAnd(
3191  filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
3192  }
3193 
3194  CHECK(filter_lv->getType()->isIntegerTy(1));
3195  auto ret = group_by_and_aggregate.codegen(
3196  filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);
3197 
3198  // Switch the code generation back to the row function if a filter
3199  // function was enabled.
3200  if (cgen_state_->filter_func_) {
3201  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3202  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
3203  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
3204  }
3205 
3206  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
3207  cgen_state_->current_func_ = cgen_state_->row_func_;
3208  cgen_state_->filter_func_call_ =
3209  cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});
3210 
3211  // Create real filter function declaration after placeholder call
3212  // is emitted.
3213  redeclareFilterFunction();
3214 
3215  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3216  auto loop_done_true = llvm::BasicBlock::Create(
3217  cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
3218  auto loop_done_false = llvm::BasicBlock::Create(
3219  cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
3220  auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(
3221  loop_done->getType()->getPointerElementType(), loop_done);
3222  cgen_state_->ir_builder_.CreateCondBr(
3223  loop_done_flag, loop_done_true, loop_done_false);
3224  cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
3225  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
3226  cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
3227  } else {
3228  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
3229  }
3230  }
3231  return ret;
3232 }
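 // Illustrative sketch (annotation, not part of the original source): with the filter
 // function enabled, the body generated above is split so that the quals and the
 // group-by/aggregate code live in filter_func, whose arguments are remapped from the
 // row function by redeclareFilterFunction(). The row function then reduces to roughly:
 //
 //   int32_t row_func(...) {
 //     // ... column fetches ...
 //     return filter_func();  // WHERE clause + aggregation, returns an error code
 //   }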
3233 
3234 std::vector<llvm::Value*> generate_column_heads_load(const int num_columns,
3235  llvm::Value* byte_stream_arg,
3236  llvm::IRBuilder<>& ir_builder,
3237  llvm::LLVMContext& ctx) {
3238  CHECK(byte_stream_arg);
3239  const auto max_col_local_id = num_columns - 1;
3240 
3241  std::vector<llvm::Value*> col_heads;
3242  for (int col_id = 0; col_id <= max_col_local_id; ++col_id) {
3243  auto* gep = ir_builder.CreateGEP(
3244  byte_stream_arg->getType()->getScalarType()->getPointerElementType(),
3245  byte_stream_arg,
3246  llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), col_id));
3247  col_heads.emplace_back(
3248  ir_builder.CreateLoad(gep->getType()->getPointerElementType(), gep));
3249  }
3250  return col_heads;
3251 }
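 // Illustrative sketch (annotation, not part of the original source): the loads built
 // above hoist one level of indirection out of the row loop, i.e. they precompute
 //
 //   col_head[i] = byte_stream[i];
 //
 // so the inner loop can read column i as col_head[i][pos] instead of
 // byte_stream[i][pos], avoiding the double indirection mentioned earlier.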
3252 