OmniSciDB  c1a53651b2
NativeCodegen.cpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "QueryEngine/Execute.h"
18 
19 #if LLVM_VERSION_MAJOR < 9
20 static_assert(false, "LLVM Version >= 9 is required.");
21 #endif
22 
23 #include <llvm/Analysis/ScopedNoAliasAA.h>
24 #include <llvm/Analysis/TypeBasedAliasAnalysis.h>
25 #include <llvm/Bitcode/BitcodeReader.h>
26 #include <llvm/Bitcode/BitcodeWriter.h>
27 #include <llvm/ExecutionEngine/MCJIT.h>
28 #include <llvm/IR/Attributes.h>
29 #include <llvm/IR/GlobalValue.h>
30 #include <llvm/IR/InstIterator.h>
31 #include <llvm/IR/IntrinsicInst.h>
32 #include <llvm/IR/Intrinsics.h>
33 #include <llvm/IR/LegacyPassManager.h>
34 #include <llvm/IR/Verifier.h>
35 #include <llvm/IRReader/IRReader.h>
36 #if 14 <= LLVM_VERSION_MAJOR
37 #include <llvm/MC/TargetRegistry.h>
38 #else
39 #include <llvm/Support/TargetRegistry.h>
40 #endif
41 #include <llvm/Support/Casting.h>
42 #include <llvm/Support/FileSystem.h>
43 #include <llvm/Support/FormattedStream.h>
44 #include <llvm/Support/MemoryBuffer.h>
45 #include <llvm/Support/SourceMgr.h>
46 #include <llvm/Support/TargetSelect.h>
47 #include <llvm/Support/raw_os_ostream.h>
48 #include <llvm/Support/raw_ostream.h>
49 #include <llvm/Transforms/IPO.h>
50 #include <llvm/Transforms/IPO/AlwaysInliner.h>
51 #include <llvm/Transforms/IPO/InferFunctionAttrs.h>
52 #include <llvm/Transforms/IPO/PassManagerBuilder.h>
53 #include <llvm/Transforms/InstCombine/InstCombine.h>
54 #include <llvm/Transforms/Instrumentation.h>
55 #include <llvm/Transforms/Scalar.h>
56 #include <llvm/Transforms/Scalar/GVN.h>
57 #include <llvm/Transforms/Scalar/InstSimplifyPass.h>
58 #include <llvm/Transforms/Utils.h>
59 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
60 #include <llvm/Transforms/Utils/Cloning.h>
61 
62 #if LLVM_VERSION_MAJOR >= 11
63 #include <llvm/Support/Host.h>
64 #endif
65 
66 #include "CudaMgr/CudaMgr.h"
76 #include "Shared/MathUtils.h"
77 #include "StreamingTopN.h"
78 
80 
81 static llvm::sys::Mutex g_ee_create_mutex;
82 
83 #ifdef ENABLE_GEOS
84 
85 #include <llvm/Support/DynamicLibrary.h>
86 
87 #ifndef GEOS_LIBRARY_FILENAME
88 #error Configuration should include GEOS library file name
89 #endif
90 std::unique_ptr<std::string> g_libgeos_so_filename(
91  new std::string(GEOS_LIBRARY_FILENAME));
92 static llvm::sys::DynamicLibrary geos_dynamic_library;
93 static std::mutex geos_init_mutex;
94 
95 namespace {
96 
97 void load_geos_dynamic_library() {
98  std::lock_guard<std::mutex> guard(geos_init_mutex);
99 
100  if (!geos_dynamic_library.isValid()) {
101  if (!g_libgeos_so_filename || g_libgeos_so_filename->empty()) {
102  LOG(WARNING) << "Misconfigured GEOS library file name, trying 'libgeos_c.so'";
103  g_libgeos_so_filename.reset(new std::string("libgeos_c.so"));
104  }
105  auto filename = *g_libgeos_so_filename;
106  std::string error_message;
107  geos_dynamic_library =
108  llvm::sys::DynamicLibrary::getPermanentLibrary(filename.c_str(), &error_message);
109  if (!geos_dynamic_library.isValid()) {
110  LOG(ERROR) << "Failed to load GEOS library '" + filename + "'";
111  std::string exception_message = "Failed to load GEOS library: " + error_message;
112  throw std::runtime_error(exception_message.c_str());
113  } else {
114  LOG(INFO) << "Loaded GEOS library '" + filename + "'";
115  }
116  }
117 }
118 
119 } // namespace
120 #endif
121 
122 namespace {
123 
124 void throw_parseIR_error(const llvm::SMDiagnostic& parse_error,
125  std::string src = "",
126  const bool is_gpu = false) {
127  std::string excname = (is_gpu ? "NVVM IR ParseError: " : "LLVM IR ParseError: ");
128  llvm::raw_string_ostream ss(excname);
129  parse_error.print(src.c_str(), ss, false, false);
130  throw ParseIRError(ss.str());
131 }
132 
133 /* SHOW_DEFINED(<llvm::Module instance>) prints the function names
134  that are defined in the given LLVM Module instance.
135 
136  SHOW_FUNCTIONS(<llvm::Module instance>) prints the function names
137  of all used functions in the given LLVM Module
138  instance. Declarations are marked with `[decl]` as a name suffix.
139 
140  Useful for debugging.
141 */
142 
143 #define SHOW_DEFINED(MODULE) \
144  { \
145  std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
146  ::show_defined(MODULE); \
147  }
148 
149 #define SHOW_FUNCTIONS(MODULE) \
150  { \
151  std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
152  ::show_functions(MODULE); \
153  }
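// A hedged usage sketch for the debugging macros above (the module variable and the
// printed output are illustrative only, not taken from this file):
//
//   llvm::Module& mod = *cgen_state->module_;
//   SHOW_DEFINED(mod);    // e.g. "compileWorkUnit#1234: mod defines: query_stub, row_process,"
//   SHOW_FUNCTIONS(mod);  // also lists called-but-undefined functions, suffixed with [decl]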
154 
155 template <typename T = void>
156 void show_defined(llvm::Module& llvm_module) {
157  std::cout << "defines: ";
158  for (auto& f : llvm_module.getFunctionList()) {
159  if (!f.isDeclaration()) {
160  std::cout << f.getName().str() << ", ";
161  }
162  }
163  std::cout << std::endl;
164 }
165 
166 template <typename T = void>
167 void show_defined(llvm::Module* llvm_module) {
168  if (llvm_module == nullptr) {
169  std::cout << "is null" << std::endl;
170  } else {
171  show_defined(*llvm_module);
172  }
173 }
174 
175 template <typename T = void>
176 void show_defined(std::unique_ptr<llvm::Module>& llvm_module) {
177  show_defined(llvm_module.get());
178 }
179 
180 /*
181  scan_function_calls(module, defined, undefined, ignored) computes
182  defined and undefined sets of function names:
183 
184  - defined functions are those that are defined in the given module
185 
186  - undefined functions are those that are called by defined functions
187  but that are not defined in the given module
188 
189  - ignored functions are functions that may be undefined but will not
190  be listed in the set of undefined functions.
191 
192  Useful for debugging.
193 */
194 template <typename T = void>
195 void scan_function_calls(llvm::Function& F,
196  std::unordered_set<std::string>& defined,
197  std::unordered_set<std::string>& undefined,
198  const std::unordered_set<std::string>& ignored) {
199  for (llvm::inst_iterator I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
200  if (auto* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
201  auto* F2 = CI->getCalledFunction();
202  if (F2 != nullptr) {
203  auto F2name = F2->getName().str();
204  if (F2->isDeclaration()) {
205  if (F2name.rfind("__", 0) !=
206  0 // assume symbols with double underscore are defined
207  && F2name.rfind("llvm.", 0) !=
208  0 // TODO: this may give false positive for NVVM intrinsics
209  && ignored.find(F2name) == ignored.end() // not in ignored list
210  ) {
211  undefined.emplace(F2name);
212  }
213  } else {
214  if (defined.find(F2name) == defined.end()) {
215  defined.emplace(F2name);
216  scan_function_calls<T>(*F2, defined, undefined, ignored);
217  }
218  }
219  }
220  }
221  }
222 }
223 
224 template <typename T = void>
225 void scan_function_calls(llvm::Module& llvm_module,
226  std::unordered_set<std::string>& defined,
227  std::unordered_set<std::string>& undefined,
228  const std::unordered_set<std::string>& ignored) {
229  for (auto& F : llvm_module) {
230  if (!F.isDeclaration()) {
231  scan_function_calls(F, defined, undefined, ignored);
232  }
233  }
234 }
235 
236 template <typename T = void>
237 std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>>
238 scan_function_calls(llvm::Module& llvm_module,
239  const std::unordered_set<std::string>& ignored = {}) {
240  std::unordered_set<std::string> defined, undefined;
241  scan_function_calls(llvm_module, defined, undefined, ignored);
242  return std::make_tuple(defined, undefined);
243 }
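// A minimal usage sketch for scan_function_calls (variable names are illustrative;
// `mod` is assumed to be a fully built llvm::Module):
//
//   auto [defined, undefined] = scan_function_calls(mod);
//   for (const auto& name : undefined) {
//     LOG(WARNING) << "call to undefined function: " << name;
//   }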
244 
245 #if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
246 void eliminate_dead_self_recursive_funcs(
247  llvm::Module& M,
248  const std::unordered_set<llvm::Function*>& live_funcs) {
249  std::vector<llvm::Function*> dead_funcs;
250  for (auto& F : M) {
251  bool bAlive = false;
252  if (live_funcs.count(&F)) {
253  continue;
254  }
255  for (auto U : F.users()) {
256  auto* C = llvm::dyn_cast<const llvm::CallInst>(U);
257  if (!C || C->getParent()->getParent() != &F) {
258  bAlive = true;
259  break;
260  }
261  }
262  if (!bAlive) {
263  dead_funcs.push_back(&F);
264  }
265  }
266  for (auto pFn : dead_funcs) {
267  pFn->eraseFromParent();
268  }
269 }
270 
271 #ifdef HAVE_CUDA
272 
273 // check if linking with libdevice is required
274 // libdevice functions have a __nv_* prefix
275 bool check_module_requires_libdevice(llvm::Module* llvm_module) {
276  auto timer = DEBUG_TIMER(__func__);
277  for (llvm::Function& F : *llvm_module) {
278  if (F.hasName() && F.getName().startswith("__nv_")) {
279  LOG(INFO) << "Module requires linking with libdevice: " << std::string(F.getName());
280  return true;
281  }
282  }
283  LOG(DEBUG1) << "module does not require linking against libdevice";
284  return false;
285 }
286 
287 // Adds the missing intrinsic declarations to the given module
288 void add_intrinsics_to_module(llvm::Module* llvm_module) {
289  for (llvm::Function& F : *llvm_module) {
290  for (llvm::Instruction& I : instructions(F)) {
291  if (llvm::IntrinsicInst* ii = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
292  if (llvm::Intrinsic::isOverloaded(ii->getIntrinsicID())) {
293  llvm::Type* Tys[] = {ii->getFunctionType()->getReturnType()};
294  llvm::Function& decl_fn =
295  *llvm::Intrinsic::getDeclaration(llvm_module, ii->getIntrinsicID(), Tys);
296  ii->setCalledFunction(&decl_fn);
297  } else {
298  // inserts the declaration into the module if not present
299  llvm::Intrinsic::getDeclaration(llvm_module, ii->getIntrinsicID());
300  }
301  }
302  }
303  }
304 }
305 
306 #endif
307 
308 void optimize_ir(llvm::Function* query_func,
309  llvm::Module* llvm_module,
310  llvm::legacy::PassManager& pass_manager,
311  const std::unordered_set<llvm::Function*>& live_funcs,
312  const bool is_gpu_smem_used,
313  const CompilationOptions& co) {
314  auto timer = DEBUG_TIMER(__func__);
315  // the always inliner legacy pass must always run first
316  pass_manager.add(llvm::createVerifierPass());
317  pass_manager.add(llvm::createAlwaysInlinerLegacyPass());
318 
319  pass_manager.add(new AnnotateInternalFunctionsPass());
320 
321  pass_manager.add(llvm::createSROAPass());
322  // mem ssa drops unused load and store instructions, e.g. passing variables directly
323  // where possible
324  pass_manager.add(
325  llvm::createEarlyCSEPass(/*enable_mem_ssa=*/true)); // Catch trivial redundancies
326 
327  if (!is_gpu_smem_used) {
328  // thread jumps can change the execution order around SMEM sections guarded by
329  // `__syncthreads()`, which results in race conditions. For now, disable jump
330  // threading for shared memory queries. In the future, consider handling shared memory
331  // aggregations with a separate kernel launch
332  pass_manager.add(llvm::createJumpThreadingPass()); // Thread jumps.
333  }
334  pass_manager.add(llvm::createCFGSimplificationPass());
335 
336  // remove load/stores in PHIs if instructions can be accessed directly post thread jumps
337  pass_manager.add(llvm::createNewGVNPass());
338 
339  pass_manager.add(llvm::createDeadStoreEliminationPass());
340  pass_manager.add(llvm::createLICMPass());
341 
342  pass_manager.add(llvm::createInstructionCombiningPass());
343 
344  // module passes
345  pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
346  pass_manager.add(llvm::createGlobalOptimizerPass());
347 
348  pass_manager.add(llvm::createCFGSimplificationPass()); // cleanup after everything
349 
350  pass_manager.run(*llvm_module);
351 
352  eliminate_dead_self_recursive_funcs(*llvm_module, live_funcs);
353 }
354 #endif
355 
356 } // namespace
357 
358 ExecutionEngineWrapper::ExecutionEngineWrapper() {}
359 
360 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine)
361  : execution_engine_(execution_engine) {}
362 
363 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine,
364  const CompilationOptions& co)
365  : execution_engine_(execution_engine) {
366  if (execution_engine_) {
367  if (co.register_intel_jit_listener) {
368 #ifdef ENABLE_INTEL_JIT_LISTENER
369  intel_jit_listener_.reset(llvm::JITEventListener::createIntelJITEventListener());
370  CHECK(intel_jit_listener_);
371  execution_engine_->RegisterJITEventListener(intel_jit_listener_.get());
372  LOG(INFO) << "Registered IntelJITEventListener";
373 #else
374  LOG(WARNING) << "This build is not Intel JIT Listener enabled. Ignoring Intel JIT "
375  "listener configuration parameter.";
376 #endif // ENABLE_INTEL_JIT_LISTENER
377  }
378  }
379 }
380 
381 ExecutionEngineWrapper& ExecutionEngineWrapper::operator=(
382  llvm::ExecutionEngine* execution_engine) {
383  execution_engine_.reset(execution_engine);
384  intel_jit_listener_ = nullptr;
385  return *this;
386 }
387 
388 void verify_function_ir(const llvm::Function* func) {
389  std::stringstream err_ss;
390  llvm::raw_os_ostream err_os(err_ss);
391  err_os << "\n-----\n";
392  if (llvm::verifyFunction(*func, &err_os)) {
393  err_os << "\n-----\n";
394  func->print(err_os, nullptr);
395  err_os << "\n-----\n";
396  LOG(FATAL) << err_ss.str();
397  }
398 }
399 
400 namespace {
401 
402 std::string assemblyForCPU(ExecutionEngineWrapper& execution_engine,
403  llvm::Module* llvm_module) {
404  llvm::legacy::PassManager pass_manager;
405  auto cpu_target_machine = execution_engine->getTargetMachine();
406  CHECK(cpu_target_machine);
407  llvm::SmallString<256> code_str;
408  llvm::raw_svector_ostream os(code_str);
409 #if LLVM_VERSION_MAJOR >= 10
410  cpu_target_machine->addPassesToEmitFile(
411  pass_manager, os, nullptr, llvm::CGFT_AssemblyFile);
412 #else
413  cpu_target_machine->addPassesToEmitFile(
414  pass_manager, os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
415 #endif
416  pass_manager.run(*llvm_module);
417  return "Assembly for the CPU:\n" + std::string(code_str.str()) + "\nEnd of assembly";
418 }
419 
420 ExecutionEngineWrapper create_execution_engine(llvm::Module* llvm_module,
421  llvm::EngineBuilder& eb,
422  const CompilationOptions& co) {
423  auto timer = DEBUG_TIMER(__func__);
424  // Avoids data race in
425  // llvm::sys::DynamicLibrary::getPermanentLibrary and
426  // GDBJITRegistrationListener::notifyObjectLoaded while creating a
427  // new ExecutionEngine instance. Unfortunately, we have to use a global
428  // mutex here.
429  std::lock_guard<llvm::sys::Mutex> lock(g_ee_create_mutex);
430  ExecutionEngineWrapper execution_engine(eb.create(), co);
431  CHECK(execution_engine.get());
432  // Force the module data layout to match the layout for the selected target
433  llvm_module->setDataLayout(execution_engine->getDataLayout());
434 
435  LOG(ASM) << assemblyForCPU(execution_engine, llvm_module);
436 
437  execution_engine->finalizeObject();
438  return execution_engine;
439 }
440 
441 } // namespace
442 
443 ExecutionEngineWrapper CodeGenerator::generateNativeCPUCode(
444  llvm::Function* func,
445  const std::unordered_set<llvm::Function*>& live_funcs,
446  const CompilationOptions& co) {
447  auto timer = DEBUG_TIMER(__func__);
448  llvm::Module* llvm_module = func->getParent();
449  // run optimizations
450 #ifndef WITH_JIT_DEBUG
451  llvm::legacy::PassManager pass_manager;
452  optimize_ir(
453  func, llvm_module, pass_manager, live_funcs, /*is_gpu_smem_used=*/false, co);
454 #endif // WITH_JIT_DEBUG
455 
456  auto init_err = llvm::InitializeNativeTarget();
457  CHECK(!init_err);
458 
459  llvm::InitializeAllTargetMCs();
460  llvm::InitializeNativeTargetAsmPrinter();
461  llvm::InitializeNativeTargetAsmParser();
462 
463  std::string err_str;
464  std::unique_ptr<llvm::Module> owner(llvm_module);
465  llvm::EngineBuilder eb(std::move(owner));
466  eb.setErrorStr(&err_str);
467  eb.setEngineKind(llvm::EngineKind::JIT);
468  llvm::TargetOptions to;
469  to.EnableFastISel = true;
470  eb.setTargetOptions(to);
471  if (co.opt_level == ExecutorOptLevel::ReductionJIT) {
472  eb.setOptLevel(llvm::CodeGenOpt::None);
473  }
474 
475  return create_execution_engine(llvm_module, eb, co);
476 }
477 
478 std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenCPU(
479  llvm::Function* query_func,
480  llvm::Function* multifrag_query_func,
481  const std::unordered_set<llvm::Function*>& live_funcs,
482  const CompilationOptions& co) {
483  CodeCacheKey key{serialize_llvm_object(query_func),
484  serialize_llvm_object(cgen_state_->row_func_)};
485 
486  llvm::Module* M = query_func->getParent();
487  auto* flag = llvm::mdconst::extract_or_null<llvm::ConstantInt>(
488  M->getModuleFlag("manage_memory_buffer"));
489  if (flag and flag->getZExtValue() == 1 and M->getFunction("allocate_varlen_buffer") and
490  M->getFunction("register_buffer_with_executor_rsm")) {
491  LOG(INFO) << "including executor addr to cache key\n";
492  key.push_back(std::to_string(reinterpret_cast<int64_t>(this)));
493  }
494  if (cgen_state_->filter_func_) {
495  key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
496  }
497  for (const auto helper : cgen_state_->helper_functions_) {
498  key.push_back(serialize_llvm_object(helper));
499  }
500  auto cached_code = QueryEngine::getInstance()->cpu_code_accessor->get_value(key);
501  if (cached_code) {
502  return cached_code;
503  }
504 
505  if (cgen_state_->needs_geos_) {
506 #ifdef ENABLE_GEOS
507  auto llvm_module = multifrag_query_func->getParent();
508  load_geos_dynamic_library();
509 
510  // Read geos runtime module and bind GEOS API function references to GEOS library
511  auto rt_geos_module_copy = llvm::CloneModule(
512  *get_geos_module(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
513  auto func = llvm::dyn_cast<llvm::Function>(gv);
514  if (!func) {
515  return true;
516  }
517  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
518  func->getLinkage() ==
519  llvm::GlobalValue::LinkageTypes::InternalLinkage ||
520  func->getLinkage() == llvm::GlobalValue::LinkageTypes::ExternalLinkage);
521  });
522  CodeGenerator::link_udf_module(rt_geos_module_copy,
523  *llvm_module,
524  cgen_state_.get(),
525  llvm::Linker::Flags::LinkOnlyNeeded);
526 #else
527  throw std::runtime_error("GEOS is disabled in this build");
528 #endif
529  }
530 
531  auto execution_engine =
532  CodeGenerator::generateNativeCPUCode(query_func, live_funcs, co);
533  auto cpu_compilation_context =
534  std::make_shared<CpuCompilationContext>(std::move(execution_engine));
535  cpu_compilation_context->setFunctionPointer(multifrag_query_func);
536  QueryEngine::getInstance()->cpu_code_accessor->put(key, cpu_compilation_context);
537  return std::dynamic_pointer_cast<CompilationContext>(cpu_compilation_context);
538 }
539 
540 void CodeGenerator::link_udf_module(const std::unique_ptr<llvm::Module>& udf_module,
541  llvm::Module& llvm_module,
542  CgenState* cgen_state,
543  llvm::Linker::Flags flags) {
544  auto timer = DEBUG_TIMER(__func__);
545  // throw a runtime error if the target module contains functions
546  // with the same name as functions in the UDF module.
547  for (auto& f : *udf_module) {
548  auto func = llvm_module.getFunction(f.getName());
549  if (!(func == nullptr) && !f.isDeclaration() && flags == llvm::Linker::Flags::None) {
550  LOG(ERROR) << " Attempt to overwrite " << f.getName().str() << " in "
551  << llvm_module.getModuleIdentifier() << " from `"
552  << udf_module->getModuleIdentifier() << "`" << std::endl;
553  throw std::runtime_error(
554  "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
555  "function ***");
556  } else {
557  VLOG(1) << " Adding " << f.getName().str() << " to "
558  << llvm_module.getModuleIdentifier() << " from `"
559  << udf_module->getModuleIdentifier() << "`" << std::endl;
560  }
561  }
562 
563  auto udf_module_copy = llvm::CloneModule(*udf_module, cgen_state->vmap_);
564 
565  udf_module_copy->setDataLayout(llvm_module.getDataLayout());
566  udf_module_copy->setTargetTriple(llvm_module.getTargetTriple());
567 
568  // Initialize linker with module for RuntimeFunctions.bc
569  llvm::Linker ld(llvm_module);
570  bool link_error = false;
571 
572  link_error = ld.linkInModule(std::move(udf_module_copy), flags);
573 
574  if (link_error) {
575  throw std::runtime_error("link_udf_module: *** error linking module ***");
576  }
577 }
578 
579 namespace {
580 
581 std::string cpp_to_llvm_name(const std::string& s) {
582  if (s == "int8_t") {
583  return "i8";
584  }
585  if (s == "int16_t") {
586  return "i16";
587  }
588  if (s == "int32_t") {
589  return "i32";
590  }
591  if (s == "int64_t") {
592  return "i64";
593  }
594  CHECK(s == "float" || s == "double");
595  return s;
596 }
597 
598 std::string gen_array_any_all_sigs() {
599  std::string result;
600  for (const std::string any_or_all : {"any", "all"}) {
601  for (const std::string elem_type :
602  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
603  for (const std::string needle_type :
604  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
605  for (const std::string op_name : {"eq", "ne", "lt", "le", "gt", "ge"}) {
606  result += ("declare i1 @array_" + any_or_all + "_" + op_name + "_" + elem_type +
607  "_" + needle_type + "(i8*, i64, " + cpp_to_llvm_name(needle_type) +
608  ", " + cpp_to_llvm_name(elem_type) + ");\n");
609  }
610  }
611  }
612  }
613  return result;
614 }
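// Worked example of one generated declaration: the iteration with any_or_all="any",
// op_name="gt", elem_type="int32_t", needle_type="double" appends the line
//
//   declare i1 @array_any_gt_int32_t_double(i8*, i64, double, i32);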
615 
616 std::string gen_translate_null_key_sigs() {
617  std::string result;
618  for (const std::string key_type : {"int8_t", "int16_t", "int32_t", "int64_t"}) {
619  const auto key_llvm_type = cpp_to_llvm_name(key_type);
620  result += "declare i64 @translate_null_key_" + key_type + "(" + key_llvm_type + ", " +
621  key_llvm_type + ", i64);\n";
622  }
623  return result;
624 }
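// Worked example: the key_type="int32_t" iteration appends
//
//   declare i64 @translate_null_key_int32_t(i32, i32, i64);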
625 
626 const std::string cuda_rt_decls =
627  R"( declare void @llvm.dbg.declare(metadata, metadata, metadata) declare void @llvm.dbg.value(metadata, metadata, metadata) declare double @llvm.fmuladd.f64(double, double, double) declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind declare i64 @get_thread_index(); declare i64 @get_block_index(); declare i32 @pos_start_impl(i32*); declare i32 @group_buff_idx_impl(); declare i32 @pos_step_impl(); declare i8 @thread_warp_idx(i8); declare i64* @init_shared_mem(i64*, i32); declare i64* @init_shared_mem_nop(i64*, i32); declare i64* @declare_dynamic_shared_memory(); declare void @write_back_nop(i64*, i64*, i32); declare void @write_back_non_grouped_agg(i64*, i64*, i32); declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8); declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32); declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32); declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32); declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32); declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32); declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32); declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64); declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64); declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64); declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64); declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64); declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double); declare i64 @get_bucket_key_for_range_double(i8*, i64, double); declare i32 @get_num_buckets_for_bounds(i8*, i32, double, double); declare i64 @get_candidate_rows(i32*, i32, i8*, i32, double, double, i32, i64, i64*, i64, i64, i64); declare i64 @agg_count_shared(i64*, i64); declare i64 @agg_count_skip_val_shared(i64*, i64, i64); declare i32 @agg_count_int32_shared(i32*, i32); declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32); declare i64 @agg_count_double_shared(i64*, double); declare i64 @agg_count_double_skip_val_shared(i64*, double, double); declare i32 @agg_count_float_shared(i32*, float); declare i32 @agg_count_float_skip_val_shared(i32*, float, float); declare i64 @agg_count_if_shared(i64*, i64); declare i64 @agg_count_if_skip_val_shared(i64*, i64, i64); declare i32 @agg_count_if_int32_shared(i32*, i32); declare i32 @agg_count_if_int32_skip_val_shared(i32*, i32, i32); declare i64 @agg_sum_shared(i64*, i64); declare i64 @agg_sum_skip_val_shared(i64*, i64, i64); declare i32 @agg_sum_int32_shared(i32*, i32); declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32); declare void @agg_sum_double_shared(i64*, double); declare void @agg_sum_double_skip_val_shared(i64*, double, double); declare void @agg_sum_float_shared(i32*, float); declare void @agg_sum_float_skip_val_shared(i32*, float, float); declare i64 @agg_sum_if_shared(i64*, i64, i8); declare i64 @agg_sum_if_skip_val_shared(i64*, i64, i64, i8); declare i32 @agg_sum_if_int32_shared(i32*, i32, i8); declare i32 @agg_sum_if_int32_skip_val_shared(i32*, i32, i32, i8); declare void @agg_sum_if_double_shared(i64*, double, i8); declare void @agg_sum_if_double_skip_val_shared(i64*, double, double, i8); declare void @agg_sum_if_float_shared(i32*, float, i8); declare void 
@agg_sum_if_float_skip_val_shared(i32*, float, float, i8); declare void @agg_max_shared(i64*, i64); declare void @agg_max_skip_val_shared(i64*, i64, i64); declare void @agg_max_int32_shared(i32*, i32); declare void @agg_max_int32_skip_val_shared(i32*, i32, i32); declare void @agg_max_int16_shared(i16*, i16); declare void @agg_max_int16_skip_val_shared(i16*, i16, i16); declare void @agg_max_int8_shared(i8*, i8); declare void @agg_max_int8_skip_val_shared(i8*, i8, i8); declare void @agg_max_double_shared(i64*, double); declare void @agg_max_double_skip_val_shared(i64*, double, double); declare void @agg_max_float_shared(i32*, float); declare void @agg_max_float_skip_val_shared(i32*, float, float); declare void @agg_min_shared(i64*, i64); declare void @agg_min_skip_val_shared(i64*, i64, i64); declare void @agg_min_int32_shared(i32*, i32); declare void @agg_min_int32_skip_val_shared(i32*, i32, i32); declare void @agg_min_int16_shared(i16*, i16); declare void @agg_min_int16_skip_val_shared(i16*, i16, i16); declare void @agg_min_int8_shared(i8*, i8); declare void @agg_min_int8_skip_val_shared(i8*, i8, i8); declare void @agg_min_double_shared(i64*, double); declare void @agg_min_double_skip_val_shared(i64*, double, double); declare void @agg_min_float_shared(i32*, float); declare void @agg_min_float_skip_val_shared(i32*, float, float); declare void @agg_id_shared(i64*, i64); declare i8* @agg_id_varlen_shared(i8*, i64, i8*, i64); declare void @agg_id_int32_shared(i32*, i32); declare void @agg_id_int16_shared(i16*, i16); declare void @agg_id_int8_shared(i8*, i8); declare void @agg_id_double_shared(i64*, double); declare void @agg_id_double_shared_slow(i64*, double*); declare void @agg_id_float_shared(i32*, float); declare i32 @checked_single_agg_id_shared(i64*, i64, i64); declare i32 @checked_single_agg_id_double_shared(i64*, double, double); declare i32 @checked_single_agg_id_double_shared_slow(i64*, double*, double); declare i32 @checked_single_agg_id_float_shared(i32*, float, float); declare i1 @slotEmptyKeyCAS(i64*, i64, i64); declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32); declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16); declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8); declare i64 @datetrunc_century(i64); declare i64 @datetrunc_day(i64); declare i64 @datetrunc_decade(i64); declare i64 @datetrunc_hour(i64); declare i64 @datetrunc_millennium(i64); declare i64 @datetrunc_minute(i64); declare i64 @datetrunc_month(i64); declare i64 @datetrunc_quarter(i64); declare i64 @datetrunc_quarterday(i64); declare i64 @datetrunc_week_monday(i64); declare i64 @datetrunc_week_sunday(i64); declare i64 @datetrunc_week_saturday(i64); declare i64 @datetrunc_year(i64); declare i64 @extract_epoch(i64); declare i64 @extract_dateepoch(i64); declare i64 @extract_quarterday(i64); declare i64 @extract_hour(i64); declare i64 @extract_minute(i64); declare i64 @extract_second(i64); declare i64 @extract_millisecond(i64); declare i64 @extract_microsecond(i64); declare i64 @extract_nanosecond(i64); declare i64 @extract_dow(i64); declare i64 @extract_isodow(i64); declare i64 @extract_day(i64); declare i64 @extract_week_monday(i64); declare i64 @extract_week_sunday(i64); declare i64 @extract_week_saturday(i64); declare i64 @extract_day_of_year(i64); declare i64 @extract_month(i64); declare i64 @extract_quarter(i64); declare i64 @extract_year(i64); declare i64 @ExtractTimeFromHPTimestamp(i64,i64); declare i64 @ExtractTimeFromHPTimestampNullable(i64,i64,i64); declare i64 @ExtractTimeFromLPTimestamp(i64); declare i64 
@ExtractTimeFromLPTimestampNullable(i64,i64); declare i64 @DateTruncateHighPrecisionToDate(i64, i64); declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64); declare i64 @DateDiff(i32, i64, i64); declare i64 @DateDiffNullable(i32, i64, i64, i64); declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i32); declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i32, i64); declare i64 @DateAdd(i32, i64, i64); declare i64 @DateAddNullable(i32, i64, i64, i64); declare i64 @DateAddHighPrecision(i32, i64, i64, i32); declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i32, i64); declare {i8*,i64} @string_decode(i8*, i64); declare i32 @array_size(i8*, i64, i32); declare i32 @array_size_nullable(i8*, i64, i32, i32); declare i32 @array_size_1_nullable(i8*, i64, i32); declare i32 @fast_fixlen_array_size(i8*, i32); declare i1 @array_is_null(i8*, i64); declare i1 @point_coord_array_is_null(i8*, i64); declare i8* @array_buff(i8*, i64); declare i8* @fast_fixlen_array_buff(i8*, i64); declare i8 @array_at_int8_t(i8*, i64, i32); declare i16 @array_at_int16_t(i8*, i64, i32); declare i32 @array_at_int32_t(i8*, i64, i32); declare i64 @array_at_int64_t(i8*, i64, i32); declare float @array_at_float(i8*, i64, i32); declare double @array_at_double(i8*, i64, i32); declare i8 @varlen_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_array_at_int64_t(i8*, i64, i32); declare float @varlen_array_at_float(i8*, i64, i32); declare double @varlen_array_at_double(i8*, i64, i32); declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32); declare float @varlen_notnull_array_at_float(i8*, i64, i32); declare double @varlen_notnull_array_at_double(i8*, i64, i32); declare i8 @array_at_int8_t_checked(i8*, i64, i64, i8); declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16); declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32); declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64); declare float @array_at_float_checked(i8*, i64, i64, float); declare double @array_at_double_checked(i8*, i64, i64, double); declare i32 @char_length(i8*, i32); declare i32 @char_length_nullable(i8*, i32, i32); declare i32 @char_length_encoded(i8*, i32); declare i32 @char_length_encoded_nullable(i8*, i32, i32); declare i32 @key_for_string_encoded(i32); declare i1 @sample_ratio(double, i64); declare double @width_bucket(double, double, double, double, i32); declare double @width_bucket_reverse(double, double, double, double, i32); declare double @width_bucket_nullable(double, double, double, double, i32, double); declare double @width_bucket_reversed_nullable(double, double, double, double, i32, double); declare double @width_bucket_no_oob_check(double, double, double); declare double @width_bucket_reverse_no_oob_check(double, double, double); declare double @width_bucket_expr(double, i1, double, double, i32); declare double @width_bucket_expr_nullable(double, i1, double, double, i32, double); declare double @width_bucket_expr_no_oob_check(double, i1, double, double, i32); declare i1 @string_like(i8*, i32, i8*, i32, i8); declare i1 @string_ilike(i8*, i32, i8*, i32, i8); declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8); declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8); declare i1 @string_like_simple(i8*, 
i32, i8*, i32); declare i1 @string_ilike_simple(i8*, i32, i8*, i32); declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8); declare i1 @string_lt(i8*, i32, i8*, i32); declare i1 @string_le(i8*, i32, i8*, i32); declare i1 @string_gt(i8*, i32, i8*, i32); declare i1 @string_ge(i8*, i32, i8*, i32); declare i1 @string_eq(i8*, i32, i8*, i32); declare i1 @string_ne(i8*, i32, i8*, i32); declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8); declare i1 @regexp_like(i8*, i32, i8*, i32, i8); declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8); declare void @linear_probabilistic_count(i8*, i32, i8*, i32); declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, i64); declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64); declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64); declare void @record_error_code(i32, i32*); declare i32 @get_error_code(i32*); declare i1 @dynamic_watchdog(); declare i1 @check_interrupt(); declare void @force_sync(); declare void @sync_warp(); declare void @sync_warp_protected(i64, i64); declare void @sync_threadblock(); declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32); declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64); declare i64* @get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, float, float); declare i64* @get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double); declare double @decompress_x_coord_geoint(i32); declare double @decompress_y_coord_geoint(i32); declare i32 @compress_x_coord_geoint(double); declare i32 @compress_y_coord_geoint(double); )" + gen_array_any_all_sigs() +
628     gen_translate_null_key_sigs();
629 
630 #ifdef HAVE_CUDA
631 std::string extension_function_decls(const std::unordered_set<std::string>& udf_decls) {
632  const auto decls =
633  ExtensionFunctionsWhitelist::getLLVMDeclarations(udf_decls, /*is_gpu=*/true);
634  return boost::algorithm::join(decls, "\n");
635 }
636 
637 void legalize_nvvm_ir(llvm::Function* query_func) {
638  // optimizations might add attributes to the function
639  // and NVPTX doesn't understand all of them; play it
640  // safe and clear all attributes
641  clear_function_attributes(query_func);
642  verify_function_ir(query_func);
643 
644  std::vector<llvm::Instruction*> stackrestore_intrinsics;
645  std::vector<llvm::Instruction*> stacksave_intrinsics;
646  std::vector<llvm::Instruction*> lifetime;
647  for (auto& BB : *query_func) {
648  for (llvm::Instruction& I : BB) {
649  if (const llvm::IntrinsicInst* II = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
650  if (II->getIntrinsicID() == llvm::Intrinsic::stacksave) {
651  stacksave_intrinsics.push_back(&I);
652  } else if (II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
653  stackrestore_intrinsics.push_back(&I);
654  } else if (II->getIntrinsicID() == llvm::Intrinsic::lifetime_start ||
655  II->getIntrinsicID() == llvm::Intrinsic::lifetime_end) {
656  lifetime.push_back(&I);
657  }
658  }
659  }
660  }
661 
662  // stacksave and stackrestore intrinsics appear together, and
663  // stackrestore uses stacksave's result as its argument,
664  // so it should be removed first.
665  for (auto& II : stackrestore_intrinsics) {
666  II->eraseFromParent();
667  }
668  for (auto& II : stacksave_intrinsics) {
669  II->eraseFromParent();
670  }
671  // Remove lifetime intrinsics as well; NVPTX doesn't like them.
672  for (auto& II : lifetime) {
673  II->eraseFromParent();
674  }
675 }
676 #endif // HAVE_CUDA
677 
678 } // namespace
679 
680 llvm::StringRef get_gpu_target_triple_string() {
681  return llvm::StringRef("nvptx64-nvidia-cuda");
682 }
683 
684 llvm::StringRef get_gpu_data_layout() {
685  return llvm::StringRef(
686  "e-p:64:64:64-i1:8:8-i8:8:8-"
687  "i16:16:16-i32:32:32-i64:64:64-"
688  "f32:32:32-f64:64:64-v16:16:16-"
689  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
690 }
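// For reference, the fields of the layout string follow LLVM's data layout syntax:
//   e                      little-endian
//   p:64:64:64             64-bit pointers with 64-bit ABI and preferred alignment
//   i1:8:8 ... i64:64:64   integer alignments (i1 is padded to one byte)
//   f32:32:32, f64:64:64   float and double alignments
//   v16 ... v128           vector type alignments
//   n16:32:64              native integer widths on the target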
691 
692 std::map<std::string, std::string> get_device_parameters(bool cpu_only) {
693  std::map<std::string, std::string> result;
694 
695  result.insert(std::make_pair("cpu_name", llvm::sys::getHostCPUName()));
696  result.insert(std::make_pair("cpu_triple", llvm::sys::getProcessTriple()));
697  result.insert(
698  std::make_pair("cpu_cores", std::to_string(llvm::sys::getHostNumPhysicalCores())));
699  result.insert(std::make_pair("cpu_threads", std::to_string(cpu_threads())));
700 
701  // https://en.cppreference.com/w/cpp/language/types
702  std::string sizeof_types;
703  sizeof_types += "bool:" + std::to_string(sizeof(bool)) + ";";
704  sizeof_types += "size_t:" + std::to_string(sizeof(size_t)) + ";";
705  sizeof_types += "ssize_t:" + std::to_string(sizeof(ssize_t)) + ";";
706  sizeof_types += "char:" + std::to_string(sizeof(char)) + ";";
707  sizeof_types += "uchar:" + std::to_string(sizeof(unsigned char)) + ";";
708  sizeof_types += "short:" + std::to_string(sizeof(short)) + ";";
709  sizeof_types += "ushort:" + std::to_string(sizeof(unsigned short int)) + ";";
710  sizeof_types += "int:" + std::to_string(sizeof(int)) + ";";
711  sizeof_types += "uint:" + std::to_string(sizeof(unsigned int)) + ";";
712  sizeof_types += "long:" + std::to_string(sizeof(long int)) + ";";
713  sizeof_types += "ulong:" + std::to_string(sizeof(unsigned long int)) + ";";
714  sizeof_types += "longlong:" + std::to_string(sizeof(long long int)) + ";";
715  sizeof_types += "ulonglong:" + std::to_string(sizeof(unsigned long long int)) + ";";
716  sizeof_types += "float:" + std::to_string(sizeof(float)) + ";";
717  sizeof_types += "double:" + std::to_string(sizeof(double)) + ";";
718  sizeof_types += "longdouble:" + std::to_string(sizeof(long double)) + ";";
719  sizeof_types += "voidptr:" + std::to_string(sizeof(void*)) + ";";
720 
721  result.insert(std::make_pair("type_sizeof", sizeof_types));
722 
723  std::string null_values;
724  null_values += "boolean1:" + std::to_string(serialized_null_value<bool>()) + ";";
725  null_values += "boolean8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
726  null_values += "int8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
727  null_values += "int16:" + std::to_string(serialized_null_value<int16_t>()) + ";";
728  null_values += "int32:" + std::to_string(serialized_null_value<int32_t>()) + ";";
729  null_values += "int64:" + std::to_string(serialized_null_value<int64_t>()) + ";";
730  null_values += "uint8:" + std::to_string(serialized_null_value<uint8_t>()) + ";";
731  null_values += "uint16:" + std::to_string(serialized_null_value<uint16_t>()) + ";";
732  null_values += "uint32:" + std::to_string(serialized_null_value<uint32_t>()) + ";";
733  null_values += "uint64:" + std::to_string(serialized_null_value<uint64_t>()) + ";";
734  null_values += "float32:" + std::to_string(serialized_null_value<float>()) + ";";
735  null_values += "float64:" + std::to_string(serialized_null_value<double>()) + ";";
736  null_values +=
737  "Array<boolean8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
738  null_values +=
739  "Array<int8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
740  null_values +=
741  "Array<int16>:" + std::to_string(serialized_null_value<int16_t, true>()) + ";";
742  null_values +=
743  "Array<int32>:" + std::to_string(serialized_null_value<int32_t, true>()) + ";";
744  null_values +=
745  "Array<int64>:" + std::to_string(serialized_null_value<int64_t, true>()) + ";";
746  null_values +=
747  "Array<float32>:" + std::to_string(serialized_null_value<float, true>()) + ";";
748  null_values +=
749  "Array<float64>:" + std::to_string(serialized_null_value<double, true>()) + ";";
750 
751  result.insert(std::make_pair("null_values", null_values));
752 
753  llvm::StringMap<bool> cpu_features;
754  if (llvm::sys::getHostCPUFeatures(cpu_features)) {
755  std::string features_str = "";
756  for (auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
757  features_str += (it->getValue() ? " +" : " -");
758  features_str += it->getKey().str();
759  }
760  result.insert(std::make_pair("cpu_features", features_str));
761  }
762 
763  result.insert(std::make_pair("llvm_version",
764  std::to_string(LLVM_VERSION_MAJOR) + "." +
765  std::to_string(LLVM_VERSION_MINOR) + "." +
766  std::to_string(LLVM_VERSION_PATCH)));
767 
768 #ifdef HAVE_CUDA
769  if (!cpu_only) {
770  int device_count = 0;
771  checkCudaErrors(cuDeviceGetCount(&device_count));
772  if (device_count) {
773  CUdevice device{};
774  char device_name[256];
775  int major = 0, minor = 0;
776  int driver_version;
777  checkCudaErrors(cuDeviceGet(&device, 0)); // assuming homogeneous multi-GPU system
778  checkCudaErrors(cuDeviceGetName(device_name, 256, device));
779  checkCudaErrors(cuDeviceGetAttribute(
780  &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
781  checkCudaErrors(cuDeviceGetAttribute(
782  &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
783  checkCudaErrors(cuDriverGetVersion(&driver_version));
784 
785  result.insert(std::make_pair("gpu_name", device_name));
786  result.insert(std::make_pair("gpu_count", std::to_string(device_count)));
787  result.insert(std::make_pair("gpu_compute_capability",
788  std::to_string(major) + "." + std::to_string(minor)));
789  result.insert(std::make_pair("gpu_triple", get_gpu_target_triple_string()));
790  result.insert(std::make_pair("gpu_datalayout", get_gpu_data_layout()));
791  result.insert(std::make_pair("gpu_driver",
792  "CUDA " + std::to_string(driver_version / 1000) + "." +
793  std::to_string((driver_version % 1000) / 10)));
794 
795  auto rt_libdevice_path = get_cuda_libdevice_dir() + "/libdevice.10.bc";
796  result.insert(
797  std::make_pair("gpu_has_libdevice",
798  std::to_string(boost::filesystem::exists(rt_libdevice_path))));
799  }
800  }
801 #endif
802 
803  return result;
804 }
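// Hypothetical excerpt of the returned map on a CPU-only host (values are
// machine-dependent and shown only as an illustration):
//
//   "cpu_name"     -> "skylake-avx512"
//   "cpu_threads"  -> "16"
//   "type_sizeof"  -> "bool:1;size_t:8;...;voidptr:8;"
//   "llvm_version" -> "9.0.1"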
805 
806 namespace {
807 
808 #ifdef HAVE_CUDA
809 std::unordered_set<llvm::Function*> findAliveRuntimeFuncs(
810  llvm::Module& llvm_module,
811  const std::vector<llvm::Function*>& roots) {
812  std::queue<llvm::Function*> queue;
813  std::unordered_set<llvm::Function*> visited;
814  for (llvm::Function* F : roots) {
815  queue.push(F);
816  }
817 
818  while (!queue.empty()) {
819  llvm::Function* F = queue.front();
820  queue.pop();
821  if (visited.find(F) != visited.end()) {
822  continue;
823  }
824  visited.insert(F);
825 
826  for (llvm::inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
827  if (llvm::CallInst* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
828  if (CI->isInlineAsm()) // libdevice calls inline assembly code
829  continue;
830  llvm::Function* called = CI->getCalledFunction();
831  if (!called || visited.find(called) != visited.end()) {
832  continue;
833  }
834  queue.push(called);
835  }
836  }
837  }
838  return visited;
839 }
840 #endif
841 
842 } // namespace
843 
844 void CodeGenerator::linkModuleWithLibdevice(
845  Executor* executor,
846  llvm::Module& llvm_module,
847  llvm::PassManagerBuilder& pass_manager_builder,
848  const GPUTarget& gpu_target) {
849 #ifdef HAVE_CUDA
850  auto timer = DEBUG_TIMER(__func__);
851 
852  if (!executor->has_libdevice_module()) {
853  // raise error
854  throw std::runtime_error(
855  "libdevice library is not available but required by the UDF module");
856  }
857 
858  // Save the functions defined in the module
859  std::vector<llvm::Function*> roots;
860  for (llvm::Function& fn : llvm_module) {
861  if (!fn.isDeclaration())
862  roots.emplace_back(&fn);
863  }
864 
865  // Bind libdevice to the current module
866  CodeGenerator::link_udf_module(executor->get_libdevice_module(),
867  llvm_module,
868  gpu_target.cgen_state,
869  llvm::Linker::Flags::OverrideFromSrc);
870 
871  std::unordered_set<llvm::Function*> live_funcs =
872  findAliveRuntimeFuncs(llvm_module, roots);
873 
874  std::vector<llvm::Function*> funcs_to_delete;
875  for (llvm::Function& fn : llvm_module) {
876  if (!live_funcs.count(&fn)) {
877  // deleting the function here would invalidate the iterator
878  funcs_to_delete.emplace_back(&fn);
879  }
880  }
881 
882  for (llvm::Function* f : funcs_to_delete) {
883  f->eraseFromParent();
884  }
885 
886  // activate nvvm-reflect-ftz flag on the module
887 #if LLVM_VERSION_MAJOR >= 11
888  llvm::LLVMContext& ctx = llvm_module.getContext();
889  llvm_module.setModuleFlag(llvm::Module::Override,
890  "nvvm-reflect-ftz",
891  llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
892  llvm::Type::getInt32Ty(ctx), uint32_t(1))));
893 #else
894  llvm_module.addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", uint32_t(1));
895 #endif
896  for (llvm::Function& fn : llvm_module) {
897  fn.addFnAttr("nvptx-f32ftz", "true");
898  }
899 
900  // add nvvm reflect pass replacing any NVVM conditionals with constants
901  gpu_target.nvptx_target_machine->adjustPassManager(pass_manager_builder);
902  llvm::legacy::FunctionPassManager FPM(&llvm_module);
903  pass_manager_builder.populateFunctionPassManager(FPM);
904 
905  // Run the NVVMReflectPass here rather than inside optimize_ir
906  FPM.doInitialization();
907  for (auto& F : llvm_module) {
908  FPM.run(F);
909  }
910  FPM.doFinalization();
911 #endif
912 }
913 
914 std::shared_ptr<GpuCompilationContext> CodeGenerator::generateNativeGPUCode(
915  Executor* executor,
916  llvm::Function* func,
917  llvm::Function* wrapper_func,
918  const std::unordered_set<llvm::Function*>& live_funcs,
919  const bool is_gpu_smem_used,
920  const CompilationOptions& co,
921  const GPUTarget& gpu_target) {
922 #ifdef HAVE_CUDA
923  auto timer = DEBUG_TIMER(__func__);
924  auto llvm_module = func->getParent();
925  /*
926  `func` is one of the following generated functions:
927  - `call_table_function(i8** %input_col_buffers, i64*
928  %input_row_count, i64** %output_buffers, i64* %output_row_count)`
929  that wraps the user-defined table function.
930  - `multifrag_query`
931  - `multifrag_query_hoisted_literals`
932  - ...
933 
934  `wrapper_func` is table_func_kernel(i32*, i8**, i64*, i64**,
935  i64*) that wraps `call_table_function`.
936 
937  `llvm_module` is from `build/QueryEngine/RuntimeFunctions.bc` and it
938  contains `func` and `wrapper_func`. `llvm_module` should also contain
939  the definitions of user-defined table functions.
940 
941  `live_funcs` contains table_func_kernel and call_table_function
942 
943  `gpu_target.cgen_state->module_` appears to be the same as `llvm_module`
944  */
945  CHECK(gpu_target.cgen_state->module_ == llvm_module);
946  CHECK(func->getParent() == wrapper_func->getParent());
947  llvm_module->setDataLayout(
948  "e-p:64:64:64-i1:8:8-i8:8:8-"
949  "i16:16:16-i32:32:32-i64:64:64-"
950  "f32:32:32-f64:64:64-v16:16:16-"
951  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
952  llvm_module->setTargetTriple("nvptx64-nvidia-cuda");
953  CHECK(gpu_target.nvptx_target_machine);
954  llvm::PassManagerBuilder pass_manager_builder = llvm::PassManagerBuilder();
955 
956  pass_manager_builder.OptLevel = 0;
957  llvm::legacy::PassManager module_pass_manager;
958  pass_manager_builder.populateModulePassManager(module_pass_manager);
959 
960  bool requires_libdevice = check_module_requires_libdevice(llvm_module);
961 
962  if (requires_libdevice) {
963  linkModuleWithLibdevice(executor, *llvm_module, pass_manager_builder, gpu_target);
964  }
965 
966  // run optimizations
967  optimize_ir(func, llvm_module, module_pass_manager, live_funcs, is_gpu_smem_used, co);
968  legalize_nvvm_ir(func);
969 
970  std::stringstream ss;
971  llvm::raw_os_ostream os(ss);
972 
973  llvm::LLVMContext& ctx = llvm_module->getContext();
974  // Get "nvvm.annotations" metadata node
975  llvm::NamedMDNode* md = llvm_module->getOrInsertNamedMetadata("nvvm.annotations");
976 
977  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
978  llvm::MDString::get(ctx, "kernel"),
979  llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
980  llvm::Type::getInt32Ty(ctx), 1))};
981 
982  // Append metadata to nvvm.annotations
983  md->addOperand(llvm::MDNode::get(ctx, md_vals));
984 
985  std::unordered_set<llvm::Function*> roots{wrapper_func, func};
986  if (gpu_target.row_func_not_inlined) {
987  clear_function_attributes(gpu_target.cgen_state->row_func_);
988  roots.insert(gpu_target.cgen_state->row_func_);
989  if (gpu_target.cgen_state->filter_func_) {
990  roots.insert(gpu_target.cgen_state->filter_func_);
991  }
992  }
993 
994  // prevent helper functions from being removed
995  for (auto f : gpu_target.cgen_state->helper_functions_) {
996  roots.insert(f);
997  }
998 
999  if (requires_libdevice) {
1000  for (llvm::Function& F : *llvm_module) {
1001  // Some libdevice functions call other functions whose names start with the
1002  // "__internal_" prefix, e.g.:
1003  // __internal_trig_reduction_slowpathd
1004  // __internal_accurate_pow
1005  // __internal_lgamma_pos
1006  // Those functions have a "noinline" attribute which prevents the optimizer from
1007  // inlining them into the body of @query_func
1008  if (F.hasName() && F.getName().startswith("__internal") && !F.isDeclaration()) {
1009  roots.insert(&F);
1010  }
1011  legalize_nvvm_ir(&F);
1012  }
1013  }
1014 
1015  // Prevent the udf function(s) from being removed the way the runtime functions are
1016  std::unordered_set<std::string> udf_declarations;
1017 
1018  if (executor->has_udf_module(/*is_gpu=*/true)) {
1019  for (auto& f : executor->get_udf_module(/*is_gpu=*/true)->getFunctionList()) {
1020  llvm::Function* udf_function = llvm_module->getFunction(f.getName());
1021 
1022  if (udf_function) {
1023  legalize_nvvm_ir(udf_function);
1024  roots.insert(udf_function);
1025 
1026  // If we have a UDF that declares an external function,
1027  // note it so we can avoid duplicate declarations
1028  if (f.isDeclaration()) {
1029  udf_declarations.insert(f.getName().str());
1030  }
1031  }
1032  }
1033  }
1034 
1035  if (executor->has_rt_udf_module(/*is_gpu=*/true)) {
1036  for (auto& f : executor->get_rt_udf_module(/*is_gpu=*/true)->getFunctionList()) {
1037  llvm::Function* udf_function = llvm_module->getFunction(f.getName());
1038  if (udf_function) {
1039  legalize_nvvm_ir(udf_function);
1040  roots.insert(udf_function);
1041 
1042  // If we have a UDF that declares an external function,
1043  // note it so we can avoid duplicate declarations
1044  if (f.isDeclaration()) {
1045  udf_declarations.insert(f.getName().str());
1046  }
1047  }
1048  }
1049  }
1050 
1051  std::vector<llvm::Function*> rt_funcs;
1052  for (auto& Fn : *llvm_module) {
1053  if (roots.count(&Fn)) {
1054  continue;
1055  }
1056  rt_funcs.push_back(&Fn);
1057  }
1058  for (auto& pFn : rt_funcs) {
1059  pFn->removeFromParent();
1060  }
1061 
1062  if (requires_libdevice) {
1063  add_intrinsics_to_module(llvm_module);
1064  }
1065 
1066  llvm_module->print(os, nullptr);
1067  os.flush();
1068 
1069  for (auto& pFn : rt_funcs) {
1070  llvm_module->getFunctionList().push_back(pFn);
1071  }
1072  llvm_module->eraseNamedMetadata(md);
1073 
1074  auto cuda_llir = ss.str() + cuda_rt_decls + extension_function_decls(udf_declarations);
1075  std::string ptx;
1076  try {
1077  ptx = generatePTX(
1078  cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state->context_);
1079  } catch (ParseIRError& e) {
1080  LOG(WARNING) << "Failed to generate PTX: " << e.what()
1081  << ". Switching to CPU execution target.";
1082  throw QueryMustRunOnCpu();
1083  }
1084  LOG(PTX) << "PTX for the GPU:\n" << ptx << "\nEnd of PTX";
1085 
1086  auto cubin_result = ptx_to_cubin(ptx, gpu_target.cuda_mgr);
1087  auto& option_keys = cubin_result.option_keys;
1088  auto& option_values = cubin_result.option_values;
1089  auto cubin = cubin_result.cubin;
1090  auto link_state = cubin_result.link_state;
1091  const auto num_options = option_keys.size();
1092 
1093  auto func_name = wrapper_func->getName().str();
1094  auto gpu_compilation_context = std::make_shared<GpuCompilationContext>();
1095  for (int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
1096  ++device_id) {
1097  gpu_compilation_context->addDeviceCode(
1098  std::make_unique<GpuDeviceCompilationContext>(cubin,
1099  func_name,
1100  device_id,
1101  gpu_target.cuda_mgr,
1102  num_options,
1103  &option_keys[0],
1104  &option_values[0]));
1105  }
1106 
1107  checkCudaErrors(cuLinkDestroy(link_state));
1108  return gpu_compilation_context;
1109 #else
1110  return {};
1111 #endif
1112 }
1113 
1114 std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenGPU(
1115  llvm::Function* query_func,
1116  llvm::Function* multifrag_query_func,
1117  std::unordered_set<llvm::Function*>& live_funcs,
1118  const bool no_inline,
1119  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
1120  const bool is_gpu_smem_used,
1121  const CompilationOptions& co) {
1122 #ifdef HAVE_CUDA
1123  auto timer = DEBUG_TIMER(__func__);
1124 
1125  CHECK(cuda_mgr);
1126  CodeCacheKey key{serialize_llvm_object(query_func),
1127  serialize_llvm_object(cgen_state_->row_func_)};
1128  if (cgen_state_->filter_func_) {
1129  key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
1130  }
1131  for (const auto helper : cgen_state_->helper_functions_) {
1132  key.push_back(serialize_llvm_object(helper));
1133  }
1134  auto cached_code = QueryEngine::getInstance()->gpu_code_accessor->get_value(key);
1135  if (cached_code) {
1136  return cached_code;
1137  }
1138 
1139  bool row_func_not_inlined = false;
1140  if (no_inline) {
1141  for (auto it = llvm::inst_begin(cgen_state_->row_func_),
1142  e = llvm::inst_end(cgen_state_->row_func_);
1143  it != e;
1144  ++it) {
1145  if (llvm::isa<llvm::CallInst>(*it)) {
1146  auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
1147  if (get_gv_call.getCalledFunction()->getName() == "array_size" ||
1148  get_gv_call.getCalledFunction()->getName() == "linear_probabilistic_count") {
1149  mark_function_never_inline(cgen_state_->row_func_);
1150  row_func_not_inlined = true;
1151  break;
1152  }
1153  }
1154  }
1155  }
1156 
1157  initializeNVPTXBackend();
1158  CodeGenerator::GPUTarget gpu_target{
1159  nvptx_target_machine_.get(), cuda_mgr, cgen_state_.get(), row_func_not_inlined};
1160  std::shared_ptr<GpuCompilationContext> compilation_context;
1161 
1162  try {
1163  compilation_context = CodeGenerator::generateNativeGPUCode(this,
1164  query_func,
1165  multifrag_query_func,
1166  live_funcs,
1167  is_gpu_smem_used,
1168  co,
1169  gpu_target);
1170  } catch (CudaMgr_Namespace::CudaErrorException& cuda_error) {
1171  if (cuda_error.getStatus() == CUDA_ERROR_OUT_OF_MEMORY) {
1172  // Thrown if memory could not be allocated on the GPU.
1173  // Retry once after evicting a portion of the code cache.
1174  LOG(WARNING) << "Failed to allocate GPU memory for generated code. Evicting "
1175  << g_fraction_code_cache_to_evict * 100.
1176  << "% of GPU code cache and re-trying.";
1177  QueryEngine::getInstance()->gpu_code_accessor->evictFractionEntries(
1178  g_fraction_code_cache_to_evict);
1179  compilation_context = CodeGenerator::generateNativeGPUCode(this,
1180  query_func,
1181  multifrag_query_func,
1182  live_funcs,
1183  is_gpu_smem_used,
1184  co,
1185  gpu_target);
1186  } else {
1187  throw;
1188  }
1189  }
1190  QueryEngine::getInstance()->gpu_code_accessor->put(key, compilation_context);
1191  return std::dynamic_pointer_cast<CompilationContext>(compilation_context);
1192 #else
1193  return nullptr;
1194 #endif
1195 }
1196 
1197 std::string CodeGenerator::generatePTX(const std::string& cuda_llir,
1198  llvm::TargetMachine* nvptx_target_machine,
1199  llvm::LLVMContext& context) {
1200  auto timer = DEBUG_TIMER(__func__);
1201  auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir, "", false);
1202 
1203  llvm::SMDiagnostic parse_error;
1204 
1205  auto llvm_module = llvm::parseIR(mem_buff->getMemBufferRef(), parse_error, context);
1206  if (!llvm_module) {
1207  LOG(IR) << "CodeGenerator::generatePTX:NVVM IR:\n" << cuda_llir << "\nEnd of NVVM IR";
1208  throw_parseIR_error(parse_error, "generatePTX", /* is_gpu= */ true);
1209  }
1210 
1211  llvm::SmallString<256> code_str;
1212  llvm::raw_svector_ostream formatted_os(code_str);
1213  CHECK(nvptx_target_machine);
1214  {
1215  llvm::legacy::PassManager ptxgen_pm;
1216  llvm_module->setDataLayout(nvptx_target_machine->createDataLayout());
1217 
1218 #if LLVM_VERSION_MAJOR >= 10
1219  nvptx_target_machine->addPassesToEmitFile(
1220  ptxgen_pm, formatted_os, nullptr, llvm::CGFT_AssemblyFile);
1221 #else
1222  nvptx_target_machine->addPassesToEmitFile(
1223  ptxgen_pm, formatted_os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
1224 #endif
1225  ptxgen_pm.run(*llvm_module);
1226  }
1227 
1228 #if LLVM_VERSION_MAJOR >= 11
1229  return std::string(code_str);
1230 #else
1231  return code_str.str();
1232 #endif
1233 }
1234 
1235 std::unique_ptr<llvm::TargetMachine> CodeGenerator::initializeNVPTXBackend(
1236  const CudaMgr_Namespace::NvidiaDeviceArch arch) {
1237  auto timer = DEBUG_TIMER(__func__);
1238  llvm::InitializeAllTargets();
1239  llvm::InitializeAllTargetMCs();
1240  llvm::InitializeAllAsmPrinters();
1241  std::string err;
1242  auto target = llvm::TargetRegistry::lookupTarget("nvptx64", err);
1243  if (!target) {
1244  LOG(FATAL) << err;
1245  }
1246  return std::unique_ptr<llvm::TargetMachine>(
1247  target->createTargetMachine("nvptx64-nvidia-cuda",
1248  CudaMgr_Namespace::CudaMgr::deviceArchToSM(arch),
1249  "",
1250  llvm::TargetOptions(),
1251  llvm::Reloc::Static));
1252 }
1253 
1254 std::string Executor::generatePTX(const std::string& cuda_llir) const {
1255  return CodeGenerator::generatePTX(
1256  cuda_llir, nvptx_target_machine_.get(), cgen_state_->context_);
1257 }
1258 
1259 void Executor::initializeNVPTXBackend() const {
1260  if (nvptx_target_machine_) {
1261  return;
1262  }
1263  const auto arch = cudaMgr()->getDeviceArch();
1264  nvptx_target_machine_ = CodeGenerator::initializeNVPTXBackend(arch);
1265 }
1266 
1267 // A small number of runtime functions don't get through CgenState::emitCall. List them
1268 // explicitly here and always clone their implementation from the runtime module.
1269 bool CodeGenerator::alwaysCloneRuntimeFunction(const llvm::Function* func) {
1270  return func->getName() == "query_stub_hoisted_literals" ||
1271  func->getName() == "multifrag_query_hoisted_literals" ||
1272  func->getName() == "query_stub" || func->getName() == "multifrag_query" ||
1273  func->getName() == "fixed_width_int_decode" ||
1274  func->getName() == "fixed_width_unsigned_decode" ||
1275  func->getName() == "diff_fixed_width_int_decode" ||
1276  func->getName() == "fixed_width_double_decode" ||
1277  func->getName() == "fixed_width_float_decode" ||
1278  func->getName() == "fixed_width_small_date_decode" ||
1279  func->getName() == "fixed_width_date_encode" ||
1280  func->getName() == "record_error_code" || func->getName() == "get_error_code" ||
1281  func->getName() == "pos_start_impl" || func->getName() == "pos_step_impl" ||
1282  func->getName() == "group_buff_idx_impl" ||
1283  func->getName() == "init_shared_mem" ||
1284  func->getName() == "init_shared_mem_nop" || func->getName() == "write_back_nop";
1285 }
1286 
1287 std::unique_ptr<llvm::Module> read_llvm_module_from_bc_file(
1288  const std::string& bc_filename,
1289  llvm::LLVMContext& context) {
1290  llvm::SMDiagnostic err;
1291 
1292  auto buffer_or_error = llvm::MemoryBuffer::getFile(bc_filename);
1293  CHECK(!buffer_or_error.getError()) << "bc_filename=" << bc_filename;
1294  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1295 
1296  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1297  CHECK(!owner.takeError());
1298  CHECK(owner->get());
1299  return std::move(owner.get());
1300 }
1301 
1302 std::unique_ptr<llvm::Module> read_llvm_module_from_ir_file(
1303  const std::string& udf_ir_filename,
1304  llvm::LLVMContext& ctx,
1305  bool is_gpu = false) {
1306  llvm::SMDiagnostic parse_error;
1307 
1308  llvm::StringRef file_name_arg(udf_ir_filename);
1309 
1310  auto owner = llvm::parseIRFile(file_name_arg, parse_error, ctx);
1311  if (!owner) {
1312  throw_parseIR_error(parse_error, udf_ir_filename, is_gpu);
1313  }
1314 
1315  if (is_gpu) {
1316  llvm::Triple gpu_triple(owner->getTargetTriple());
1317  if (!gpu_triple.isNVPTX()) {
1318  LOG(WARNING)
1319  << "Expected triple nvptx64-nvidia-cuda for NVVM IR of loadtime UDFs but got "
1320  << gpu_triple.str() << ". Disabling the NVVM IR module.";
1321  return std::unique_ptr<llvm::Module>();
1322  }
1323  }
1324  return owner;
1325 }
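// A brief usage sketch (hypothetical file names): load a CPU UDF module and a GPU one
// from IR files; the GPU variant is dropped unless its target triple is an NVPTX triple,
// in which case an empty unique_ptr comes back and GPU UDFs are disabled.
//
//   llvm::LLVMContext ctx;
//   auto cpu_udf_module = read_llvm_module_from_ir_file("/path/to/udf_cpu.ll", ctx);
//   auto gpu_udf_module =
//       read_llvm_module_from_ir_file("/path/to/udf_gpu.ll", ctx, /*is_gpu=*/true);
//   if (!gpu_udf_module) {
//     // the file did not carry an nvptx64-nvidia-cuda triple
//   }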
1326 
1327 std::unique_ptr<llvm::Module> read_llvm_module_from_ir_string(
1328  const std::string& udf_ir_string,
1329  llvm::LLVMContext& ctx,
1330  bool is_gpu = false) {
1331  llvm::SMDiagnostic parse_error;
1332 
1333  auto buf = std::make_unique<llvm::MemoryBufferRef>(udf_ir_string,
1334  "Runtime UDF/UDTF LLVM/NVVM IR");
1335 
1336  auto owner = llvm::parseIR(*buf, parse_error, ctx);
1337  if (!owner) {
1338  LOG(IR) << "read_llvm_module_from_ir_string:\n"
1339  << udf_ir_string << "\nEnd of LLVM/NVVM IR";
1340  throw_parseIR_error(parse_error, "", /* is_gpu= */ is_gpu);
1341  }
1342 
1343  if (is_gpu) {
1344  llvm::Triple gpu_triple(owner->getTargetTriple());
1345  if (!gpu_triple.isNVPTX()) {
1346  LOG(IR) << "read_llvm_module_from_ir_string:\n"
1347  << udf_ir_string << "\nEnd of NVVM IR";
1348  LOG(WARNING) << "Expected triple nvptx64-nvidia-cuda for NVVM IR but got "
1349  << gpu_triple.str()
1350  << ". Executing runtime UDF/UDTFs on GPU will be disabled.";
1351  return std::unique_ptr<llvm::Module>();
1352
1353  }
1354  }
1355  return owner;
1356 }
1357 
1358 namespace {
1359 
1360 void bind_pos_placeholders(const std::string& pos_fn_name,
1361  const bool use_resume_param,
1362  llvm::Function* query_func,
1363  llvm::Module* llvm_module) {
1364  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
1365  ++it) {
1366  if (!llvm::isa<llvm::CallInst>(*it)) {
1367  continue;
1368  }
1369  auto& pos_call = llvm::cast<llvm::CallInst>(*it);
1370  if (std::string(pos_call.getCalledFunction()->getName()) == pos_fn_name) {
1371  if (use_resume_param) {
1372  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1373  llvm::ReplaceInstWithInst(
1374  &pos_call,
1375  llvm::CallInst::Create(llvm_module->getFunction(pos_fn_name + "_impl"),
1376  error_code_arg));
1377  } else {
1378  llvm::ReplaceInstWithInst(
1379  &pos_call,
1380  llvm::CallInst::Create(llvm_module->getFunction(pos_fn_name + "_impl")));
1381  }
1382  break;
1383  }
1384  }
1385 }
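// Illustrative effect (IR sketch; exact types follow the runtime module): a placeholder
// call in the query function such as
//   %pos = call i32 @pos_start()
// is rewritten in place to
//   %pos = call i32 @pos_start_impl(i32* %error_code)
// when use_resume_param is true, or to the zero-argument @pos_start_impl otherwise.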
1386 
1387 void set_row_func_argnames(llvm::Function* row_func,
1388  const size_t in_col_count,
1389  const size_t agg_col_count,
1390  const bool hoist_literals) {
1391  auto arg_it = row_func->arg_begin();
1392 
1393  if (agg_col_count) {
1394  for (size_t i = 0; i < agg_col_count; ++i) {
1395  arg_it->setName("out");
1396  ++arg_it;
1397  }
1398  } else {
1399  arg_it->setName("group_by_buff");
1400  ++arg_it;
1401  arg_it->setName("varlen_output_buff");
1402  ++arg_it;
1403  arg_it->setName("crt_matched");
1404  ++arg_it;
1405  arg_it->setName("total_matched");
1406  ++arg_it;
1407  arg_it->setName("old_total_matched");
1408  ++arg_it;
1409  arg_it->setName("max_matched");
1410  ++arg_it;
1411  }
1412 
1413  arg_it->setName("agg_init_val");
1414  ++arg_it;
1415 
1416  arg_it->setName("pos");
1417  ++arg_it;
1418 
1419  arg_it->setName("frag_row_off");
1420  ++arg_it;
1421 
1422  arg_it->setName("num_rows_per_scan");
1423  ++arg_it;
1424 
1425  if (hoist_literals) {
1426  arg_it->setName("literals");
1427  ++arg_it;
1428  }
1429 
1430  for (size_t i = 0; i < in_col_count; ++i) {
1431  arg_it->setName("col_buf" + std::to_string(i));
1432  ++arg_it;
1433  }
1434 
1435  arg_it->setName("join_hash_tables");
1436  ++arg_it;
1437  arg_it->setName("row_func_mgr");
1438 }
1439 
1440 llvm::Function* create_row_function(const size_t in_col_count,
1441  const size_t agg_col_count,
1442  const bool hoist_literals,
1443  llvm::Module* llvm_module,
1444  llvm::LLVMContext& context) {
1445  std::vector<llvm::Type*> row_process_arg_types;
1446 
1447  if (agg_col_count) {
1448  // output (aggregate) arguments
1449  for (size_t i = 0; i < agg_col_count; ++i) {
1450  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1451  }
1452  } else {
1453  // group by buffer
1454  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1455  // varlen output buffer
1456  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1457  // current match count
1458  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1459  // total match count passed from the caller
1460  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1461  // old total match count returned to the caller
1462  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1463  // max matched (total number of slots in the output buffer)
1464  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1465  }
1466 
1467  // aggregate init values
1468  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1469 
1470  // position argument
1471  row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));
1472 
1473  // fragment row offset argument
1474  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1475 
1476  // number of rows for each scan
1477  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1478 
1479  // literals buffer argument
1480  if (hoist_literals) {
1481  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
1482  }
1483 
1484  // column buffer arguments
1485  for (size_t i = 0; i < in_col_count; ++i) {
1486  row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
1487  }
1488 
1489  // join hash table argument
1490  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1491 
1492  // row function manager
1493  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
1494 
1495  // generate the function
1496  auto ft =
1497  llvm::FunctionType::get(get_int_type(32, context), row_process_arg_types, false);
1499  auto row_func = llvm::Function::Create(
1500  ft, llvm::Function::ExternalLinkage, "row_func", llvm_module);
1501 
1502  // set the row function argument names; for debugging purposes only
1503  set_row_func_argnames(row_func, in_col_count, agg_col_count, hoist_literals);
1504 
1505  return row_func;
1506 }
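// For example (hypothetical shapes): with in_col_count = 2, agg_col_count = 1 and
// hoist_literals = true, the generated signature is roughly
//   i32 @row_func(i64* %out, i64* %agg_init_val, i64 %pos, i64* %frag_row_off,
//                 i64* %num_rows_per_scan, i8* %literals, i8* %col_buf0, i8* %col_buf1,
//                 i64* %join_hash_tables, i8* %row_func_mgr)
// with the group-by buffer and match-count pointers taking the place of %out when
// agg_col_count is zero.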
1507 
1508 // Iterate through multifrag_query_func, replacing calls to query_fname with query_func.
1509 void bind_query(llvm::Function* query_func,
1510  const std::string& query_fname,
1511  llvm::Function* multifrag_query_func,
1512  llvm::Module* llvm_module) {
1513  std::vector<llvm::CallInst*> query_stubs;
1514  for (auto it = llvm::inst_begin(multifrag_query_func),
1515  e = llvm::inst_end(multifrag_query_func);
1516  it != e;
1517  ++it) {
1518  if (!llvm::isa<llvm::CallInst>(*it)) {
1519  continue;
1520  }
1521  auto& query_call = llvm::cast<llvm::CallInst>(*it);
1522  if (std::string(query_call.getCalledFunction()->getName()) == query_fname) {
1523  query_stubs.push_back(&query_call);
1524  }
1525  }
1526  for (auto& S : query_stubs) {
1527  std::vector<llvm::Value*> args;
1528  for (size_t i = 0; i < S->getNumOperands() - 1; ++i) {
1529  args.push_back(S->getArgOperand(i));
1530  }
1531  llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args, ""));
1532  }
1533 }
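// Illustrative effect: every call to the named stub (e.g. "query_stub_hoisted_literals")
// found inside multifrag_query_func is replaced, via ReplaceInstWithInst, by a call to
// the freshly generated query function with the same argument list; only the trailing
// callee operand of the original CallInst is dropped.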
1534 
1535 std::vector<std::string> get_agg_fnames(const std::vector<Analyzer::Expr*>& target_exprs,
1536  const bool is_group_by) {
1537  std::vector<std::string> result;
1538  for (size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
1539  ++target_idx, ++agg_col_idx) {
1540  const auto target_expr = target_exprs[target_idx];
1541  CHECK(target_expr);
1542  const auto target_type_info = target_expr->get_type_info();
1543  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
1544  const bool is_varlen =
1545  (target_type_info.is_string() &&
1546  target_type_info.get_compression() == kENCODING_NONE) ||
1547  target_type_info.is_array(); // TODO: should it use is_varlen_array() ?
1548  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
1549  result.emplace_back(target_type_info.is_fp() ? "agg_id_double" : "agg_id");
1550  if (is_varlen) {
1551  result.emplace_back("agg_id");
1552  }
1553  if (target_type_info.is_geometry()) {
1554  result.emplace_back("agg_id");
1555  for (auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
1556  result.emplace_back("agg_id");
1557  }
1558  }
1559  continue;
1560  }
1561  const auto agg_type = agg_expr->get_aggtype();
1562  SQLTypeInfo agg_type_info;
1563  switch (agg_type) {
1564  case kCOUNT:
1565  case kCOUNT_IF:
1566  agg_type_info = target_type_info;
1567  break;
1568  default:
1569  agg_type_info = agg_expr->get_arg()->get_type_info();
1570  break;
1571  }
1572  switch (agg_type) {
1573  case kAVG: {
1574  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1575  !agg_type_info.is_fp()) {
1576  throw std::runtime_error("AVG is only valid on integer and floating point");
1577  }
1578  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1579  ? "agg_sum"
1580  : "agg_sum_double");
1581  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1582  ? "agg_count"
1583  : "agg_count_double");
1584  break;
1585  }
1586  case kMIN: {
1587  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1588  agg_type_info.is_geometry()) {
1589  throw std::runtime_error(
1590  "MIN on strings, arrays or geospatial types not supported yet");
1591  }
1592  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1593  ? "agg_min"
1594  : "agg_min_double");
1595  break;
1596  }
1597  case kMAX: {
1598  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1599  agg_type_info.is_geometry()) {
1600  throw std::runtime_error(
1601  "MAX on strings, arrays or geospatial types not supported yet");
1602  }
1603  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1604  ? "agg_max"
1605  : "agg_max_double");
1606  break;
1607  }
1608  case kSUM:
1609  case kSUM_IF: {
1610  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1611  !agg_type_info.is_fp()) {
1612  throw std::runtime_error(
1613  "SUM and SUM_IF is only valid on integer and floating point");
1614  }
1615  std::string func_name = (agg_type_info.is_integer() || agg_type_info.is_time())
1616  ? "agg_sum"
1617  : "agg_sum_double";
1618  if (agg_type == kSUM_IF) {
1619  func_name += "_if";
1620  }
1621  result.emplace_back(func_name);
1622  break;
1623  }
1624  case kCOUNT:
1625  result.emplace_back(agg_expr->get_is_distinct() ? "agg_count_distinct"
1626  : "agg_count");
1627  break;
1628  case kCOUNT_IF:
1629  result.emplace_back("agg_count_if");
1630  break;
1631  case kSINGLE_VALUE: {
1632  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1633  break;
1634  }
1635  case kSAMPLE: {
1636  // Note that varlen SAMPLE arguments are handled separately above
1637  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1638  break;
1639  }
1640  case kAPPROX_COUNT_DISTINCT:
1641  result.emplace_back("agg_approximate_count_distinct");
1642  break;
1643  case kAPPROX_QUANTILE:
1644  result.emplace_back("agg_approx_quantile");
1645  break;
1646  case kMODE:
1647  result.emplace_back("agg_mode_func");
1648  break;
1649  default:
1650  UNREACHABLE() << "Unsupported agg_type: " << agg_type;
1651  }
1652  }
1653  return result;
1654 }
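// A few examples of the mapping above (illustrative): AVG(float_col) expands to
// {"agg_sum_double", "agg_count_double"}, COUNT(DISTINCT x) to {"agg_count_distinct"},
// SUM_IF on an integer column to {"agg_sum_if"}, and a projected (non-aggregate)
// double-typed target to {"agg_id_double"}.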
1655 
1656 } // namespace
1657 
1658 void Executor::addUdfIrToModule(const std::string& udf_ir_filename,
1659  const bool is_cuda_ir) {
1660  Executor::extension_module_sources[is_cuda_ir
1661  ? Executor::ExtModuleKinds::udf_gpu_module
1662  : Executor::ExtModuleKinds::udf_cpu_module] =
1663  udf_ir_filename;
1664 }
1665 
1666 std::unordered_set<llvm::Function*> CodeGenerator::markDeadRuntimeFuncs(
1667  llvm::Module& llvm_module,
1668  const std::vector<llvm::Function*>& roots,
1669  const std::vector<llvm::Function*>& leaves) {
1670  auto timer = DEBUG_TIMER(__func__);
1671  std::unordered_set<llvm::Function*> live_funcs;
1672  live_funcs.insert(roots.begin(), roots.end());
1673  live_funcs.insert(leaves.begin(), leaves.end());
1674 
1675  if (auto F = llvm_module.getFunction("init_shared_mem_nop")) {
1676  live_funcs.insert(F);
1677  }
1678  if (auto F = llvm_module.getFunction("write_back_nop")) {
1679  live_funcs.insert(F);
1680  }
1681 
1682  for (const llvm::Function* F : roots) {
1683  for (const llvm::BasicBlock& BB : *F) {
1684  for (const llvm::Instruction& I : BB) {
1685  if (const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
1686  live_funcs.insert(CI->getCalledFunction());
1687  }
1688  }
1689  }
1690  }
1691 
1692  for (llvm::Function& F : llvm_module) {
1693  if (!live_funcs.count(&F) && !F.isDeclaration()) {
1694  F.setLinkage(llvm::GlobalValue::InternalLinkage);
1695  }
1696  }
1697 
1698  return live_funcs;
1699 }
1700 
1701 namespace {
1702 // searches for a particular variable within a specific basic block (or all if bb_name is
1703 // empty)
1704 template <typename InstType>
1705 llvm::Value* find_variable_in_basic_block(llvm::Function* func,
1706  std::string bb_name,
1707  std::string variable_name) {
1708  llvm::Value* result = nullptr;
1709  if (func == nullptr || variable_name.empty()) {
1710  return result;
1711  }
1712  bool is_found = false;
1713  for (auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
1714  if (!bb_name.empty() && bb_it->getName() != bb_name) {
1715  continue;
1716  }
1717  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
1718  if (llvm::isa<InstType>(*inst_it)) {
1719  if (inst_it->getName() == variable_name) {
1720  result = &*inst_it;
1721  is_found = true;
1722  break;
1723  }
1724  }
1725  }
1726  }
1727  return result;
1728 }
1729 } // namespace
1730 
1731 void Executor::createErrorCheckControlFlow(
1732  llvm::Function* query_func,
1733  bool run_with_dynamic_watchdog,
1734  bool run_with_allowing_runtime_interrupt,
1735  const std::vector<JoinLoop>& join_loops,
1736  ExecutorDeviceType device_type,
1737  const std::vector<InputTableInfo>& input_table_infos) {
1738  AUTOMATIC_IR_METADATA(cgen_state_.get());
1739 
1740  // check whether the row processing was successful; currently, it can
1741  // fail by running out of group by buffer slots
1742 
1743  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
1744  // when both the dynamic watchdog and the runtime interrupt are enabled,
1745  // we use the dynamic watchdog
1746  run_with_allowing_runtime_interrupt = false;
1747  }
1749  {
1750  // disable injecting query interrupt checker if the session info is invalid
1752  executor_session_mutex_);
1753  if (current_query_session_.empty()) {
1754  run_with_allowing_runtime_interrupt = false;
1755  }
1756  }
1757 
1758  llvm::Value* row_count = nullptr;
1759  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1760  device_type == ExecutorDeviceType::GPU) {
1761  row_count =
1762  find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
1763  }
1764 
1765  bool done_splitting = false;
1766  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1767  ++bb_it) {
1768  llvm::Value* pos = nullptr;
1769  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1770  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1771  llvm::isa<llvm::PHINode>(*inst_it)) {
1772  if (inst_it->getName() == "pos") {
1773  pos = &*inst_it;
1774  }
1775  continue;
1776  }
1777  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1778  continue;
1779  }
1780  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
1781  if (std::string(row_func_call.getCalledFunction()->getName()) == "row_process") {
1782  auto next_inst_it = inst_it;
1783  ++next_inst_it;
1784  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1785  auto& br_instr = bb_it->back();
1786  llvm::IRBuilder<> ir_builder(&br_instr);
1787  llvm::Value* err_lv = &*inst_it;
1788  llvm::Value* err_lv_returned_from_row_func = nullptr;
1789  if (run_with_dynamic_watchdog) {
1790  CHECK(pos);
1791  llvm::Value* call_watchdog_lv = nullptr;
1792  if (device_type == ExecutorDeviceType::GPU) {
1793  // In order to make sure all threads within a block see the same barrier,
1794  // only those blocks in which no thread has crossed the critical
1795  // edge go through the dynamic watchdog computation
1796  CHECK(row_count);
1797  auto crit_edge_rem =
1798  (blockSize() & (blockSize() - 1))
1799  ? ir_builder.CreateSRem(
1800  row_count,
1801  cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1802  : ir_builder.CreateAnd(
1803  row_count,
1804  cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1805  auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1806  crit_edge_threshold->setName("crit_edge_threshold");
1807 
1808  // only those threads where pos < crit_edge_threshold go through dynamic
1809  // watchdog call
1810  call_watchdog_lv =
1811  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
1812  } else {
1813  // CPU path: run watchdog for every 64th row
1814  auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1815  call_watchdog_lv = ir_builder.CreateICmp(
1816  llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1817  }
1818  CHECK(call_watchdog_lv);
1819  auto error_check_bb = bb_it->splitBasicBlock(
1820  llvm::BasicBlock::iterator(br_instr), ".error_check");
1821  auto& watchdog_br_instr = bb_it->back();
1822 
1823  auto watchdog_check_bb = llvm::BasicBlock::Create(
1824  cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
1825  llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1826  auto detected_timeout = watchdog_ir_builder.CreateCall(
1827  cgen_state_->module_->getFunction("dynamic_watchdog"), {});
1828  auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1829  detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
1830  watchdog_ir_builder.CreateBr(error_check_bb);
1831 
1832  llvm::ReplaceInstWithInst(
1833  &watchdog_br_instr,
1834  llvm::BranchInst::Create(
1835  watchdog_check_bb, error_check_bb, call_watchdog_lv));
1836  ir_builder.SetInsertPoint(&br_instr);
1837  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1838 
1839  unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1840  unified_err_lv->addIncoming(err_lv, &*bb_it);
1841  err_lv = unified_err_lv;
1842  } else if (run_with_allowing_runtime_interrupt) {
1843  CHECK(pos);
1844  llvm::Value* call_check_interrupt_lv{nullptr};
1845  llvm::Value* interrupt_err_lv{nullptr};
1846  llvm::BasicBlock* error_check_bb{nullptr};
1847  llvm::BasicBlock* interrupt_check_bb{nullptr};
1848  llvm::Instruction* check_interrupt_br_instr{nullptr};
1849 
1850  auto has_loop_join = std::any_of(
1851  join_loops.begin(), join_loops.end(), [](const JoinLoop& join_loop) {
1852  return join_loop.isNestedLoopJoin();
1853  });
1854  auto codegen_interrupt_checker = [&]() {
1855  error_check_bb = bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr),
1856  ".error_check");
1857  check_interrupt_br_instr = &bb_it->back();
1858 
1859  interrupt_check_bb = llvm::BasicBlock::Create(
1860  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
1861  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
1862  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
1863  cgen_state_->module_->getFunction("check_interrupt"), {});
1864  interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
1865  detected_interrupt,
1866  cgen_state_->llInt(Executor::ERR_INTERRUPTED),
1867  err_lv);
1868  interrupt_checker_ir_builder.CreateBr(error_check_bb);
1869  };
1870  if (has_loop_join) {
1871  codegen_interrupt_checker();
1872  CHECK(interrupt_check_bb);
1873  CHECK(check_interrupt_br_instr);
1874  llvm::ReplaceInstWithInst(check_interrupt_br_instr,
1875  llvm::BranchInst::Create(interrupt_check_bb));
1876  ir_builder.SetInsertPoint(&br_instr);
1877  err_lv = interrupt_err_lv;
1878  } else {
1879  if (device_type == ExecutorDeviceType::GPU) {
1880  // approximate how many times the %pos variable
1881  // is increased --> the number of iterations
1882  // here we calculate the # bit shift by considering grid/block/fragment
1883  // sizes since if we use a fixed one (i.e., every 64th increment) some CUDA
1884  // threads cannot enter the interrupt checking block depending on the
1885  // fragment size --> a thread may not process 64 rows if the outer
1886  // table is not sufficiently large, and so cannot be interrupted
1887  int32_t num_shift_by_gridDim = shared::getExpOfTwo(gridSize());
1888  int32_t num_shift_by_blockDim = shared::getExpOfTwo(blockSize());
1889  int64_t total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
1890  uint64_t interrupt_checking_freq = 32;
1891  auto freq_control_knob = g_running_query_interrupt_freq;
1892  CHECK_GT(freq_control_knob, 0);
1893  CHECK_LE(freq_control_knob, 1.0);
1894  if (!input_table_infos.empty()) {
1895  const auto& outer_table_info = *input_table_infos.begin();
1896  auto num_outer_table_tuples =
1897  outer_table_info.info.getFragmentNumTuplesUpperBound();
1898  if (num_outer_table_tuples > 0) {
1899  // gridSize * blockSize --> pos_step (idx of the next row per thread)
1900  // we additionally multiply pos_step by two since the number of
1901  // dispatched blocks is double the gridSize
1902  // # tuples (of fragment) / pos_step --> maximum # increments (K)
1903  // we also multiply K by (1 - freq_control_knob) to control the frequency:
1904  // to check the interrupt status more frequently, make K
1905  // smaller
1906  auto max_inc = uint64_t(
1907  floor(num_outer_table_tuples / (gridSize() * blockSize() * 2)));
1908  if (max_inc < 2) {
1909  // `max_inc` is too small, so this correction is necessary to keep
1910  // `interrupt_checking_freq` valid (i.e., larger than zero)
1911  max_inc = 2;
1912  }
1913  auto calibrated_inc =
1914  uint64_t(floor(max_inc * (1 - freq_control_knob)));
1915  interrupt_checking_freq =
1916  uint64_t(pow(2, shared::getExpOfTwo(calibrated_inc)));
1917  // handle the case when interrupt_checking_freq > K:
1918  // if so, some threads still cannot branch to the interrupt checker,
1919  // so we manually use a smaller value close to max_inc as the frequency
1920  if (interrupt_checking_freq > max_inc) {
1921  interrupt_checking_freq = max_inc / 2;
1922  }
1923  if (interrupt_checking_freq < 8) {
1924  // such a small frequency incurs overly frequent interrupt status checks,
1925  // so we clamp it to a reasonable minimum value
1926  interrupt_checking_freq = 8;
1927  }
1928  }
1929  }
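          // Worked example with hypothetical numbers: gridSize() = 64,
          // blockSize() = 1024 and a fragment upper bound of 1e8 tuples give
          // max_inc = floor(1e8 / (64 * 1024 * 2)) = 762; with
          // freq_control_knob = 0.1, calibrated_inc = floor(762 * 0.9) = 685 and
          // interrupt_checking_freq = 2^floor(log2(685)) = 512, so neither of the
          // two frequency corrections above applies.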
1930  VLOG(1) << "Set the running query interrupt checking frequency: "
1931  << interrupt_checking_freq;
1932  // check the interrupt flag for every interrupt_checking_freq-th iteration
1933  llvm::Value* pos_shifted_per_iteration =
1934  ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
1935  auto interrupt_predicate = ir_builder.CreateAnd(pos_shifted_per_iteration,
1936  interrupt_checking_freq);
1937  call_check_interrupt_lv =
1938  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1939  interrupt_predicate,
1940  cgen_state_->llInt(int64_t(0LL)));
1941  } else {
1942  // CPU path: run interrupt checker for every 64th row
1943  auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1944  call_check_interrupt_lv =
1945  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1946  interrupt_predicate,
1947  cgen_state_->llInt(int64_t(0LL)));
1948  }
1949  codegen_interrupt_checker();
1950  CHECK(call_check_interrupt_lv);
1951  CHECK(interrupt_err_lv);
1952  CHECK(interrupt_check_bb);
1953  CHECK(error_check_bb);
1954  CHECK(check_interrupt_br_instr);
1955  llvm::ReplaceInstWithInst(
1956  check_interrupt_br_instr,
1957  llvm::BranchInst::Create(
1958  interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
1959  ir_builder.SetInsertPoint(&br_instr);
1960  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1961 
1962  unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
1963  unified_err_lv->addIncoming(err_lv, &*bb_it);
1964  err_lv = unified_err_lv;
1965  }
1966  }
1967  if (!err_lv_returned_from_row_func) {
1968  err_lv_returned_from_row_func = err_lv;
1969  }
1970  if (device_type == ExecutorDeviceType::GPU && run_with_dynamic_watchdog) {
1971  // let kernel execution finish as expected, regardless of the observed error,
1972  // unless it is from the dynamic watchdog where all threads within that block
1973  // return together.
1974  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1975  err_lv,
1976  cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
1977  } else {
1978  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
1979  err_lv,
1980  cgen_state_->llInt(static_cast<int32_t>(0)));
1981  }
1982  auto error_bb = llvm::BasicBlock::Create(
1983  cgen_state_->context_, ".error_exit", query_func, new_bb);
1984  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1985  llvm::CallInst::Create(
1986  cgen_state_->module_->getFunction("record_error_code"),
1987  std::vector<llvm::Value*>{err_lv_returned_from_row_func, error_code_arg},
1988  "",
1989  error_bb);
1990  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
1991  llvm::ReplaceInstWithInst(&br_instr,
1992  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
1993  done_splitting = true;
1994  break;
1995  }
1996  }
1997  }
1998  CHECK(done_splitting);
1999 }
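// Rough shape of the control flow produced above (sketch): the block containing the
// row_process call is split right after the call; a conditional branch then routes
// selected iterations through ".watchdog_check" (or ".interrupt_check") into
// ".error_check", where the unified error value either falls through to the
// continuation block or branches to ".error_exit", which records the error code via
// record_error_code and returns.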
2000 
2001 void Executor::AutoTrackBuffersInRuntimeIR() {
2002  llvm::Module* M = cgen_state_->module_;
2003  if (M->getFunction("allocate_varlen_buffer") == nullptr)
2004  return;
2005 
2006  // read metadata
2007  bool should_track = false;
2008  auto* flag = M->getModuleFlag("manage_memory_buffer");
2009  if (auto* cnt = llvm::mdconst::extract_or_null<llvm::ConstantInt>(flag)) {
2010  if (cnt->getZExtValue() == 1) {
2011  should_track = true;
2012  }
2013  }
2014 
2015  if (!should_track) {
2016  // metadata is not present
2017  return;
2018  }
2019 
2020  LOG(INFO) << "Found 'manage_memory_buffer' metadata.";
2021  llvm::SmallVector<llvm::CallInst*, 4> calls_to_analyze;
2022 
2023  for (llvm::Function& F : *M) {
2024  for (llvm::BasicBlock& BB : F) {
2025  for (llvm::Instruction& I : BB) {
2026  if (llvm::CallInst* CI = llvm::dyn_cast<llvm::CallInst>(&I)) {
2027  // Keep track of calls to "allocate_varlen_buffer" for later processing
2028  llvm::Function* called = CI->getCalledFunction();
2029  if (called) {
2030  if (called->getName() == "allocate_varlen_buffer") {
2031  calls_to_analyze.push_back(CI);
2032  }
2033  }
2034  }
2035  }
2036  }
2037  }
2038 
2039  // for each call to "allocate_varlen_buffer", check if there's a corresponding
2040  // call to "register_buffer_with_executor_rsm". If not, add a call to it
2041  llvm::IRBuilder<> Builder(cgen_state_->context_);
2042  auto i64 = get_int_type(64, cgen_state_->context_);
2043  auto i8p = get_int_ptr_type(8, cgen_state_->context_);
2044  auto void_ = llvm::Type::getVoidTy(cgen_state_->context_);
2045  llvm::FunctionType* fnty = llvm::FunctionType::get(void_, {i64, i8p}, false);
2046  llvm::FunctionCallee register_buffer_fn =
2047  M->getOrInsertFunction("register_buffer_with_executor_rsm", fnty, {});
2048 
2049  int64_t executor_addr = reinterpret_cast<int64_t>(this);
2050  for (llvm::CallInst* CI : calls_to_analyze) {
2051  bool found = false;
2052  // for each user of the returned buffer, check if it is a CallInst
2053  // and if that CallInst is calling "register_buffer_with_executor_rsm";
2054  // if no such instruction exists, add one registering the buffer
2055  for (llvm::User* U : CI->users()) {
2056  if (llvm::CallInst* call = llvm::dyn_cast<llvm::CallInst>(U)) {
2057  if (call->getCalledFunction() &&
2058  call->getCalledFunction()->getName() == "register_buffer_with_executor_rsm") {
2059  found = true;
2060  break;
2061  }
2062  }
2063  }
2064  if (!found) {
2065  Builder.SetInsertPoint(CI->getNextNode());
2066  Builder.CreateCall(register_buffer_fn,
2067  {ll_int(executor_addr, cgen_state_->context_), CI});
2068  }
2069  }
2070 }
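// Illustrative result (IR sketch; the allocate_varlen_buffer operands are assumptions):
// a call like
//   %buf = call i8* @allocate_varlen_buffer(i64 %count, i64 %elem_size)
// that has no registering user gets the following call inserted right after it,
// where <addr> is the integer value of this Executor pointer:
//   call void @register_buffer_with_executor_rsm(i64 <addr>, i8* %buf)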
2071 
2072 std::vector<llvm::Value*> Executor::inlineHoistedLiterals() {
2073  AUTOMATIC_IR_METADATA(cgen_state_.get());
2074 
2075  std::vector<llvm::Value*> hoisted_literals;
2076 
2077  // row_func_ is using literals whose defs have been hoisted up to the query_func_,
2078  // extend row_func_ signature to include extra args to pass these literal values.
2079  std::vector<llvm::Type*> row_process_arg_types;
2080 
2081  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
2082  E = cgen_state_->row_func_->arg_end();
2083  I != E;
2084  ++I) {
2085  row_process_arg_types.push_back(I->getType());
2086  }
2087 
2088  for (auto& element : cgen_state_->query_func_literal_loads_) {
2089  for (auto value : element.second) {
2090  row_process_arg_types.push_back(value->getType());
2091  }
2092  }
2093 
2094  auto ft = llvm::FunctionType::get(
2095  get_int_type(32, cgen_state_->context_), row_process_arg_types, false);
2096  auto row_func_with_hoisted_literals =
2097  llvm::Function::Create(ft,
2098  llvm::Function::ExternalLinkage,
2099  "row_func_hoisted_literals",
2100  cgen_state_->row_func_->getParent());
2101 
2102  auto row_func_arg_it = row_func_with_hoisted_literals->arg_begin();
2103  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
2104  E = cgen_state_->row_func_->arg_end();
2105  I != E;
2106  ++I) {
2107  if (I->hasName()) {
2108  row_func_arg_it->setName(I->getName());
2109  }
2110  ++row_func_arg_it;
2111  }
2112 
2113  decltype(row_func_with_hoisted_literals) filter_func_with_hoisted_literals{nullptr};
2114  decltype(row_func_arg_it) filter_func_arg_it{nullptr};
2115  if (cgen_state_->filter_func_) {
2116  // filter_func_ is using literals whose defs have been hoisted up to the row_func_,
2117  // extend filter_func_ signature to include extra args to pass these literal values.
2118  std::vector<llvm::Type*> filter_func_arg_types;
2119 
2120  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2121  E = cgen_state_->filter_func_->arg_end();
2122  I != E;
2123  ++I) {
2124  filter_func_arg_types.push_back(I->getType());
2125  }
2126 
2127  for (auto& element : cgen_state_->query_func_literal_loads_) {
2128  for (auto value : element.second) {
2129  filter_func_arg_types.push_back(value->getType());
2130  }
2131  }
2132 
2133  auto ft2 = llvm::FunctionType::get(
2134  get_int_type(32, cgen_state_->context_), filter_func_arg_types, false);
2135  filter_func_with_hoisted_literals =
2136  llvm::Function::Create(ft2,
2137  llvm::Function::ExternalLinkage,
2138  "filter_func_hoisted_literals",
2139  cgen_state_->filter_func_->getParent());
2140 
2141  filter_func_arg_it = filter_func_with_hoisted_literals->arg_begin();
2142  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2143  E = cgen_state_->filter_func_->arg_end();
2144  I != E;
2145  ++I) {
2146  if (I->hasName()) {
2147  filter_func_arg_it->setName(I->getName());
2148  }
2149  ++filter_func_arg_it;
2150  }
2151  }
2152 
2153  std::unordered_map<int, std::vector<llvm::Value*>>
2154  query_func_literal_loads_function_arguments,
2155  query_func_literal_loads_function_arguments2;
2156 
2157  for (auto& element : cgen_state_->query_func_literal_loads_) {
2158  std::vector<llvm::Value*> argument_values, argument_values2;
2159 
2160  for (auto value : element.second) {
2161  hoisted_literals.push_back(value);
2162  argument_values.push_back(&*row_func_arg_it);
2163  if (cgen_state_->filter_func_) {
2164  argument_values2.push_back(&*filter_func_arg_it);
2165  cgen_state_->filter_func_args_[&*row_func_arg_it] = &*filter_func_arg_it;
2166  }
2167  if (value->hasName()) {
2168  row_func_arg_it->setName("arg_" + value->getName());
2169  if (cgen_state_->filter_func_) {
2170  filter_func_arg_it->getContext();
2171  filter_func_arg_it->setName("arg_" + value->getName());
2172  }
2173  }
2174  ++row_func_arg_it;
2175  ++filter_func_arg_it;
2176  }
2177 
2178  query_func_literal_loads_function_arguments[element.first] = argument_values;
2179  query_func_literal_loads_function_arguments2[element.first] = argument_values2;
2180  }
2181 
2182  // copy the row_func function body over
2183  // see
2184  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
2185  row_func_with_hoisted_literals->getBasicBlockList().splice(
2186  row_func_with_hoisted_literals->begin(),
2187  cgen_state_->row_func_->getBasicBlockList());
2188 
2189  // also replace row_func arguments with the arguments from row_func_hoisted_literals
2190  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
2191  E = cgen_state_->row_func_->arg_end(),
2192  I2 = row_func_with_hoisted_literals->arg_begin();
2193  I != E;
2194  ++I) {
2195  I->replaceAllUsesWith(&*I2);
2196  I2->takeName(&*I);
2197  cgen_state_->filter_func_args_.replace(&*I, &*I2);
2198  ++I2;
2199  }
2200 
2201  cgen_state_->row_func_ = row_func_with_hoisted_literals;
2202 
2203  // and finally replace literal placeholders
2204  std::vector<llvm::Instruction*> placeholders;
2205  std::string prefix("__placeholder__literal_");
2206  for (auto it = llvm::inst_begin(row_func_with_hoisted_literals),
2207  e = llvm::inst_end(row_func_with_hoisted_literals);
2208  it != e;
2209  ++it) {
2210  if (it->hasName() && it->getName().startswith(prefix)) {
2211  auto offset_and_index_entry =
2212  cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
2213  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2214 
2215  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2216  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2217 
2218  it->replaceAllUsesWith(
2219  query_func_literal_loads_function_arguments[lit_off][lit_idx]);
2220  placeholders.push_back(&*it);
2221  }
2222  }
2223  for (auto placeholder : placeholders) {
2224  placeholder->removeFromParent();
2225  }
2226 
2227  if (cgen_state_->filter_func_) {
2228  // copy the filter_func function body over
2229  // see
2230  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
2231  filter_func_with_hoisted_literals->getBasicBlockList().splice(
2232  filter_func_with_hoisted_literals->begin(),
2233  cgen_state_->filter_func_->getBasicBlockList());
2234 
2235  // also replace filter_func arguments with the arguments from
2236  // filter_func_hoisted_literals
2237  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2238  E = cgen_state_->filter_func_->arg_end(),
2239  I2 = filter_func_with_hoisted_literals->arg_begin();
2240  I != E;
2241  ++I) {
2242  I->replaceAllUsesWith(&*I2);
2243  I2->takeName(&*I);
2244  ++I2;
2245  }
2246 
2247  cgen_state_->filter_func_ = filter_func_with_hoisted_literals;
2248 
2249  // and finally replace literal placeholders
2250  std::vector<llvm::Instruction*> placeholders;
2251  std::string prefix("__placeholder__literal_");
2252  for (auto it = llvm::inst_begin(filter_func_with_hoisted_literals),
2253  e = llvm::inst_end(filter_func_with_hoisted_literals);
2254  it != e;
2255  ++it) {
2256  if (it->hasName() && it->getName().startswith(prefix)) {
2257  auto offset_and_index_entry = cgen_state_->row_func_hoisted_literals_.find(
2258  llvm::dyn_cast<llvm::Value>(&*it));
2259  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2260 
2261  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2262  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2263 
2264  it->replaceAllUsesWith(
2265  query_func_literal_loads_function_arguments2[lit_off][lit_idx]);
2266  placeholders.push_back(&*it);
2267  }
2268  }
2269  for (auto placeholder : placeholders) {
2270  placeholder->removeFromParent();
2271  }
2272  }
2273 
2274  return hoisted_literals;
2275 }
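// Net effect (sketch): row_func_ (and filter_func_, if present) is rebuilt as
// "row_func_hoisted_literals" with one extra trailing argument per hoisted literal
// load, each named "arg_<literal name>"; every "__placeholder__literal_*" instruction
// in the moved function body is then replaced with the matching new argument and
// removed, and the returned vector of hoisted literal values is what the caller
// appends to the row function call site.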
2276 
2277 namespace {
2278 
2279 size_t get_shared_memory_size(const bool shared_mem_used,
2280  const QueryMemoryDescriptor* query_mem_desc_ptr) {
2281  return shared_mem_used
2282  ? (query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount())
2283  : 0;
2284 }
2285 
2286 bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor* query_mem_desc_ptr,
2287  const RelAlgExecutionUnit& ra_exe_unit,
2288  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
2289  const ExecutorDeviceType device_type,
2290  const unsigned cuda_blocksize,
2291  const unsigned num_blocks_per_mp) {
2292  if (device_type == ExecutorDeviceType::CPU) {
2293  return false;
2294  }
2295  if (query_mem_desc_ptr->didOutputColumnar()) {
2296  return false;
2297  }
2298  CHECK(query_mem_desc_ptr);
2299  CHECK(cuda_mgr);
2300  /*
2301  * We only use shared memory strategy if GPU hardware provides native shared
2302  * memory atomics support. From CUDA Toolkit documentation:
2303  * https://docs.nvidia.com/cuda/pascal-tuning-guide/index.html#atomic-ops "Like
2304  * Maxwell, Pascal [and Volta] provides native shared memory atomic operations
2305  * for 32-bit integer arithmetic, along with native 32 or 64-bit compare-and-swap
2306  * (CAS)."
2307  *
2308  */
2309  if (!cuda_mgr->isArchMaxwellOrLaterForAll()) {
2310  return false;
2311  }
2312 
2313  if (query_mem_desc_ptr->getQueryDescriptionType() ==
2314  QueryDescriptionType::NonGroupedAggregate &&
2315  g_enable_smem_non_grouped_agg &&
2316  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty()) {
2317  // TODO: relax this, if necessary
2318  if (cuda_blocksize < query_mem_desc_ptr->getEntryCount()) {
2319  return false;
2320  }
2321  // skip shared memory usage when dealing with 1) variable length targets, 2)
2322  // aggregates other than COUNT
2323  const auto target_infos =
2324  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
2325  std::unordered_set<SQLAgg> supported_aggs{kCOUNT, kCOUNT_IF};
2326  if (std::find_if(target_infos.begin(),
2327  target_infos.end(),
2328  [&supported_aggs](const TargetInfo& ti) {
2329  if (ti.sql_type.is_varlen() ||
2330  !supported_aggs.count(ti.agg_kind)) {
2331  return true;
2332  } else {
2333  return false;
2334  }
2335  }) == target_infos.end()) {
2336  return true;
2337  }
2338  }
2339  if (query_mem_desc_ptr->getQueryDescriptionType() ==
2350  if (cuda_blocksize < query_mem_desc_ptr->getEntryCount()) {
2351  return false;
2352  }
2353 
2354  // Fundamentally, we should use shared memory whenever the output buffer
2355  // is small enough so that we can fit it in the shared memory and yet expect
2356  // good occupancy.
2357  // For now, we allow keyless, row-wise layout, and only for perfect hash
2358  // group by operations.
2359  if (query_mem_desc_ptr->hasKeylessHash() &&
2360  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty() &&
2361  !query_mem_desc_ptr->useStreamingTopN()) {
2362  const size_t shared_memory_threshold_bytes = std::min(
2363  g_gpu_smem_threshold == 0 ? SIZE_MAX : g_gpu_smem_threshold,
2364  cuda_mgr->getMinSharedMemoryPerBlockForAllDevices() / num_blocks_per_mp);
2365  const auto output_buffer_size =
2366  query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount();
2367  if (output_buffer_size > shared_memory_threshold_bytes) {
2368  return false;
2369  }
2370 
2371  // skip shared memory usage when dealing with 1) variable length targets, 2)
2372  // non-basic aggregates (COUNT, SUM, MIN, MAX, AVG)
2373  // TODO: relax this if necessary
2374  const auto target_infos =
2375  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
2376  std::unordered_set<SQLAgg> supported_aggs{kCOUNT, kCOUNT_IF};
2378  supported_aggs = {kCOUNT, kCOUNT_IF, kMIN, kMAX, kSUM, kSUM_IF, kAVG};
2379  }
2380  if (std::find_if(target_infos.begin(),
2381  target_infos.end(),
2382  [&supported_aggs](const TargetInfo& ti) {
2383  if (ti.sql_type.is_varlen() ||
2384  !supported_aggs.count(ti.agg_kind)) {
2385  return true;
2386  } else {
2387  return false;
2388  }
2389  }) == target_infos.end()) {
2390  return true;
2391  }
2392  }
2393  }
2394  return false;
2395 }
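// Worked example with hypothetical numbers for the perfect-hash branch above: with
// 48 KB minimum shared memory per block across devices, num_blocks_per_mp = 2 and
// g_gpu_smem_threshold = 0 (treated as unlimited), the threshold is
// min(SIZE_MAX, 49152 / 2) = 24576 bytes; a keyless, row-wise output buffer of
// 1024 entries * 16 bytes per row = 16384 bytes fits under it, so the shared-memory
// path is taken provided cuda_blocksize >= 1024 and all targets use supported
// fixed-width aggregates.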
2396 
2397 #ifndef NDEBUG
2398 std::string serialize_llvm_metadata_footnotes(llvm::Function* query_func,
2399  CgenState* cgen_state) {
2400  std::string llvm_ir;
2401  std::unordered_set<llvm::MDNode*> md;
2402 
2403  // Loop over all instructions in the query function.
2404  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
2405  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2406  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2407  instr_it->getAllMetadata(imd);
2408  for (auto [kind, node] : imd) {
2409  md.insert(node);
2410  }
2411  }
2412  }
2413 
2414  // Loop over all instructions in the row function.
2415  for (auto bb_it = cgen_state->row_func_->begin(); bb_it != cgen_state->row_func_->end();
2416  ++bb_it) {
2417  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2418  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2419  instr_it->getAllMetadata(imd);
2420  for (auto [kind, node] : imd) {
2421  md.insert(node);
2422  }
2423  }
2424  }
2425 
2426  // Loop over all instructions in the filter function.
2427  if (cgen_state->filter_func_) {
2428  for (auto bb_it = cgen_state->filter_func_->begin();
2429  bb_it != cgen_state->filter_func_->end();
2430  ++bb_it) {
2431  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2432  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2433  instr_it->getAllMetadata(imd);
2434  for (auto [kind, node] : imd) {
2435  md.insert(node);
2436  }
2437  }
2438  }
2439  }
2440 
2441  // Sort the metadata by canonical number and convert to text.
2442  if (!md.empty()) {
2443  std::map<size_t, std::string> sorted_strings;
2444  for (auto p : md) {
2445  std::string str;
2446  llvm::raw_string_ostream os(str);
2447  p->print(os, cgen_state->module_, true);
2448  os.flush();
2449  auto fields = split(str, {}, 1);
2450  if (fields.empty() || fields[0].empty()) {
2451  continue;
2452  }
2453  sorted_strings.emplace(std::stoul(fields[0].substr(1)), str);
2454  }
2455  llvm_ir += "\n";
2456  for (auto [id, text] : sorted_strings) {
2457  llvm_ir += text;
2458  llvm_ir += "\n";
2459  }
2460  }
2461 
2462  return llvm_ir;
2463 }
2464 #endif // NDEBUG
2465 
2466 } // namespace
2467 
2468 std::tuple<CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
2469 Executor::compileWorkUnit(const std::vector<InputTableInfo>& query_infos,
2470  const PlanState::DeletedColumnsMap& deleted_cols_map,
2471  const RelAlgExecutionUnit& ra_exe_unit,
2472  const CompilationOptions& co,
2473  const ExecutionOptions& eo,
2474  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
2475  const bool allow_lazy_fetch,
2476  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
2477  const size_t max_groups_buffer_entry_guess,
2478  const int8_t crt_min_byte_width,
2479  const bool has_cardinality_estimation,
2480  ColumnCacheMap& column_cache,
2481  RenderInfo* render_info) {
2482  auto timer = DEBUG_TIMER(__func__);
2483 
2484  if (co.device_type == ExecutorDeviceType::GPU) {
2485  if (!cuda_mgr) {
2486  throw QueryMustRunOnCpu();
2487  }
2488  }
2489 
2490 #ifndef NDEBUG
2491  static std::uint64_t counter = 0;
2492  ++counter;
2493  VLOG(1) << "CODEGEN #" << counter << ":";
2494  LOG(IR) << "CODEGEN #" << counter << ":";
2495  LOG(PTX) << "CODEGEN #" << counter << ":";
2496  LOG(ASM) << "CODEGEN #" << counter << ":";
2497 #endif
2498 
2499  // cgenstate_manager uses the RAII pattern to manage the lifetime of
2500  // CgenState instances.
2501  Executor::CgenStateManager cgenstate_manager(*this,
2502  allow_lazy_fetch,
2503  query_infos,
2504  deleted_cols_map,
2505  &ra_exe_unit); // locks compilation_mutex
2506 
2507  addTransientStringLiterals(ra_exe_unit, row_set_mem_owner);
2508 
2509  GroupByAndAggregate group_by_and_aggregate(
2510  this,
2511  co.device_type,
2512  ra_exe_unit,
2513  query_infos,
2514  row_set_mem_owner,
2515  has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
2516  : std::nullopt);
2517  auto query_mem_desc =
2518  group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
2519  max_groups_buffer_entry_guess,
2520  crt_min_byte_width,
2521  render_info,
2522  eo.output_columnar_hint);
2523 
2524  if (query_mem_desc->getQueryDescriptionType() ==
2525  QueryDescriptionType::GroupByBaselineHash &&
2526  !has_cardinality_estimation && (!render_info || !render_info->isInSitu()) &&
2527  !eo.just_explain) {
2528  const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
2529  throw CardinalityEstimationRequired(col_range_info.max - col_range_info.min);
2530  }
2531 
2532  const bool output_columnar = query_mem_desc->didOutputColumnar();
2533  const bool gpu_shared_mem_optimization =
2534  is_gpu_shared_mem_supported(query_mem_desc.get(),
2535  ra_exe_unit,
2536  cuda_mgr,
2537  co.device_type,
2538  cuda_mgr ? this->blockSize() : 1,
2539  cuda_mgr ? this->numBlocksPerMP() : 1);
2540  if (gpu_shared_mem_optimization) {
2541  // disable interleaved bins optimization on the GPU
2542  query_mem_desc->setHasInterleavedBinsOnGpu(false);
2543  LOG(DEBUG1) << "GPU shared memory is used for the " +
2544  query_mem_desc->queryDescTypeToString() + " query(" +
2545  std::to_string(get_shared_memory_size(gpu_shared_mem_optimization,
2546  query_mem_desc.get())) +
2547  " out of " + std::to_string(g_gpu_smem_threshold) + " bytes).";
2548  }
2549 
2550  const GpuSharedMemoryContext gpu_smem_context(
2551  get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));
2552 
2553  if (co.device_type == ExecutorDeviceType::GPU) {
2554  const size_t num_count_distinct_descs =
2555  query_mem_desc->getCountDistinctDescriptorsSize();
2556  for (size_t i = 0; i < num_count_distinct_descs; i++) {
2557  const auto& count_distinct_descriptor =
2558  query_mem_desc->getCountDistinctDescriptor(i);
2559  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::UnorderedSet ||
2560  (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
2561  !co.hoist_literals)) {
2562  throw QueryMustRunOnCpu();
2563  }
2564  }
2565 
2566  // we currently do not support varlen projection based on baseline groupby when
2567  // 1) the target table is multi-fragmented and 2) multiple GPUs are involved in query
2568  // processing; in this case, we punt the query to CPU to avoid a server crash
2569  for (const auto expr : ra_exe_unit.target_exprs) {
2570  if (auto gby_expr = dynamic_cast<Analyzer::AggExpr*>(expr)) {
2571  bool has_multiple_gpus = cuda_mgr ? cuda_mgr->getDeviceCount() > 1 : false;
2572  if (gby_expr->get_aggtype() == SQLAgg::kSAMPLE && has_multiple_gpus &&
2573  !g_leaf_count) {
2574  std::set<const Analyzer::ColumnVar*,
2575  bool (*)(const Analyzer::ColumnVar*, const Analyzer::ColumnVar*)>
2576  colvar_set(Analyzer::ColumnVar::colvar_comp);
2577  gby_expr->collect_column_var(colvar_set, true);
2578  for (const auto cv : colvar_set) {
2579  if (cv->get_type_info().is_varlen()) {
2580  const auto tbl_key = cv->getTableKey();
2581  std::for_each(query_infos.begin(),
2582  query_infos.end(),
2583  [&tbl_key](const InputTableInfo& input_table_info) {
2584  if (input_table_info.table_key == tbl_key &&
2585  input_table_info.info.fragments.size() > 1) {
2586  throw QueryMustRunOnCpu();
2587  }
2588  });
2589  }
2590  }
2591  }
2592  }
2593  }
2594  }
2595 
2596  // Read the module template and target either CPU or GPU
2597  // by binding the stream position functions to the right implementation:
2598  // stride access for GPU, contiguous for CPU
2599  CHECK(cgen_state_->module_ == nullptr);
2600  cgen_state_->set_module_shallow_copy(get_rt_module(), /*always_clone=*/true);
2601 
2602  auto is_gpu = co.device_type == ExecutorDeviceType::GPU;
2603  if (is_gpu) {
2604  cgen_state_->module_->setDataLayout(get_gpu_data_layout());
2605  cgen_state_->module_->setTargetTriple(get_gpu_target_triple_string());
2606  }
2607  if (has_udf_module(/*is_gpu=*/is_gpu)) {
2608  CodeGenerator::link_udf_module(
2609  get_udf_module(/*is_gpu=*/is_gpu), *cgen_state_->module_, cgen_state_.get());
2610  }
2611  if (has_rt_udf_module(/*is_gpu=*/is_gpu)) {
2612  CodeGenerator::link_udf_module(
2613  get_rt_udf_module(/*is_gpu=*/is_gpu), *cgen_state_->module_, cgen_state_.get());
2614  }
2615 
2616  AUTOMATIC_IR_METADATA(cgen_state_.get());
2617 
2618  auto agg_fnames =
2619  get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
2620 
2621  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();
2622 
2623  const bool is_group_by{query_mem_desc->isGroupBy()};
2624  auto [query_func, row_func_call] = is_group_by
2625  ? query_group_by_template(cgen_state_->module_,
2626  co.hoist_literals,
2627  *query_mem_desc,
2628  co.device_type,
2629  ra_exe_unit.scan_limit,
2630  gpu_smem_context)
2631  : query_template(cgen_state_->module_,
2632  agg_slot_count,
2633  co.hoist_literals,
2634  !!ra_exe_unit.estimator,
2635  gpu_smem_context);
2636  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
2637  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
2638  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);
2639 
2640  cgen_state_->query_func_ = query_func;
2641  cgen_state_->row_func_call_ = row_func_call;
2642  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
2643  &query_func->getEntryBlock().front());
2644 
2645  // Generate the function signature and column head fetches s.t.
2646  // double indirection isn't needed in the inner loop
2647  auto& fetch_bb = query_func->front();
2648  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
2649  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
2650  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
2651  query_func->args().begin(),
2652  fetch_ir_builder,
2653  cgen_state_->context_);
2654  CHECK_EQ(ra_exe_unit.input_col_descs.size(), col_heads.size());
2655 
2656  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
2657  is_group_by ? 0 : agg_slot_count,
2658  co.hoist_literals,
2659  cgen_state_->module_,
2660  cgen_state_->context_);
2661  CHECK(cgen_state_->row_func_);
2662  cgen_state_->row_func_bb_ =
2663  llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
2664 
2666  auto filter_func_ft =
2667  llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
2668  cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
2669  llvm::Function::ExternalLinkage,
2670  "filter_func",
2671  cgen_state_->module_);
2672  CHECK(cgen_state_->filter_func_);
2673  cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
2674  cgen_state_->context_, "entry", cgen_state_->filter_func_);
2675  }
2676 
2677  cgen_state_->current_func_ = cgen_state_->row_func_;
2678  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2679 
2680  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
2681  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
2682  const auto join_loops =
2683  buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
2684 
2685  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
2686  for (auto& simple_qual : ra_exe_unit.simple_quals) {
2687  plan_state_->addSimpleQual(simple_qual);
2688  }
2689  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
2690  if (is_not_deleted_bb) {
2691  cgen_state_->row_func_bb_ = is_not_deleted_bb;
2692  }
2693  if (!join_loops.empty()) {
2694  codegenJoinLoops(join_loops,
2695  body_execution_unit,
2696  group_by_and_aggregate,
2697  query_func,
2698  cgen_state_->row_func_bb_,
2699  *(query_mem_desc.get()),
2700  co,
2701  eo);
2702  } else {
2703  const bool can_return_error = compileBody(
2704  ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
2705  if (can_return_error || cgen_state_->needs_error_check_ || eo.with_dynamic_watchdog ||
2706  eo.allow_runtime_query_interrupt) {
2707  createErrorCheckControlFlow(query_func,
2708  eo.with_dynamic_watchdog,
2709  eo.allow_runtime_query_interrupt,
2710  join_loops,
2711  co.device_type,
2712  group_by_and_aggregate.query_infos_);
2713  }
2714  }
2715  std::vector<llvm::Value*> hoisted_literals;
2716 
2717  if (co.hoist_literals) {
2718  VLOG(1) << "number of hoisted literals: "
2719  << cgen_state_->query_func_literal_loads_.size()
2720  << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
2721  << " bytes";
2722  }
2723 
2724  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
2725  // we have some hoisted literals...
2726  hoisted_literals = inlineHoistedLiterals();
2727  }
2728 
2729  // replace the row func placeholder call with the call to the actual row func
2730  std::vector<llvm::Value*> row_func_args;
2731  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumOperands() - 1; ++i) {
2732  row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
2733  }
2734  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
2735  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
2736  row_func_args.push_back(get_arg_by_name(query_func, "row_func_mgr"));
2737  // push hoisted literal arguments, if any
2738  row_func_args.insert(
2739  row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
2740  llvm::ReplaceInstWithInst(
2741  cgen_state_->row_func_call_,
2742  llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));
2743 
2744  // replace the filter func placeholder call with the call to the actual filter func
2745  if (cgen_state_->filter_func_) {
2746  std::vector<llvm::Value*> filter_func_args;
2747  for (auto arg_it = cgen_state_->filter_func_args_.begin();
2748  arg_it != cgen_state_->filter_func_args_.end();
2749  ++arg_it) {
2750  filter_func_args.push_back(arg_it->first);
2751  }
2752  llvm::ReplaceInstWithInst(
2753  cgen_state_->filter_func_call_,
2754  llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
2755  }
2756 
2757  // Aggregate
2758  plan_state_->init_agg_vals_ =
2759  init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);
2760 
2761  /*
2762  * If GPU shared memory is to be used (that decision is not made here), generate the
2763  * extra components it needs: buffer initialization and the GPU reduction from shared
2764  * memory into global memory. These functions are then injected into the already compiled
2765  * query_func, replacing the two placeholders write_back_nop and init_smem_nop (see the
2766  * sketch after this block). The rest of the code (row_func, etc.) is unchanged.
2767  */
2768  if (gpu_smem_context.isSharedMemoryUsed()) {
2769  if (query_mem_desc->getQueryDescriptionType() ==
2770  QueryDescriptionType::GroupByPerfectHash) {
2771  GpuSharedMemCodeBuilder gpu_smem_code(
2772  cgen_state_->module_,
2773  cgen_state_->context_,
2774  *query_mem_desc,
2775  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc),
2776  plan_state_->init_agg_vals_,
2777  executor_id_);
2778  gpu_smem_code.codegen();
2779  gpu_smem_code.injectFunctionsInto(query_func);
2780 
2781  // keep the helper functions; they are used later for code caching
2782  cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
2783  cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
2784  LOG(IR) << gpu_smem_code.toString();
2785  }
2786  }
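  // A rough, illustrative view of the kernel shape once injectFunctionsInto() has
  // replaced the two placeholders (the names below are illustrative, not the actual
  // generated symbols):
  //
  //   __shared__ int64_t smem_buf[...];             // per-block group-by buffer
  //   init_smem(smem_buf, init_agg_vals);           // replaces init_smem_nop
  //   ... row_func aggregates each row into smem_buf ...
  //   __syncthreads();
  //   reduce_smem_to_gmem(smem_buf, group_by_buf);  // replaces write_back_nop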
2787 
2788  auto multifrag_query_func = cgen_state_->module_->getFunction(
2789  "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
2790  CHECK(multifrag_query_func);
2791 
2793  insertErrorCodeChecker(
2794  multifrag_query_func, co.hoist_literals, eo.allow_runtime_query_interrupt);
2795  }
2796 
2797  bind_query(query_func,
2798  "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
2799  multifrag_query_func,
2800  cgen_state_->module_);
2801 
2802  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
2803  if (cgen_state_->filter_func_) {
2804  root_funcs.push_back(cgen_state_->filter_func_);
2805  }
2806  auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
2807  *cgen_state_->module_, root_funcs, {multifrag_query_func});
2808 
2809  // Always inline the row function and the filter function.
2810  // We don't want register spills in the inner loops.
2811  // LLVM seems to correctly free up alloca instructions
2812  // in these functions even when they are inlined.
2813  mark_function_always_inline(cgen_state_->row_func_);
2814  if (cgen_state_->filter_func_) {
2815  mark_function_always_inline(cgen_state_->filter_func_);
2816  }
2817 
2818 #ifndef NDEBUG
2819  // Add helpful metadata to the LLVM IR for debugging.
2820  AUTOMATIC_IR_METADATA_DONE();
2821 #endif
2822 
2823  auto const device_str = co.device_type == ExecutorDeviceType::CPU ? "CPU:\n" : "GPU:\n";
2824  // Serialize the important LLVM IR functions to text for SQL EXPLAIN.
2825  std::string llvm_ir =
2826  serialize_llvm_object(multifrag_query_func) + serialize_llvm_object(query_func) +
2827  serialize_llvm_object(cgen_state_->row_func_) +
2828  (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_) : "");
2829  VLOG(3) << "Unoptimized IR for the " << device_str << "\n" << llvm_ir << "\nEnd of IR";
2830  if (co.explain_type == ExecutorExplainType::Optimized) {
2831 #ifdef WITH_JIT_DEBUG
2832  throw std::runtime_error(
2833  "Explain optimized not available when JIT runtime debug symbols are enabled");
2834 #else
2835  // Note that we don't run the NVVM reflect pass here. Use LOG(IR) to get the
2836  // optimized IR after NVVM reflect
2837  llvm::legacy::PassManager pass_manager;
2838  optimize_ir(query_func,
2839  cgen_state_->module_,
2840  pass_manager,
2841  live_funcs,
2842  gpu_smem_context.isSharedMemoryUsed(),
2843  co);
2844 #endif // WITH_JIT_DEBUG
2845  llvm_ir =
2846  serialize_llvm_object(multifrag_query_func) + serialize_llvm_object(query_func) +
2847  serialize_llvm_object(cgen_state_->row_func_) +
2848  (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2849  : "");
2850 #ifndef NDEBUG
2851  llvm_ir += serialize_llvm_metadata_footnotes(query_func, cgen_state_.get());
2852 #endif
2853  }
2854  LOG(IR) << "\n\n" << query_mem_desc->toString() << "\n";
2855  LOG(IR) << "IR for the " << device_str;
2856 #ifdef NDEBUG
2857  LOG(IR) << serialize_llvm_object(query_func)
2858  << serialize_llvm_object(cgen_state_->row_func_)
2859  << (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2860  : "")
2861  << "\nEnd of IR";
2862 #else
2863  LOG(IR) << serialize_llvm_object(cgen_state_->module_) << "\nEnd of IR";
2864 #endif
2865 
2866  // Insert calls to "register_buffer_with_executor_rsm" for allocations made in
2867  // runtime functions (i.e. from RBC) that do not already have them
2868  AutoTrackBuffersInRuntimeIR();
2869 
2870  // Run some basic validation checks on the LLVM IR before code is generated below.
2871  verify_function_ir(cgen_state_->row_func_);
2872  if (cgen_state_->filter_func_) {
2873  verify_function_ir(cgen_state_->filter_func_);
2874  }
2875 
2876  // Generate final native code from the LLVM IR.
2877  return std::make_tuple(
2878  CompilationResult{
2879  co.device_type == ExecutorDeviceType::CPU
2880  ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
2881  : optimizeAndCodegenGPU(query_func,
2882  multifrag_query_func,
2883  live_funcs,
2884  is_group_by || ra_exe_unit.estimator,
2885  cuda_mgr,
2886  gpu_smem_context.isSharedMemoryUsed(),
2887  co),
2888  cgen_state_->getLiterals(),
2889  output_columnar,
2890  llvm_ir,
2891  std::move(gpu_smem_context)},
2892  std::move(query_mem_desc));
2893 }
2894 
2895 void Executor::insertErrorCodeChecker(llvm::Function* query_func,
2896  bool hoist_literals,
2897  bool allow_runtime_query_interrupt) {
2898  auto query_stub_func_name =
2899  "query_stub" + std::string(hoist_literals ? "_hoisted_literals" : "");
2900  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
2901  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
2902  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
2903  continue;
2904  }
2905  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
2906  if (std::string(row_func_call.getCalledFunction()->getName()) ==
2907  query_stub_func_name) {
2908  auto next_inst_it = inst_it;
2909  ++next_inst_it;
2910  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
2911  auto& br_instr = bb_it->back();
2912  llvm::IRBuilder<> ir_builder(&br_instr);
2913  llvm::Value* err_lv = &*inst_it;
2914  auto error_check_bb =
2915  bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr), ".error_check");
2916  llvm::Value* error_code_arg = nullptr;
2917  auto arg_cnt = 0;
2918  for (auto arg_it = query_func->arg_begin(); arg_it != query_func->arg_end();
2919  arg_it++, ++arg_cnt) {
2920  // the multifrag_query_* functions have anonymous arguments, so we use an explicit
2921  // argument offset to locate the "error_code" argument in the function's argument list
2922  if (hoist_literals) {
2923  if (arg_cnt == 9) {
2924  error_code_arg = &*arg_it;
2925  break;
2926  }
2927  } else {
2928  if (arg_cnt == 8) {
2929  error_code_arg = &*arg_it;
2930  break;
2931  }
2932  }
2933  }
2934  CHECK(error_code_arg);
2935  llvm::Value* err_code = nullptr;
2936  if (allow_runtime_query_interrupt) {
2937  // decide the final error code, taking the interrupt status into account
2938  auto& check_interrupt_br_instr = bb_it->back();
2939  auto interrupt_check_bb = llvm::BasicBlock::Create(
2940  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
2941  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
2942  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
2943  cgen_state_->module_->getFunction("check_interrupt"), {});
2944  auto detected_error = interrupt_checker_ir_builder.CreateCall(
2945  cgen_state_->module_->getFunction("get_error_code"),
2946  std::vector<llvm::Value*>{error_code_arg});
2947  err_code = interrupt_checker_ir_builder.CreateSelect(
2948  detected_interrupt,
2949  cgen_state_->llInt(Executor::ERR_INTERRUPTED),
2950  detected_error);
2951  interrupt_checker_ir_builder.CreateBr(error_check_bb);
2952  llvm::ReplaceInstWithInst(&check_interrupt_br_instr,
2953  llvm::BranchInst::Create(interrupt_check_bb));
2954  ir_builder.SetInsertPoint(&br_instr);
2955  } else {
2956  // use the error code returned from row_func and skip the interrupt status check
2957  ir_builder.SetInsertPoint(&br_instr);
2958  err_code =
2959  ir_builder.CreateCall(cgen_state_->module_->getFunction("get_error_code"),
2960  std::vector<llvm::Value*>{error_code_arg});
2961  }
2962  err_lv = ir_builder.CreateICmp(
2963  llvm::ICmpInst::ICMP_NE, err_code, cgen_state_->llInt(0));
2964  auto error_bb = llvm::BasicBlock::Create(
2965  cgen_state_->context_, ".error_exit", query_func, new_bb);
2966  llvm::CallInst::Create(cgen_state_->module_->getFunction("record_error_code"),
2967  std::vector<llvm::Value*>{err_code, error_code_arg},
2968  "",
2969  error_bb);
2970  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
2971  llvm::ReplaceInstWithInst(&br_instr,
2972  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
2973  break;
2974  }
2975  }
2976  }
2977 }
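// In pseudo-C, the control flow spliced in above right after the query_stub call is
// roughly the following (check_interrupt, get_error_code and record_error_code are the
// runtime functions looked up from the module; error_code_arg is the argument located
// by its positional offset):
//
//   int32_t err = allow_runtime_query_interrupt
//                     ? (check_interrupt() ? Executor::ERR_INTERRUPTED
//                                          : get_error_code(error_code_arg))
//                     : get_error_code(error_code_arg);
//   if (err != 0) {
//     record_error_code(err, error_code_arg);
//     return;
//   }
//   // otherwise fall through into the split-off remainder of the block (new_bb)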
2978 
2979 llvm::BasicBlock* Executor::codegenSkipDeletedOuterTableRow(
2980  const RelAlgExecutionUnit& ra_exe_unit,
2981  const CompilationOptions& co) {
2982  AUTOMATIC_IR_METADATA(cgen_state_.get());
2983  if (!co.filter_on_deleted_column) {
2984  return nullptr;
2985  }
2986  CHECK(!ra_exe_unit.input_descs.empty());
2987  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
2988  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
2989  return nullptr;
2990  }
2991  const auto& table_key = outer_input_desc.getTableKey();
2992  const auto deleted_cd = plan_state_->getDeletedColForTable(table_key);
2993  if (!deleted_cd) {
2994  return nullptr;
2995  }
2996  CHECK(deleted_cd->columnType.is_boolean());
2997  const auto deleted_expr =
2998  makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
2999  shared::ColumnKey{table_key, deleted_cd->columnId},
3000  outer_input_desc.getNestLevel());
3001  CodeGenerator code_generator(this);
3002  const auto is_deleted =
3003  code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
3004  const auto is_deleted_bb = llvm::BasicBlock::Create(
3005  cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
3006  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
3007  cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
3008  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
3009  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
3010  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
3011  cgen_state_->ir_builder_.SetInsertPoint(bb);
3012  return bb;
3013 }
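// Conceptually, the IR appended above places the following guard at the top of the row
// function body (deleted_col stands in for the table's boolean "deleted" column):
//
//   if (deleted_col[row]) {   // "is_deleted" block
//     return 0;               // skip the row, no error
//   }
//   // execution continues in the "is_not_deleted" block returned to the caller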
3014 
3015 bool Executor::compileBody(const RelAlgExecutionUnit& ra_exe_unit,
3016  GroupByAndAggregate& group_by_and_aggregate,
3017  QueryMemoryDescriptor& query_mem_desc,
3018  const CompilationOptions& co,
3019  const GpuSharedMemoryContext& gpu_smem_context) {
3020  AUTOMATIC_IR_METADATA(cgen_state_.get());
3021 
3022  // Switch the code generation into a separate filter function if enabled.
3023  // Note that accesses to function arguments are still codegenned from the
3024  // row function's arguments, then later automatically forwarded and
3025  // remapped into filter function arguments by redeclareFilterFunction().
3026  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
3027  llvm::Value* loop_done{nullptr};
3028  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
3029  if (cgen_state_->filter_func_) {
3030  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3031  auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
3032  cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
3033  row_func_entry_bb->begin());
3034  loop_done = cgen_state_->ir_builder_.CreateAlloca(
3035  get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
3036  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
3037  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
3038  }
3039  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
3040  cgen_state_->current_func_ = cgen_state_->filter_func_;
3041  fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
3042  }
3043 
3044  // generate the code for the filter
3045  std::vector<Analyzer::Expr*> primary_quals;
3046  std::vector<Analyzer::Expr*> deferred_quals;
3047  bool short_circuited = CodeGenerator::prioritizeQuals(
3048  ra_exe_unit, primary_quals, deferred_quals, plan_state_->hoisted_filters_);
3049  if (short_circuited) {
3050  VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
3051  << "short-circuited and deferred " << std::to_string(deferred_quals.size())
3052  << " quals";
3053  }
3054  llvm::Value* filter_lv = cgen_state_->llBool(true);
3055  CodeGenerator code_generator(this);
3056  for (auto expr : primary_quals) {
3057  // Generate the filter for primary quals
3058  auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
3059  filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
3060  }
3061  CHECK(filter_lv->getType()->isIntegerTy(1));
3062  llvm::BasicBlock* sc_false{nullptr};
3063  if (!deferred_quals.empty()) {
3064  auto sc_true = llvm::BasicBlock::Create(
3065  cgen_state_->context_, "sc_true", cgen_state_->current_func_);
3066  sc_false = llvm::BasicBlock::Create(
3067  cgen_state_->context_, "sc_false", cgen_state_->current_func_);
3068  cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
3069  cgen_state_->ir_builder_.SetInsertPoint(sc_false);
3070  if (ra_exe_unit.join_quals.empty()) {
3071  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
3072  }
3073  cgen_state_->ir_builder_.SetInsertPoint(sc_true);
3074  filter_lv = cgen_state_->llBool(true);
3075  }
3076  for (auto expr : deferred_quals) {
3077  filter_lv = cgen_state_->ir_builder_.CreateAnd(
3078  filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
3079  }
3080 
3081  CHECK(filter_lv->getType()->isIntegerTy(1));
3082  auto ret = group_by_and_aggregate.codegen(
3083  filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);
3084 
3085  // Switch the code generation back to the row function if a filter
3086  // function was enabled.
3087  if (cgen_state_->filter_func_) {
3088  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3089  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
3090  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
3091  }
3092 
3093  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
3094  cgen_state_->current_func_ = cgen_state_->row_func_;
3095  cgen_state_->filter_func_call_ =
3096  cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});
3097 
3098  // Create the real filter function declaration after the placeholder call
3099  // has been emitted.
3100  redeclareFilterFunction();
3101 
3102  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3103  auto loop_done_true = llvm::BasicBlock::Create(
3104  cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
3105  auto loop_done_false = llvm::BasicBlock::Create(
3106  cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
3107  auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(
3108  loop_done->getType()->getPointerElementType(), loop_done);
3109  cgen_state_->ir_builder_.CreateCondBr(
3110  loop_done_flag, loop_done_true, loop_done_false);
3111  cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
3112  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
3113  cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
3114  } else {
3115  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
3116  }
3117  }
3118  return ret;
3119 }
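// The filter codegen above evaluates the prioritized quals first and only evaluates the
// deferred (typically more expensive) quals for rows that survive; roughly:
//
//   bool filter = true;
//   for (auto expr : primary_quals) filter &= eval(expr);
//   if (!filter) goto sc_false;      // short-circuit; returns 0 when there are no join quals
//   filter = true;                   // "sc_true" block
//   for (auto expr : deferred_quals) filter &= eval(expr);
//   // filter (and sc_false) are then handed to GroupByAndAggregate::codegen()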
3120 
3121 std::vector<llvm::Value*> generate_column_heads_load(const int num_columns,
3122  llvm::Value* byte_stream_arg,
3123  llvm::IRBuilder<>& ir_builder,
3124  llvm::LLVMContext& ctx) {
3125  CHECK(byte_stream_arg);
3126  const auto max_col_local_id = num_columns - 1;
3127 
3128  std::vector<llvm::Value*> col_heads;
3129  for (int col_id = 0; col_id <= max_col_local_id; ++col_id) {
3130  auto* gep = ir_builder.CreateGEP(
3131  byte_stream_arg->getType()->getScalarType()->getPointerElementType(),
3132  byte_stream_arg,
3133  llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), col_id));
3134  col_heads.emplace_back(
3135  ir_builder.CreateLoad(gep->getType()->getPointerElementType(), gep));
3136  }
3137  return col_heads;
3138 }
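// The loads emitted above amount to hoisting each column's base pointer out of the
// per-row loop, i.e. roughly:
//
//   const int8_t* col_heads[num_columns];
//   for (int col_id = 0; col_id < num_columns; ++col_id) {
//     col_heads[col_id] = byte_stream[col_id];  // one indirection per column, done once
//   }
//
// so the per-row code can index col_heads directly instead of dereferencing byte_stream
// a second time inside the inner loop.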
3139 