OmniSciDB  085a039ca4
NativeCodegen.cpp
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "QueryEngine/Execute.h"
18 
19 #if LLVM_VERSION_MAJOR < 9
20 static_assert(false, "LLVM Version >= 9 is required.");
21 #endif
22 
23 #include <llvm/Analysis/ScopedNoAliasAA.h>
24 #include <llvm/Analysis/TypeBasedAliasAnalysis.h>
25 #include <llvm/Bitcode/BitcodeReader.h>
26 #include <llvm/Bitcode/BitcodeWriter.h>
27 #include <llvm/ExecutionEngine/MCJIT.h>
28 #include <llvm/IR/Attributes.h>
29 #include <llvm/IR/GlobalValue.h>
30 #include <llvm/IR/InstIterator.h>
31 #include <llvm/IR/IntrinsicInst.h>
32 #include <llvm/IR/Intrinsics.h>
33 #include <llvm/IR/LegacyPassManager.h>
34 #include <llvm/IR/Verifier.h>
35 #include <llvm/IRReader/IRReader.h>
36 #include <llvm/Linker/Linker.h>
37 #include <llvm/Support/Casting.h>
38 #include <llvm/Support/FileSystem.h>
39 #include <llvm/Support/FormattedStream.h>
40 #include <llvm/Support/MemoryBuffer.h>
41 #include <llvm/Support/SourceMgr.h>
42 #include <llvm/Support/TargetRegistry.h>
43 #include <llvm/Support/TargetSelect.h>
44 #include <llvm/Support/raw_os_ostream.h>
45 #include <llvm/Support/raw_ostream.h>
46 #include <llvm/Transforms/IPO.h>
47 #include <llvm/Transforms/IPO/AlwaysInliner.h>
48 #include <llvm/Transforms/IPO/InferFunctionAttrs.h>
49 #include <llvm/Transforms/IPO/PassManagerBuilder.h>
50 #include <llvm/Transforms/InstCombine/InstCombine.h>
51 #include <llvm/Transforms/Instrumentation.h>
52 #include <llvm/Transforms/Scalar.h>
53 #include <llvm/Transforms/Scalar/GVN.h>
54 #include <llvm/Transforms/Scalar/InstSimplifyPass.h>
55 #include <llvm/Transforms/Utils.h>
56 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
57 #include <llvm/Transforms/Utils/Cloning.h>
58 
59 #if LLVM_VERSION_MAJOR >= 11
60 #include <llvm/Support/Host.h>
61 #endif
62 
63 #include "CudaMgr/CudaMgr.h"
72 #include "Shared/MathUtils.h"
73 #include "StreamingTopN.h"
74 
76 
77 static llvm::sys::Mutex g_ee_create_mutex;
78 
79 #ifdef ENABLE_GEOS
80 
81 #include <llvm/Support/DynamicLibrary.h>
82 
83 #ifndef GEOS_LIBRARY_FILENAME
84 #error Configuration should include GEOS library file name
85 #endif
86 std::unique_ptr<std::string> g_libgeos_so_filename(
87  new std::string(GEOS_LIBRARY_FILENAME));
88 static llvm::sys::DynamicLibrary geos_dynamic_library;
89 static std::mutex geos_init_mutex;
90 
91 namespace {
92 
93 void load_geos_dynamic_library() {
94  std::lock_guard<std::mutex> guard(geos_init_mutex);
95 
96  if (!geos_dynamic_library.isValid()) {
97  if (!g_libgeos_so_filename || g_libgeos_so_filename->empty()) {
98  LOG(WARNING) << "Misconfigured GEOS library file name, trying 'libgeos_c.so'";
99  g_libgeos_so_filename.reset(new std::string("libgeos_c.so"));
100  }
101  auto filename = *g_libgeos_so_filename;
102  std::string error_message;
103  geos_dynamic_library =
104  llvm::sys::DynamicLibrary::getPermanentLibrary(filename.c_str(), &error_message);
105  if (!geos_dynamic_library.isValid()) {
106  LOG(ERROR) << "Failed to load GEOS library '" + filename + "'";
107  std::string exception_message = "Failed to load GEOS library: " + error_message;
108  throw std::runtime_error(exception_message.c_str());
109  } else {
110  LOG(INFO) << "Loaded GEOS library '" + filename + "'";
111  }
112  }
113 }
114 
115 } // namespace
116 #endif
117 
118 namespace {
119 
120 void throw_parseIR_error(const llvm::SMDiagnostic& parse_error,
121  std::string src = "",
122  const bool is_gpu = false) {
123  std::string excname = (is_gpu ? "NVVM IR ParseError: " : "LLVM IR ParseError: ");
124  llvm::raw_string_ostream ss(excname);
125  parse_error.print(src.c_str(), ss, false, false);
126  throw ParseIRError(ss.str());
127 }
128 
129 /* SHOW_DEFINED(<llvm::Module instance>) prints the function names
130  that are defined in the given LLVM Module instance.
131 
132  SHOW_FUNCTIONS(<llvm::Module instance>) prints the function names
133  of all used functions in the given LLVM Module
134  instance. Declarations are marked with `[decl]` as a name suffix.
135 
136  Useful for debugging.
137 */
138 
139 #define SHOW_DEFINED(MODULE) \
140  { \
141  std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
142  ::show_defined(MODULE); \
143  }
144 
145 #define SHOW_FUNCTIONS(MODULE) \
146  { \
147  std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
148  ::show_functions(MODULE); \
149  }
150 
151 template <typename T = void>
152 void show_defined(llvm::Module& llvm_module) {
153  std::cout << "defines: ";
154  for (auto& f : llvm_module.getFunctionList()) {
155  if (!f.isDeclaration()) {
156  std::cout << f.getName().str() << ", ";
157  }
158  }
159  std::cout << std::endl;
160 }
161 
162 template <typename T = void>
163 void show_defined(llvm::Module* llvm_module) {
164  if (llvm_module == nullptr) {
165  std::cout << "is null" << std::endl;
166  } else {
167  show_defined(*llvm_module);
168  }
169 }
170 
171 template <typename T = void>
172 void show_defined(std::unique_ptr<llvm::Module>& llvm_module) {
173  show_defined(llvm_module.get());
174 }
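/* A minimal usage sketch of the debugging helpers above, assuming a module
   `mod` that was loaded or built elsewhere (e.g. via
   read_llvm_module_from_bc_file further down in this file):

     std::unique_ptr<llvm::Module> mod = ...;
     SHOW_DEFINED(mod);    // prints "<caller>#<line>: mod defines: f1, f2, ..."
     SHOW_FUNCTIONS(mod);  // per the comment above, also lists declarations,
                           // suffixed with "[decl]"
*/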
175 
176 /*
177  scan_function_calls(module, defined, undefined, ignored) computes
178  defined and undefined sets of function names:
179 
180  - defined functions are those that are defined in the given module
181 
182  - undefined functions are those that are called by defined functions
183  but that are not defined in the given module
184 
185  - ignored functions are functions that may be undefined but will not
186  be listed in the set of undefined functions.
187 
188  Useful for debugging.
189 */
190 template <typename T = void>
191 void scan_function_calls(llvm::Function& F,
192  std::unordered_set<std::string>& defined,
193  std::unordered_set<std::string>& undefined,
194  const std::unordered_set<std::string>& ignored) {
195  for (llvm::inst_iterator I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
196  if (auto* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
197  auto* F2 = CI->getCalledFunction();
198  if (F2 != nullptr) {
199  auto F2name = F2->getName().str();
200  if (F2->isDeclaration()) {
201  if (F2name.rfind("__", 0) !=
202  0 // assume symbols with double underscore are defined
203  && F2name.rfind("llvm.", 0) !=
204  0 // TODO: this may give false positive for NVVM intrinsics
205  && ignored.find(F2name) == ignored.end() // not in ignored list
206  ) {
207  undefined.emplace(F2name);
208  }
209  } else {
210  if (defined.find(F2name) == defined.end()) {
211  defined.emplace(F2name);
212  scan_function_calls<T>(*F2, defined, undefined, ignored);
213  }
214  }
215  }
216  }
217  }
218 }
219 
220 template <typename T = void>
221 void scan_function_calls(llvm::Module& llvm_module,
222  std::unordered_set<std::string>& defined,
223  std::unordered_set<std::string>& undefined,
224  const std::unordered_set<std::string>& ignored) {
225  for (auto& F : llvm_module) {
226  if (!F.isDeclaration()) {
227  scan_function_calls(F, defined, undefined, ignored);
228  }
229  }
230 }
231 
232 template <typename T = void>
233 std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>>
234 scan_function_calls(llvm::Module& llvm_module,
235  const std::unordered_set<std::string>& ignored = {}) {
236  std::unordered_set<std::string> defined, undefined;
237  scan_function_calls(llvm_module, defined, undefined, ignored);
238  return std::make_tuple(defined, undefined);
239 }
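/* A minimal usage sketch of the tuple-returning overload above; `mod` is a
   hypothetical module. The undefined set lists callees that are reachable
   from the module's definitions but not defined in it:

     auto [defined, undefined] = scan_function_calls(*mod);
     for (const auto& name : undefined) {
       std::cout << "unresolved callee: " << name << std::endl;
     }
*/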
240 
241 #if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
242 void eliminate_dead_self_recursive_funcs(
243  llvm::Module& M,
244  const std::unordered_set<llvm::Function*>& live_funcs) {
245  std::vector<llvm::Function*> dead_funcs;
246  for (auto& F : M) {
247  bool bAlive = false;
248  if (live_funcs.count(&F)) {
249  continue;
250  }
251  for (auto U : F.users()) {
252  auto* C = llvm::dyn_cast<const llvm::CallInst>(U);
253  if (!C || C->getParent()->getParent() != &F) {
254  bAlive = true;
255  break;
256  }
257  }
258  if (!bAlive) {
259  dead_funcs.push_back(&F);
260  }
261  }
262  for (auto pFn : dead_funcs) {
263  pFn->eraseFromParent();
264  }
265 }
266 
267 #ifdef HAVE_CUDA
268 
269 // check if linking with libdevice is required
270 // libdevice functions have a __nv_* prefix
271 bool check_module_requires_libdevice(llvm::Module* llvm_module) {
272  auto timer = DEBUG_TIMER(__func__);
273  for (llvm::Function& F : *llvm_module) {
274  if (F.hasName() && F.getName().startswith("__nv_")) {
275  LOG(INFO) << "Module requires linking with libdevice: " << std::string(F.getName());
276  return true;
277  }
278  }
279  LOG(DEBUG1) << "module does not require linking against libdevice";
280  return false;
281 }
282 
283 // Adds the missing intrinsics declarations to the given module
284 void add_intrinsics_to_module(llvm::Module* llvm_module) {
285  for (llvm::Function& F : *llvm_module) {
286  for (llvm::Instruction& I : instructions(F)) {
287  if (llvm::IntrinsicInst* ii = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
288  if (llvm::Intrinsic::isOverloaded(ii->getIntrinsicID())) {
289  llvm::Type* Tys[] = {ii->getFunctionType()->getReturnType()};
290  llvm::Function& decl_fn =
291  *llvm::Intrinsic::getDeclaration(llvm_module, ii->getIntrinsicID(), Tys);
292  ii->setCalledFunction(&decl_fn);
293  } else {
294  // inserts the declaration into the module if not present
295  llvm::Intrinsic::getDeclaration(llvm_module, ii->getIntrinsicID());
296  }
297  }
298  }
299  }
300 }
301 
302 #endif
303 
304 void optimize_ir(llvm::Function* query_func,
305  llvm::Module* llvm_module,
306  llvm::legacy::PassManager& pass_manager,
307  const std::unordered_set<llvm::Function*>& live_funcs,
308  const bool is_gpu_smem_used,
309  const CompilationOptions& co) {
310  auto timer = DEBUG_TIMER(__func__);
311  // the always inliner legacy pass must always run first
312  pass_manager.add(llvm::createVerifierPass());
313  pass_manager.add(llvm::createAlwaysInlinerLegacyPass());
314 
315  pass_manager.add(new AnnotateInternalFunctionsPass());
316 
317  pass_manager.add(llvm::createSROAPass());
318  // mem ssa drops unused load and store instructions, e.g. passing variables directly
319  // where possible
320  pass_manager.add(
321  llvm::createEarlyCSEPass(/*enable_mem_ssa=*/true)); // Catch trivial redundancies
322 
323  if (!is_gpu_smem_used) {
324  // thread jumps can change the execution order around SMEM sections guarded by
325  // `__syncthreads()`, which results in race conditions. For now, disable jump
326  // threading for shared memory queries. In the future, consider handling shared memory
327  // aggregations with a separate kernel launch
328  pass_manager.add(llvm::createJumpThreadingPass()); // Thread jumps.
329  }
330  pass_manager.add(llvm::createCFGSimplificationPass());
331 
332  // remove load/stores in PHIs if instructions can be accessed directly post thread jumps
333  pass_manager.add(llvm::createNewGVNPass());
334 
335  pass_manager.add(llvm::createDeadStoreEliminationPass());
336  pass_manager.add(llvm::createLICMPass());
337 
338  pass_manager.add(llvm::createInstructionCombiningPass());
339 
340  // module passes
341  pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
342  pass_manager.add(llvm::createGlobalOptimizerPass());
343 
344  pass_manager.add(llvm::createCFGSimplificationPass()); // cleanup after everything
345 
346  pass_manager.run(*llvm_module);
347 
348  eliminate_dead_self_recursive_funcs(*llvm_module, live_funcs);
349 }
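/* A reduced sketch of the same legacy-pass pipeline applied to a standalone
   module (assuming a `module` built elsewhere); it mirrors the ordering used
   above, with the always-inliner first and the scalar cleanups after it:

     llvm::legacy::PassManager pm;
     pm.add(llvm::createVerifierPass());
     pm.add(llvm::createAlwaysInlinerLegacyPass());
     pm.add(llvm::createSROAPass());
     pm.add(llvm::createEarlyCSEPass(true));  // true = use MemorySSA
     pm.add(llvm::createCFGSimplificationPass());
     pm.add(llvm::createInstructionCombiningPass());
     pm.run(*module);
*/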
350 #endif
351 
352 } // namespace
353 
355 
356 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine)
357  : execution_engine_(execution_engine) {}
358 
359 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine,
360  const CompilationOptions& co)
361  : execution_engine_(execution_engine) {
362  if (execution_engine_) {
364 #ifdef ENABLE_INTEL_JIT_LISTENER
365  intel_jit_listener_.reset(llvm::JITEventListener::createIntelJITEventListener());
367  execution_engine_->RegisterJITEventListener(intel_jit_listener_.get());
368  LOG(INFO) << "Registered IntelJITEventListener";
369 #else
370  LOG(WARNING) << "This build is not Intel JIT Listener enabled. Ignoring Intel JIT "
371  "listener configuration parameter.";
372 #endif // ENABLE_INTEL_JIT_LISTENER
373  }
374  }
375 }
376 
377 ExecutionEngineWrapper& ExecutionEngineWrapper::operator=(
378  llvm::ExecutionEngine* execution_engine) {
379  execution_engine_.reset(execution_engine);
380  intel_jit_listener_ = nullptr;
381  return *this;
382 }
383 
384 void verify_function_ir(const llvm::Function* func) {
385  std::stringstream err_ss;
386  llvm::raw_os_ostream err_os(err_ss);
387  err_os << "\n-----\n";
388  if (llvm::verifyFunction(*func, &err_os)) {
389  err_os << "\n-----\n";
390  func->print(err_os, nullptr);
391  err_os << "\n-----\n";
392  LOG(FATAL) << err_ss.str();
393  }
394 }
395 
396 namespace {
397 
398 std::string assemblyForCPU(ExecutionEngineWrapper& execution_engine,
399  llvm::Module* llvm_module) {
400  llvm::legacy::PassManager pass_manager;
401  auto cpu_target_machine = execution_engine->getTargetMachine();
402  CHECK(cpu_target_machine);
403  llvm::SmallString<256> code_str;
404  llvm::raw_svector_ostream os(code_str);
405 #if LLVM_VERSION_MAJOR >= 10
406  cpu_target_machine->addPassesToEmitFile(
407  pass_manager, os, nullptr, llvm::CGFT_AssemblyFile);
408 #else
409  cpu_target_machine->addPassesToEmitFile(
410  pass_manager, os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
411 #endif
412  pass_manager.run(*llvm_module);
413  return "Assembly for the CPU:\n" + std::string(code_str.str()) + "\nEnd of assembly";
414 }
415 
416 ExecutionEngineWrapper create_execution_engine(llvm::Module* llvm_module,
417  llvm::EngineBuilder& eb,
418  const CompilationOptions& co) {
419  auto timer = DEBUG_TIMER(__func__);
420  // Avoids data race in
421  // llvm::sys::DynamicLibrary::getPermanentLibrary and
422  // GDBJITRegistrationListener::notifyObjectLoaded while creating a
423  // new ExecutionEngine instance. Unfortunately we have to use a global
424  // mutex here.
425  std::lock_guard<llvm::sys::Mutex> lock(g_ee_create_mutex);
426  ExecutionEngineWrapper execution_engine(eb.create(), co);
427  CHECK(execution_engine.get());
428  // Force the module data layout to match the layout for the selected target
429  llvm_module->setDataLayout(execution_engine->getDataLayout());
430 
431  LOG(ASM) << assemblyForCPU(execution_engine, llvm_module);
432 
433  execution_engine->finalizeObject();
434  return execution_engine;
435 }
436 
437 } // namespace
438 
439 ExecutionEngineWrapper CodeGenerator::generateNativeCPUCode(
440  llvm::Function* func,
441  const std::unordered_set<llvm::Function*>& live_funcs,
442  const CompilationOptions& co) {
443  auto timer = DEBUG_TIMER(__func__);
444  llvm::Module* llvm_module = func->getParent();
445  // run optimizations
446 #ifndef WITH_JIT_DEBUG
447  llvm::legacy::PassManager pass_manager;
448  optimize_ir(
449  func, llvm_module, pass_manager, live_funcs, /*is_gpu_smem_used=*/false, co);
450 #endif // WITH_JIT_DEBUG
451 
452  auto init_err = llvm::InitializeNativeTarget();
453  CHECK(!init_err);
454 
455  llvm::InitializeAllTargetMCs();
456  llvm::InitializeNativeTargetAsmPrinter();
457  llvm::InitializeNativeTargetAsmParser();
458 
459  std::string err_str;
460  std::unique_ptr<llvm::Module> owner(llvm_module);
461  llvm::EngineBuilder eb(std::move(owner));
462  eb.setErrorStr(&err_str);
463  eb.setEngineKind(llvm::EngineKind::JIT);
464  llvm::TargetOptions to;
465  to.EnableFastISel = true;
466  eb.setTargetOptions(to);
468  eb.setOptLevel(llvm::CodeGenOpt::None);
469  }
470 
471  return create_execution_engine(llvm_module, eb, co);
472 }
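/* A minimal MCJIT sketch following the same EngineBuilder flow, assuming an
   `llvm::Module* m` whose ownership is transferred to the engine and a
   function named "query_stub" defined in it (both hypothetical):

     llvm::InitializeNativeTarget();
     llvm::InitializeNativeTargetAsmPrinter();
     std::string err;
     llvm::EngineBuilder eb(std::unique_ptr<llvm::Module>(m));
     eb.setErrorStr(&err);
     eb.setEngineKind(llvm::EngineKind::JIT);
     std::unique_ptr<llvm::ExecutionEngine> ee(eb.create());
     ee->finalizeObject();
     auto entry_addr = ee->getFunctionAddress("query_stub");
*/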
473 
474 std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenCPU(
475  llvm::Function* query_func,
476  llvm::Function* multifrag_query_func,
477  const std::unordered_set<llvm::Function*>& live_funcs,
478  const CompilationOptions& co) {
479  CodeCacheKey key{serialize_llvm_object(query_func),
480  serialize_llvm_object(cgen_state_->row_func_)};
481 
482  llvm::Module* M = query_func->getParent();
483  auto* flag = llvm::mdconst::extract_or_null<llvm::ConstantInt>(
484  M->getModuleFlag("manage_memory_buffer"));
485  if (flag and flag->getZExtValue() == 1 and M->getFunction("allocate_varlen_buffer") and
486  M->getFunction("register_buffer_with_executor_rsm")) {
487  LOG(INFO) << "including executor addr to cache key\n";
488  key.push_back(std::to_string(reinterpret_cast<int64_t>(this)));
489  }
490  if (cgen_state_->filter_func_) {
491  key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
492  }
493  for (const auto helper : cgen_state_->helper_functions_) {
494  key.push_back(serialize_llvm_object(helper));
495  }
496  auto cached_code = cpu_code_accessor.get_value(key);
497  if (cached_code) {
498  return cached_code;
499  }
500 
501  if (cgen_state_->needs_geos_) {
502 #ifdef ENABLE_GEOS
503  auto llvm_module = multifrag_query_func->getParent();
504  load_geos_dynamic_library();
505 
506  // Read geos runtime module and bind GEOS API function references to GEOS library
507  auto rt_geos_module_copy = llvm::CloneModule(
508  *get_geos_module(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
509  auto func = llvm::dyn_cast<llvm::Function>(gv);
510  if (!func) {
511  return true;
512  }
513  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
514  func->getLinkage() ==
515  llvm::GlobalValue::LinkageTypes::InternalLinkage ||
516  func->getLinkage() == llvm::GlobalValue::LinkageTypes::ExternalLinkage);
517  });
518  CodeGenerator::link_udf_module(rt_geos_module_copy,
519  *llvm_module,
520  cgen_state_.get(),
521  llvm::Linker::Flags::LinkOnlyNeeded);
522 #else
523  throw std::runtime_error("GEOS is disabled in this build");
524 #endif
525  }
526 
527  auto execution_engine =
528  CodeGenerator::generateNativeCPUCode(query_func, live_funcs, co);
529  auto cpu_compilation_context =
530  std::make_shared<CpuCompilationContext>(std::move(execution_engine));
531  cpu_compilation_context->setFunctionPointer(multifrag_query_func);
532  cpu_code_accessor.put(key, cpu_compilation_context);
533  return std::dynamic_pointer_cast<CompilationContext>(cpu_compilation_context);
534 }
535 
536 void CodeGenerator::link_udf_module(const std::unique_ptr<llvm::Module>& udf_module,
537  llvm::Module& llvm_module,
538  CgenState* cgen_state,
539  llvm::Linker::Flags flags) {
540  auto timer = DEBUG_TIMER(__func__);
541  // throw a runtime error if the target module contains functions
542  // with the same name as functions in the UDF module.
543  for (auto& f : *udf_module) {
544  auto func = llvm_module.getFunction(f.getName());
545  if (!(func == nullptr) && !f.isDeclaration() && flags == llvm::Linker::Flags::None) {
546  LOG(ERROR) << " Attempt to overwrite " << f.getName().str() << " in "
547  << llvm_module.getModuleIdentifier() << " from `"
548  << udf_module->getModuleIdentifier() << "`" << std::endl;
549  throw std::runtime_error(
550  "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
551  "function ***");
552  } else {
553  VLOG(1) << " Adding " << f.getName().str() << " to "
554  << llvm_module.getModuleIdentifier() << " from `"
555  << udf_module->getModuleIdentifier() << "`" << std::endl;
556  }
557  }
558 
559  auto udf_module_copy = llvm::CloneModule(*udf_module, cgen_state->vmap_);
560 
561  udf_module_copy->setDataLayout(llvm_module.getDataLayout());
562  udf_module_copy->setTargetTriple(llvm_module.getTargetTriple());
563 
564  // Initialize linker with module for RuntimeFunctions.bc
565  llvm::Linker ld(llvm_module);
566  bool link_error = false;
567 
568  link_error = ld.linkInModule(std::move(udf_module_copy), flags);
569 
570  if (link_error) {
571  throw std::runtime_error("link_udf_module: *** error linking module ***");
572  }
573 }
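/* The core of the linking step above, as a standalone sketch; `dst` and `src`
   are hypothetical modules living in the same LLVMContext:

     auto src_copy = llvm::CloneModule(*src);  // keep the original UDF module intact
     src_copy->setDataLayout(dst->getDataLayout());
     src_copy->setTargetTriple(dst->getTargetTriple());
     llvm::Linker linker(*dst);
     // linkInModule() returns true on error
     if (linker.linkInModule(std::move(src_copy), llvm::Linker::Flags::LinkOnlyNeeded)) {
       throw std::runtime_error("linking failed");
     }
*/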
574 
575 namespace {
576 
577 std::string cpp_to_llvm_name(const std::string& s) {
578  if (s == "int8_t") {
579  return "i8";
580  }
581  if (s == "int16_t") {
582  return "i16";
583  }
584  if (s == "int32_t") {
585  return "i32";
586  }
587  if (s == "int64_t") {
588  return "i64";
589  }
590  CHECK(s == "float" || s == "double");
591  return s;
592 }
593 
594 std::string gen_array_any_all_sigs() {
595  std::string result;
596  for (const std::string any_or_all : {"any", "all"}) {
597  for (const std::string elem_type :
598  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
599  for (const std::string needle_type :
600  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
601  for (const std::string op_name : {"eq", "ne", "lt", "le", "gt", "ge"}) {
602  result += ("declare i1 @array_" + any_or_all + "_" + op_name + "_" + elem_type +
603  "_" + needle_type + "(i8*, i64, " + cpp_to_llvm_name(needle_type) +
604  ", " + cpp_to_llvm_name(elem_type) + ");\n");
605  }
606  }
607  }
608  }
609  return result;
610 }
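/* For example, with any_or_all = "any", op_name = "eq", elem_type = "int32_t"
   and needle_type = "int64_t", the loops above emit:

     declare i1 @array_any_eq_int32_t_int64_t(i8*, i64, i64, i32);
*/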
611 
612 std::string gen_translate_null_key_sigs() {
613  std::string result;
614  for (const std::string key_type : {"int8_t", "int16_t", "int32_t", "int64_t"}) {
615  const auto key_llvm_type = cpp_to_llvm_name(key_type);
616  result += "declare i64 @translate_null_key_" + key_type + "(" + key_llvm_type + ", " +
617  key_llvm_type + ", i64);\n";
618  }
619  return result;
620 }
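/* For example, for key_type = "int32_t" the loop above emits:

     declare i64 @translate_null_key_int32_t(i32, i32, i64);
*/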
621 
622 const std::string cuda_rt_decls =
623  R"( declare void @llvm.dbg.declare(metadata, metadata, metadata) declare void @llvm.dbg.value(metadata, metadata, metadata) declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind declare i64 @get_thread_index(); declare i64 @get_block_index(); declare i32 @pos_start_impl(i32*); declare i32 @group_buff_idx_impl(); declare i32 @pos_step_impl(); declare i8 @thread_warp_idx(i8); declare i64* @init_shared_mem(i64*, i32); declare i64* @init_shared_mem_nop(i64*, i32); declare i64* @declare_dynamic_shared_memory(); declare void @write_back_nop(i64*, i64*, i32); declare void @write_back_non_grouped_agg(i64*, i64*, i32); declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8); declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32); declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32); declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32); declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32); declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32); declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32); declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64); declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64); declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64); declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64); declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64); declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double); declare i64 @get_bucket_key_for_range_double(i8*, i64, double); declare i32 @get_num_buckets_for_bounds(i8*, i32, double, double); declare i64 @get_candidate_rows(i32*, i32, i8*, i32, double, double, i32, i64, i64*, i64, i64, i64); declare i64 @agg_count_shared(i64*, i64); declare i64 @agg_count_skip_val_shared(i64*, i64, i64); declare i32 @agg_count_int32_shared(i32*, i32); declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32); declare i64 @agg_count_double_shared(i64*, double); declare i64 @agg_count_double_skip_val_shared(i64*, double, double); declare i32 @agg_count_float_shared(i32*, float); declare i32 @agg_count_float_skip_val_shared(i32*, float, float); declare i64 @agg_sum_shared(i64*, i64); declare i64 @agg_sum_skip_val_shared(i64*, i64, i64); declare i32 @agg_sum_int32_shared(i32*, i32); declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32); declare void @agg_sum_double_shared(i64*, double); declare void @agg_sum_double_skip_val_shared(i64*, double, double); declare void @agg_sum_float_shared(i32*, float); declare void @agg_sum_float_skip_val_shared(i32*, float, float); declare void @agg_max_shared(i64*, i64); declare void @agg_max_skip_val_shared(i64*, i64, i64); declare void @agg_max_int32_shared(i32*, i32); declare void @agg_max_int32_skip_val_shared(i32*, i32, i32); declare void @agg_max_int16_shared(i16*, i16); declare void @agg_max_int16_skip_val_shared(i16*, i16, i16); declare void @agg_max_int8_shared(i8*, i8); declare void @agg_max_int8_skip_val_shared(i8*, i8, i8); declare void @agg_max_double_shared(i64*, double); declare void @agg_max_double_skip_val_shared(i64*, double, double); declare void @agg_max_float_shared(i32*, float); declare void @agg_max_float_skip_val_shared(i32*, float, float); declare void @agg_min_shared(i64*, i64); declare void 
@agg_min_skip_val_shared(i64*, i64, i64); declare void @agg_min_int32_shared(i32*, i32); declare void @agg_min_int32_skip_val_shared(i32*, i32, i32); declare void @agg_min_int16_shared(i16*, i16); declare void @agg_min_int16_skip_val_shared(i16*, i16, i16); declare void @agg_min_int8_shared(i8*, i8); declare void @agg_min_int8_skip_val_shared(i8*, i8, i8); declare void @agg_min_double_shared(i64*, double); declare void @agg_min_double_skip_val_shared(i64*, double, double); declare void @agg_min_float_shared(i32*, float); declare void @agg_min_float_skip_val_shared(i32*, float, float); declare void @agg_id_shared(i64*, i64); declare i8* @agg_id_varlen_shared(i8*, i64, i8*, i64); declare void @agg_id_int32_shared(i32*, i32); declare void @agg_id_int16_shared(i16*, i16); declare void @agg_id_int8_shared(i8*, i8); declare void @agg_id_double_shared(i64*, double); declare void @agg_id_double_shared_slow(i64*, double*); declare void @agg_id_float_shared(i32*, float); declare i32 @checked_single_agg_id_shared(i64*, i64, i64); declare i32 @checked_single_agg_id_double_shared(i64*, double, double); declare i32 @checked_single_agg_id_double_shared_slow(i64*, double*, double); declare i32 @checked_single_agg_id_float_shared(i32*, float, float); declare i1 @slotEmptyKeyCAS(i64*, i64, i64); declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32); declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16); declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8); declare i64 @datetrunc_century(i64); declare i64 @datetrunc_day(i64); declare i64 @datetrunc_decade(i64); declare i64 @datetrunc_hour(i64); declare i64 @datetrunc_millennium(i64); declare i64 @datetrunc_minute(i64); declare i64 @datetrunc_month(i64); declare i64 @datetrunc_quarter(i64); declare i64 @datetrunc_quarterday(i64); declare i64 @datetrunc_week_monday(i64); declare i64 @datetrunc_week_sunday(i64); declare i64 @datetrunc_week_saturday(i64); declare i64 @datetrunc_year(i64); declare i64 @extract_epoch(i64); declare i64 @extract_dateepoch(i64); declare i64 @extract_quarterday(i64); declare i64 @extract_hour(i64); declare i64 @extract_minute(i64); declare i64 @extract_second(i64); declare i64 @extract_millisecond(i64); declare i64 @extract_microsecond(i64); declare i64 @extract_nanosecond(i64); declare i64 @extract_dow(i64); declare i64 @extract_isodow(i64); declare i64 @extract_day(i64); declare i64 @extract_week_monday(i64); declare i64 @extract_week_sunday(i64); declare i64 @extract_week_saturday(i64); declare i64 @extract_day_of_year(i64); declare i64 @extract_month(i64); declare i64 @extract_quarter(i64); declare i64 @extract_year(i64); declare i64 @DateTruncateHighPrecisionToDate(i64, i64); declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64); declare i64 @DateDiff(i32, i64, i64); declare i64 @DateDiffNullable(i32, i64, i64, i64); declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i32); declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i32, i64); declare i64 @DateAdd(i32, i64, i64); declare i64 @DateAddNullable(i32, i64, i64, i64); declare i64 @DateAddHighPrecision(i32, i64, i64, i32); declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i32, i64); declare i64 @string_decode(i8*, i64); declare i32 @array_size(i8*, i64, i32); declare i32 @array_size_nullable(i8*, i64, i32, i32); declare i32 @fast_fixlen_array_size(i8*, i32); declare i1 @array_is_null(i8*, i64); declare i1 @point_coord_array_is_null(i8*, i64); declare i8* @array_buff(i8*, i64); declare i8* @fast_fixlen_array_buff(i8*, i64); declare i8 @array_at_int8_t(i8*, 
i64, i32); declare i16 @array_at_int16_t(i8*, i64, i32); declare i32 @array_at_int32_t(i8*, i64, i32); declare i64 @array_at_int64_t(i8*, i64, i32); declare float @array_at_float(i8*, i64, i32); declare double @array_at_double(i8*, i64, i32); declare i8 @varlen_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_array_at_int64_t(i8*, i64, i32); declare float @varlen_array_at_float(i8*, i64, i32); declare double @varlen_array_at_double(i8*, i64, i32); declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32); declare float @varlen_notnull_array_at_float(i8*, i64, i32); declare double @varlen_notnull_array_at_double(i8*, i64, i32); declare i8 @array_at_int8_t_checked(i8*, i64, i64, i8); declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16); declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32); declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64); declare float @array_at_float_checked(i8*, i64, i64, float); declare double @array_at_double_checked(i8*, i64, i64, double); declare i32 @char_length(i8*, i32); declare i32 @char_length_nullable(i8*, i32, i32); declare i32 @char_length_encoded(i8*, i32); declare i32 @char_length_encoded_nullable(i8*, i32, i32); declare i32 @key_for_string_encoded(i32); declare i1 @sample_ratio(double, i64); declare double @width_bucket(double, double, double, double, i32); declare double @width_bucket_reverse(double, double, double, double, i32); declare double @width_bucket_nullable(double, double, double, double, i32, double); declare double @width_bucket_reversed_nullable(double, double, double, double, i32, double); declare double @width_bucket_no_oob_check(double, double, double); declare double @width_bucket_reverse_no_oob_check(double, double, double); declare double @width_bucket_expr(double, i1, double, double, i32); declare double @width_bucket_expr_nullable(double, i1, double, double, i32, double); declare double @width_bucket_expr_no_oob_check(double, i1, double, double, i32); declare i1 @string_like(i8*, i32, i8*, i32, i8); declare i1 @string_ilike(i8*, i32, i8*, i32, i8); declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8); declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8); declare i1 @string_like_simple(i8*, i32, i8*, i32); declare i1 @string_ilike_simple(i8*, i32, i8*, i32); declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8); declare i1 @string_lt(i8*, i32, i8*, i32); declare i1 @string_le(i8*, i32, i8*, i32); declare i1 @string_gt(i8*, i32, i8*, i32); declare i1 @string_ge(i8*, i32, i8*, i32); declare i1 @string_eq(i8*, i32, i8*, i32); declare i1 @string_ne(i8*, i32, i8*, i32); declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8); declare i1 @regexp_like(i8*, i32, i8*, i32, i8); declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8); declare void @linear_probabilistic_count(i8*, i32, i8*, i32); declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, 
i64); declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64); declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64); declare void @record_error_code(i32, i32*); declare i32 @get_error_code(i32*); declare i1 @dynamic_watchdog(); declare i1 @check_interrupt(); declare void @force_sync(); declare void @sync_warp(); declare void @sync_warp_protected(i64, i64); declare void @sync_threadblock(); declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32); declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64); declare i64* @get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, float, float); declare i64* @get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double); declare double @decompress_x_coord_geoint(i32); declare double @decompress_y_coord_geoint(i32); declare i32 @compress_x_coord_geoint(double); declare i32 @compress_y_coord_geoint(double); )" + gen_array_any_all_sigs() +
624  gen_translate_null_key_sigs();
625 
626 #ifdef HAVE_CUDA
627 std::string extension_function_decls(const std::unordered_set<std::string>& udf_decls) {
628  const auto decls =
629  ExtensionFunctionsWhitelist::getLLVMDeclarations(udf_decls, /*is_gpu=*/true);
630  return boost::algorithm::join(decls, "\n");
631 }
632 
633 void legalize_nvvm_ir(llvm::Function* query_func) {
634  // optimizations might add attributes to the function
635  // and NVPTX doesn't understand all of them; play it
636  // safe and clear all attributes
637  clear_function_attributes(query_func);
638  verify_function_ir(query_func);
639 
640  std::vector<llvm::Instruction*> stackrestore_intrinsics;
641  std::vector<llvm::Instruction*> stacksave_intrinsics;
642  std::vector<llvm::Instruction*> lifetime;
643  for (auto& BB : *query_func) {
644  for (llvm::Instruction& I : BB) {
645  if (const llvm::IntrinsicInst* II = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
646  if (II->getIntrinsicID() == llvm::Intrinsic::stacksave) {
647  stacksave_intrinsics.push_back(&I);
648  } else if (II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
649  stackrestore_intrinsics.push_back(&I);
650  } else if (II->getIntrinsicID() == llvm::Intrinsic::lifetime_start ||
651  II->getIntrinsicID() == llvm::Intrinsic::lifetime_end) {
652  lifetime.push_back(&I);
653  }
654  }
655  }
656  }
657 
658  // stacksave and stackrestore intrinsics appear together, and
659  // stackrestore uses the stacksave result as its argument,
660  // so it should be removed first.
661  for (auto& II : stackrestore_intrinsics) {
662  II->eraseFromParent();
663  }
664  for (auto& II : stacksave_intrinsics) {
665  II->eraseFromParent();
666  }
667  // Remove lifetime intrinsics as well. NVPTX doesn't like them.
668  for (auto& II : lifetime) {
669  II->eraseFromParent();
670  }
671 }
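/* An illustrative IR fragment (not taken from this module) showing why the
   removal order above matters: the stackrestore call consumes the stacksave
   result, so erasing the stacksave first would leave a dangling use.

     %sp = call i8* @llvm.stacksave()
     ; ... allocas for a dynamically sized scope ...
     call void @llvm.stackrestore(i8* %sp)
*/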
672 #endif // HAVE_CUDA
673 
674 } // namespace
675 
676 llvm::StringRef get_gpu_target_triple_string() {
677  return llvm::StringRef("nvptx64-nvidia-cuda");
678 }
679 
680 llvm::StringRef get_gpu_data_layout() {
681  return llvm::StringRef(
682  "e-p:64:64:64-i1:8:8-i8:8:8-"
683  "i16:16:16-i32:32:32-i64:64:64-"
684  "f32:32:32-f64:64:64-v16:16:16-"
685  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
686 }
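// A rough reading of the layout string above: "e" = little endian,
// "p:64:64:64" = 64-bit pointers with 64-bit ABI/preferred alignment,
// "iN:a:b" / "fN:a:b" / "vN:a:b" = integer/float/vector types of width N with
// ABI alignment a and preferred alignment b, and "n16:32:64" = natively
// supported integer widths.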
687 
688 std::map<std::string, std::string> get_device_parameters(bool cpu_only) {
689  std::map<std::string, std::string> result;
690 
691  result.insert(std::make_pair("cpu_name", llvm::sys::getHostCPUName()));
692  result.insert(std::make_pair("cpu_triple", llvm::sys::getProcessTriple()));
693  result.insert(
694  std::make_pair("cpu_cores", std::to_string(llvm::sys::getHostNumPhysicalCores())));
695  result.insert(std::make_pair("cpu_threads", std::to_string(cpu_threads())));
696 
697  // https://en.cppreference.com/w/cpp/language/types
698  std::string sizeof_types;
699  sizeof_types += "bool:" + std::to_string(sizeof(bool)) + ";";
700  sizeof_types += "size_t:" + std::to_string(sizeof(size_t)) + ";";
701  sizeof_types += "ssize_t:" + std::to_string(sizeof(ssize_t)) + ";";
702  sizeof_types += "char:" + std::to_string(sizeof(char)) + ";";
703  sizeof_types += "uchar:" + std::to_string(sizeof(unsigned char)) + ";";
704  sizeof_types += "short:" + std::to_string(sizeof(short)) + ";";
705  sizeof_types += "ushort:" + std::to_string(sizeof(unsigned short int)) + ";";
706  sizeof_types += "int:" + std::to_string(sizeof(int)) + ";";
707  sizeof_types += "uint:" + std::to_string(sizeof(unsigned int)) + ";";
708  sizeof_types += "long:" + std::to_string(sizeof(long int)) + ";";
709  sizeof_types += "ulong:" + std::to_string(sizeof(unsigned long int)) + ";";
710  sizeof_types += "longlong:" + std::to_string(sizeof(long long int)) + ";";
711  sizeof_types += "ulonglong:" + std::to_string(sizeof(unsigned long long int)) + ";";
712  sizeof_types += "float:" + std::to_string(sizeof(float)) + ";";
713  sizeof_types += "double:" + std::to_string(sizeof(double)) + ";";
714  sizeof_types += "longdouble:" + std::to_string(sizeof(long double)) + ";";
715  sizeof_types += "voidptr:" + std::to_string(sizeof(void*)) + ";";
716 
717  result.insert(std::make_pair("type_sizeof", sizeof_types));
718 
719  std::string null_values;
720  null_values += "boolean1:" + std::to_string(serialized_null_value<bool>()) + ";";
721  null_values += "boolean8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
722  null_values += "int8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
723  null_values += "int16:" + std::to_string(serialized_null_value<int16_t>()) + ";";
724  null_values += "int32:" + std::to_string(serialized_null_value<int32_t>()) + ";";
725  null_values += "int64:" + std::to_string(serialized_null_value<int64_t>()) + ";";
726  null_values += "uint8:" + std::to_string(serialized_null_value<uint8_t>()) + ";";
727  null_values += "uint16:" + std::to_string(serialized_null_value<uint16_t>()) + ";";
728  null_values += "uint32:" + std::to_string(serialized_null_value<uint32_t>()) + ";";
729  null_values += "uint64:" + std::to_string(serialized_null_value<uint64_t>()) + ";";
730  null_values += "float32:" + std::to_string(serialized_null_value<float>()) + ";";
731  null_values += "float64:" + std::to_string(serialized_null_value<double>()) + ";";
732  null_values +=
733  "Array<boolean8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
734  null_values +=
735  "Array<int8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
736  null_values +=
737  "Array<int16>:" + std::to_string(serialized_null_value<int16_t, true>()) + ";";
738  null_values +=
739  "Array<int32>:" + std::to_string(serialized_null_value<int32_t, true>()) + ";";
740  null_values +=
741  "Array<int64>:" + std::to_string(serialized_null_value<int64_t, true>()) + ";";
742  null_values +=
743  "Array<float32>:" + std::to_string(serialized_null_value<float, true>()) + ";";
744  null_values +=
745  "Array<float64>:" + std::to_string(serialized_null_value<double, true>()) + ";";
746 
747  result.insert(std::make_pair("null_values", null_values));
748 
749  llvm::StringMap<bool> cpu_features;
750  if (llvm::sys::getHostCPUFeatures(cpu_features)) {
751  std::string features_str = "";
752  for (auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
753  features_str += (it->getValue() ? " +" : " -");
754  features_str += it->getKey().str();
755  }
756  result.insert(std::make_pair("cpu_features", features_str));
757  }
758 
759  result.insert(std::make_pair("llvm_version",
760  std::to_string(LLVM_VERSION_MAJOR) + "." +
761  std::to_string(LLVM_VERSION_MINOR) + "." +
762  std::to_string(LLVM_VERSION_PATCH)));
763 
764 #ifdef HAVE_CUDA
765  if (!cpu_only) {
766  int device_count = 0;
767  checkCudaErrors(cuDeviceGetCount(&device_count));
768  if (device_count) {
769  CUdevice device{};
770  char device_name[256];
771  int major = 0, minor = 0;
772  int driver_version;
773  checkCudaErrors(cuDeviceGet(&device, 0)); // assuming homogeneous multi-GPU system
774  checkCudaErrors(cuDeviceGetName(device_name, 256, device));
775  checkCudaErrors(cuDeviceGetAttribute(
776  &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
777  checkCudaErrors(cuDeviceGetAttribute(
778  &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
779  checkCudaErrors(cuDriverGetVersion(&driver_version));
780 
781  result.insert(std::make_pair("gpu_name", device_name));
782  result.insert(std::make_pair("gpu_count", std::to_string(device_count)));
783  result.insert(std::make_pair("gpu_compute_capability",
784  std::to_string(major) + "." + std::to_string(minor)));
785  result.insert(std::make_pair("gpu_triple", get_gpu_target_triple_string()));
786  result.insert(std::make_pair("gpu_datalayout", get_gpu_data_layout()));
787  result.insert(std::make_pair("gpu_driver",
788  "CUDA " + std::to_string(driver_version / 1000) + "." +
789  std::to_string((driver_version % 1000) / 10)));
790  }
791  }
792 #endif
793 
794  return result;
795 }
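/* Typical entries of the returned map (values shown here are illustrative):

     "cpu_name"               -> "skylake-avx512"
     "cpu_threads"            -> "32"
     "llvm_version"           -> "9.0.1"
     "gpu_name"               -> "<device name>"   (CUDA builds, cpu_only=false)
     "gpu_compute_capability" -> "7.5"
*/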
796 
797 namespace {
798 
799 #ifdef HAVE_CUDA
800 std::unordered_set<llvm::Function*> findAliveRuntimeFuncs(
801  llvm::Module& llvm_module,
802  const std::vector<llvm::Function*>& roots) {
803  std::queue<llvm::Function*> queue;
804  std::unordered_set<llvm::Function*> visited;
805  for (llvm::Function* F : roots) {
806  queue.push(F);
807  }
808 
809  while (!queue.empty()) {
810  llvm::Function* F = queue.front();
811  queue.pop();
812  if (visited.find(F) != visited.end()) {
813  continue;
814  }
815  visited.insert(F);
816 
817  for (llvm::inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) {
818  if (llvm::CallInst* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
819  if (CI->isInlineAsm()) // libdevice calls inline assembly code
820  continue;
821  llvm::Function* called = CI->getCalledFunction();
822  if (!called || visited.find(called) != visited.end()) {
823  continue;
824  }
825  queue.push(called);
826  }
827  }
828  }
829  return visited;
830 }
831 #endif
832 
833 } // namespace
834 
835 void CodeGenerator::linkModuleWithLibdevice(
836  Executor* executor,
837  llvm::Module& llvm_module,
838  llvm::PassManagerBuilder& pass_manager_builder,
839  const GPUTarget& gpu_target) {
840 #ifdef HAVE_CUDA
841  auto timer = DEBUG_TIMER(__func__);
842 
843  if (!executor->has_libdevice_module()) {
844  // raise error
845  throw std::runtime_error(
846  "libdevice library is not available but required by the UDF module");
847  }
848 
849  // Save the functions defined in the module
850  std::vector<llvm::Function*> roots;
851  for (llvm::Function& fn : llvm_module) {
852  if (!fn.isDeclaration())
853  roots.emplace_back(&fn);
854  }
855 
856  // Bind libdevice to the current module
857  CodeGenerator::link_udf_module(executor->get_libdevice_module(),
858  llvm_module,
859  gpu_target.cgen_state,
860  llvm::Linker::Flags::OverrideFromSrc);
861 
862  std::unordered_set<llvm::Function*> live_funcs =
863  findAliveRuntimeFuncs(llvm_module, roots);
864 
865  std::vector<llvm::Function*> funcs_to_delete;
866  for (llvm::Function& fn : llvm_module) {
867  if (!live_funcs.count(&fn)) {
868  // deleting the function here would invalidate the iterator
869  funcs_to_delete.emplace_back(&fn);
870  }
871  }
872 
873  for (llvm::Function* f : funcs_to_delete) {
874  f->eraseFromParent();
875  }
876 
877  // activate nvvm-reflect-ftz flag on the module
878 #if LLVM_VERSION_MAJOR >= 11
879  llvm::LLVMContext& ctx = llvm_module.getContext();
880  llvm_module.setModuleFlag(llvm::Module::Override,
881  "nvvm-reflect-ftz",
882  llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
883  llvm::Type::getInt32Ty(ctx), uint32_t(1))));
884 #else
885  llvm_module.addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", uint32_t(1));
886 #endif
887  for (llvm::Function& fn : llvm_module) {
888  fn.addFnAttr("nvptx-f32ftz", "true");
889  }
890 
891  // add nvvm reflect pass replacing any NVVM conditionals with constants
892  gpu_target.nvptx_target_machine->adjustPassManager(pass_manager_builder);
893  llvm::legacy::FunctionPassManager FPM(&llvm_module);
894  pass_manager_builder.populateFunctionPassManager(FPM);
895 
896  // Run the NVVMReflectPass here rather than inside optimize_ir
897  FPM.doInitialization();
898  for (auto& F : llvm_module) {
899  FPM.run(F);
900  }
901  FPM.doFinalization();
902 #endif
903 }
904 
905 std::shared_ptr<GpuCompilationContext> CodeGenerator::generateNativeGPUCode(
906  Executor* executor,
907  llvm::Function* func,
908  llvm::Function* wrapper_func,
909  const std::unordered_set<llvm::Function*>& live_funcs,
910  const bool is_gpu_smem_used,
911  const CompilationOptions& co,
912  const GPUTarget& gpu_target) {
913 #ifdef HAVE_CUDA
914  auto timer = DEBUG_TIMER(__func__);
915  auto llvm_module = func->getParent();
916  /*
917  `func` is one of the following generated functions:
918  - `call_table_function(i8** %input_col_buffers, i64*
919  %input_row_count, i64** %output_buffers, i64* %output_row_count)`
920  that wraps the user-defined table function.
921  - `multifrag_query`
922  - `multifrag_query_hoisted_literals`
923  - ...
924 
925  `wrapper_func` is table_func_kernel(i32*, i8**, i64*, i64**,
926  i64*) that wraps `call_table_function`.
927 
928  `llvm_module` is from `build/QueryEngine/RuntimeFunctions.bc` and it
929  contains `func` and `wrapper_func`. `llvm_module` should also contain
930  the definitions of user-defined table functions.
931 
932  `live_funcs` contains table_func_kernel and call_table_function
933 
934  `gpu_target.cgen_state->module_` appears to be the same as `llvm_module`
935  */
936  CHECK(gpu_target.cgen_state->module_ == llvm_module);
937  CHECK(func->getParent() == wrapper_func->getParent());
938  llvm_module->setDataLayout(
939  "e-p:64:64:64-i1:8:8-i8:8:8-"
940  "i16:16:16-i32:32:32-i64:64:64-"
941  "f32:32:32-f64:64:64-v16:16:16-"
942  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
943  llvm_module->setTargetTriple("nvptx64-nvidia-cuda");
944  CHECK(gpu_target.nvptx_target_machine);
945  llvm::PassManagerBuilder pass_manager_builder = llvm::PassManagerBuilder();
946 
947  pass_manager_builder.OptLevel = 0;
948  llvm::legacy::PassManager module_pass_manager;
949  pass_manager_builder.populateModulePassManager(module_pass_manager);
950 
951  bool requires_libdevice = check_module_requires_libdevice(llvm_module);
952 
953  if (requires_libdevice) {
954  linkModuleWithLibdevice(executor, *llvm_module, pass_manager_builder, gpu_target);
955  }
956 
957  // run optimizations
958  optimize_ir(func, llvm_module, module_pass_manager, live_funcs, is_gpu_smem_used, co);
959  legalize_nvvm_ir(func);
960 
961  std::stringstream ss;
962  llvm::raw_os_ostream os(ss);
963 
964  llvm::LLVMContext& ctx = llvm_module->getContext();
965  // Get "nvvm.annotations" metadata node
966  llvm::NamedMDNode* md = llvm_module->getOrInsertNamedMetadata("nvvm.annotations");
967 
968  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
969  llvm::MDString::get(ctx, "kernel"),
970  llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
971  llvm::Type::getInt32Ty(ctx), 1))};
972 
973  // Append metadata to nvvm.annotations
974  md->addOperand(llvm::MDNode::get(ctx, md_vals));
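/* Schematically, the module now carries textual IR metadata of the form
   (wrapper function name and signature illustrative):

     !nvvm.annotations = !{!0}
     !0 = !{void (i32*, i8**, i64*, i64**, i64*)* @table_func_kernel, !"kernel", i32 1}

   which is what the NVPTX backend uses to recognize the wrapper as a kernel
   entry point rather than a device function.
*/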
975 
976  std::unordered_set<llvm::Function*> roots{wrapper_func, func};
977  if (gpu_target.row_func_not_inlined) {
978  clear_function_attributes(gpu_target.cgen_state->row_func_);
979  roots.insert(gpu_target.cgen_state->row_func_);
980  if (gpu_target.cgen_state->filter_func_) {
981  roots.insert(gpu_target.cgen_state->filter_func_);
982  }
983  }
984 
985  // prevent helper functions from being removed
986  for (auto f : gpu_target.cgen_state->helper_functions_) {
987  roots.insert(f);
988  }
989 
990  if (requires_libdevice) {
991  for (llvm::Function& F : *llvm_module) {
992  // Some libdevice functions call other functions that start with the
993  // "__internal_" prefix, for example:
994  // __internal_trig_reduction_slowpathd
995  // __internal_accurate_pow
996  // __internal_lgamma_pos
997  // Those functions have a "noinline" attribute which prevents the optimizer from
998  // inlining them into the body of @query_func
999  if (F.hasName() && F.getName().startswith("__internal") && !F.isDeclaration()) {
1000  roots.insert(&F);
1001  }
1002  legalize_nvvm_ir(&F);
1003  }
1004  }
1005 
1006  // Prevent the udf function(s) from being removed the way the runtime functions are
1007  std::unordered_set<std::string> udf_declarations;
1008 
1009  if (executor->has_udf_module(/*is_gpu=*/true)) {
1010  for (auto& f : executor->get_udf_module(/*is_gpu=*/true)->getFunctionList()) {
1011  llvm::Function* udf_function = llvm_module->getFunction(f.getName());
1012 
1013  if (udf_function) {
1014  legalize_nvvm_ir(udf_function);
1015  roots.insert(udf_function);
1016 
1017  // If we have a udf that declares an external function,
1018  // note it so we can avoid duplicate declarations
1019  if (f.isDeclaration()) {
1020  udf_declarations.insert(f.getName().str());
1021  }
1022  }
1023  }
1024  }
1025 
1026  if (executor->has_rt_udf_module(/*is_gpu=*/true)) {
1027  for (auto& f : executor->get_rt_udf_module(/*is_gpu=*/true)->getFunctionList()) {
1028  llvm::Function* udf_function = llvm_module->getFunction(f.getName());
1029  if (udf_function) {
1030  legalize_nvvm_ir(udf_function);
1031  roots.insert(udf_function);
1032 
1033  // If we have a udf that declares an external function,
1034  // note it so we can avoid duplicate declarations
1035  if (f.isDeclaration()) {
1036  udf_declarations.insert(f.getName().str());
1037  }
1038  }
1039  }
1040  }
1041 
1042  std::vector<llvm::Function*> rt_funcs;
1043  for (auto& Fn : *llvm_module) {
1044  if (roots.count(&Fn)) {
1045  continue;
1046  }
1047  rt_funcs.push_back(&Fn);
1048  }
1049  for (auto& pFn : rt_funcs) {
1050  pFn->removeFromParent();
1051  }
1052 
1053  if (requires_libdevice) {
1054  add_intrinsics_to_module(llvm_module);
1055  }
1057  llvm_module->print(os, nullptr);
1058  os.flush();
1059 
1060  for (auto& pFn : rt_funcs) {
1061  llvm_module->getFunctionList().push_back(pFn);
1062  }
1063  llvm_module->eraseNamedMetadata(md);
1064 
1065  auto cuda_llir = ss.str() + cuda_rt_decls + extension_function_decls(udf_declarations);
1066  std::string ptx;
1067  try {
1068  ptx = generatePTX(
1069  cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state->context_);
1070  } catch (ParseIRError& e) {
1071  LOG(WARNING) << "Failed to generate PTX: " << e.what()
1072  << ". Switching to CPU execution target.";
1073  throw QueryMustRunOnCpu();
1074  }
1075  LOG(PTX) << "PTX for the GPU:\n" << ptx << "\nEnd of PTX";
1076 
1077  auto cubin_result = ptx_to_cubin(ptx, gpu_target.block_size, gpu_target.cuda_mgr);
1078  auto& option_keys = cubin_result.option_keys;
1079  auto& option_values = cubin_result.option_values;
1080  auto cubin = cubin_result.cubin;
1081  auto link_state = cubin_result.link_state;
1082  const auto num_options = option_keys.size();
1083 
1084  auto func_name = wrapper_func->getName().str();
1085  auto gpu_compilation_context = std::make_shared<GpuCompilationContext>();
1086  for (int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
1087  ++device_id) {
1088  gpu_compilation_context->addDeviceCode(
1089  std::make_unique<GpuDeviceCompilationContext>(cubin,
1090  func_name,
1091  device_id,
1092  gpu_target.cuda_mgr,
1093  num_options,
1094  &option_keys[0],
1095  &option_values[0]));
1096  }
1097 
1098  checkCudaErrors(cuLinkDestroy(link_state));
1099  return gpu_compilation_context;
1100 #else
1101  return {};
1102 #endif
1103 }
1104 
1105 std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenGPU(
1106  llvm::Function* query_func,
1107  llvm::Function* multifrag_query_func,
1108  std::unordered_set<llvm::Function*>& live_funcs,
1109  const bool no_inline,
1110  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
1111  const bool is_gpu_smem_used,
1112  const CompilationOptions& co) {
1113 #ifdef HAVE_CUDA
1114  auto timer = DEBUG_TIMER(__func__);
1115 
1116  CHECK(cuda_mgr);
1117  CodeCacheKey key{serialize_llvm_object(query_func),
1118  serialize_llvm_object(cgen_state_->row_func_)};
1119  if (cgen_state_->filter_func_) {
1120  key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
1121  }
1122  for (const auto helper : cgen_state_->helper_functions_) {
1123  key.push_back(serialize_llvm_object(helper));
1124  }
1125  auto cached_code = Executor::gpu_code_accessor.get_value(key);
1126  if (cached_code) {
1127  return cached_code;
1128  }
1129 
1130  bool row_func_not_inlined = false;
1131  if (no_inline) {
1132  for (auto it = llvm::inst_begin(cgen_state_->row_func_),
1133  e = llvm::inst_end(cgen_state_->row_func_);
1134  it != e;
1135  ++it) {
1136  if (llvm::isa<llvm::CallInst>(*it)) {
1137  auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
1138  if (get_gv_call.getCalledFunction()->getName() == "array_size" ||
1139  get_gv_call.getCalledFunction()->getName() == "linear_probabilistic_count") {
1140  mark_function_never_inline(cgen_state_->row_func_);
1141  row_func_not_inlined = true;
1142  break;
1143  }
1144  }
1145  }
1146  }
1147 
1148  initializeNVPTXBackend();
1149  CodeGenerator::GPUTarget gpu_target{nvptx_target_machine_.get(),
1150  cuda_mgr,
1151  blockSize(),
1152  cgen_state_.get(),
1153  row_func_not_inlined};
1154  std::shared_ptr<GpuCompilationContext> compilation_context;
1155 
1156  try {
1157  compilation_context = CodeGenerator::generateNativeGPUCode(this,
1158  query_func,
1159  multifrag_query_func,
1160  live_funcs,
1161  is_gpu_smem_used,
1162  co,
1163  gpu_target);
1164  } catch (CudaMgr_Namespace::CudaErrorException& cuda_error) {
1165  if (cuda_error.getStatus() == CUDA_ERROR_OUT_OF_MEMORY) {
1166  // Thrown if memory could not be allocated on the GPU.
1167  // Retry once after evicting a portion of the code cache.
1168  LOG(WARNING) << "Failed to allocate GPU memory for generated code. Evicting "
1170  << "% of GPU code cache and re-trying.";
1172  compilation_context = CodeGenerator::generateNativeGPUCode(this,
1173  query_func,
1174  multifrag_query_func,
1175  live_funcs,
1176  is_gpu_smem_used,
1177  co,
1178  gpu_target);
1179  } else {
1180  throw;
1181  }
1182  }
1183  Executor::gpu_code_accessor.put(key, compilation_context);
1184  return std::dynamic_pointer_cast<CompilationContext>(compilation_context);
1185 #else
1186  return nullptr;
1187 #endif
1188 }
1189 
1190 std::string CodeGenerator::generatePTX(const std::string& cuda_llir,
1191  llvm::TargetMachine* nvptx_target_machine,
1192  llvm::LLVMContext& context) {
1193  auto timer = DEBUG_TIMER(__func__);
1194  auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir, "", false);
1195 
1196  llvm::SMDiagnostic parse_error;
1197 
1198  auto llvm_module = llvm::parseIR(mem_buff->getMemBufferRef(), parse_error, context);
1199  if (!llvm_module) {
1200  LOG(IR) << "CodeGenerator::generatePTX:NVVM IR:\n" << cuda_llir << "\nEnd of NVVM IR";
1201  throw_parseIR_error(parse_error, "generatePTX", /* is_gpu= */ true);
1202  }
1203 
1204  llvm::SmallString<256> code_str;
1205  llvm::raw_svector_ostream formatted_os(code_str);
1206  CHECK(nvptx_target_machine);
1207  {
1208  llvm::legacy::PassManager ptxgen_pm;
1209  llvm_module->setDataLayout(nvptx_target_machine->createDataLayout());
1210 
1211 #if LLVM_VERSION_MAJOR >= 10
1212  nvptx_target_machine->addPassesToEmitFile(
1213  ptxgen_pm, formatted_os, nullptr, llvm::CGFT_AssemblyFile);
1214 #else
1215  nvptx_target_machine->addPassesToEmitFile(
1216  ptxgen_pm, formatted_os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
1217 #endif
1218  ptxgen_pm.run(*llvm_module);
1219  }
1220 
1221 #if LLVM_VERSION_MAJOR >= 11
1222  return std::string(code_str);
1223 #else
1224  return code_str.str();
1225 #endif
1226 }
1227 
1228 std::unique_ptr<llvm::TargetMachine> CodeGenerator::initializeNVPTXBackend(
1230  auto timer = DEBUG_TIMER(__func__);
1231  llvm::InitializeAllTargets();
1232  llvm::InitializeAllTargetMCs();
1233  llvm::InitializeAllAsmPrinters();
1234  std::string err;
1235  auto target = llvm::TargetRegistry::lookupTarget("nvptx64", err);
1236  if (!target) {
1237  LOG(FATAL) << err;
1238  }
1239  return std::unique_ptr<llvm::TargetMachine>(
1240  target->createTargetMachine("nvptx64-nvidia-cuda",
1242  "",
1243  llvm::TargetOptions(),
1244  llvm::Reloc::Static));
1245 }
1246 
1247 std::string Executor::generatePTX(const std::string& cuda_llir) const {
1248  return CodeGenerator::generatePTX(
1249  cuda_llir, nvptx_target_machine_.get(), cgen_state_->context_);
1250 }
1251 
1252 void Executor::initializeNVPTXBackend() const {
1253  if (nvptx_target_machine_) {
1254  return;
1255  }
1256  const auto arch = cudaMgr()->getDeviceArch();
1257  nvptx_target_machine_ = CodeGenerator::initializeNVPTXBackend(arch);
1258 }
1259 
1260 // A small number of runtime functions don't get through CgenState::emitCall. List them
1261 // explicitly here and always clone their implementation from the runtime module.
1262 bool CodeGenerator::alwaysCloneRuntimeFunction(const llvm::Function* func) {
1263  return func->getName() == "query_stub_hoisted_literals" ||
1264  func->getName() == "multifrag_query_hoisted_literals" ||
1265  func->getName() == "query_stub" || func->getName() == "multifrag_query" ||
1266  func->getName() == "fixed_width_int_decode" ||
1267  func->getName() == "fixed_width_unsigned_decode" ||
1268  func->getName() == "diff_fixed_width_int_decode" ||
1269  func->getName() == "fixed_width_double_decode" ||
1270  func->getName() == "fixed_width_float_decode" ||
1271  func->getName() == "fixed_width_small_date_decode" ||
1272  func->getName() == "record_error_code" || func->getName() == "get_error_code" ||
1273  func->getName() == "pos_start_impl" || func->getName() == "pos_step_impl" ||
1274  func->getName() == "group_buff_idx_impl" ||
1275  func->getName() == "init_shared_mem" ||
1276  func->getName() == "init_shared_mem_nop" || func->getName() == "write_back_nop";
1277 }
1278 
1279 std::unique_ptr<llvm::Module> read_llvm_module_from_bc_file(
1280  const std::string& bc_filename,
1281  llvm::LLVMContext& context) {
1282  llvm::SMDiagnostic err;
1283 
1284  auto buffer_or_error = llvm::MemoryBuffer::getFile(bc_filename);
1285  CHECK(!buffer_or_error.getError()) << "bc_filename=" << bc_filename;
1286  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1287 
1288  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1289  CHECK(!owner.takeError());
1290  CHECK(owner->get());
1291  return std::move(owner.get());
1292 }
1293 
1294 std::unique_ptr<llvm::Module> read_llvm_module_from_ir_file(
1295  const std::string& udf_ir_filename,
1296  llvm::LLVMContext& ctx,
1297  bool is_gpu = false) {
1298  llvm::SMDiagnostic parse_error;
1299 
1300  llvm::StringRef file_name_arg(udf_ir_filename);
1301 
1302  auto owner = llvm::parseIRFile(file_name_arg, parse_error, ctx);
1303  if (!owner) {
1304  throw_parseIR_error(parse_error, udf_ir_filename, is_gpu);
1305  }
1306 
1307  if (is_gpu) {
1308  llvm::Triple gpu_triple(owner->getTargetTriple());
1309  if (!gpu_triple.isNVPTX()) {
1310  LOG(WARNING)
1311  << "Expected triple nvptx64-nvidia-cuda for NVVM IR of loadtime UDFs but got "
1312  << gpu_triple.str() << ". Disabling the NVVM IR module.";
1313  return std::unique_ptr<llvm::Module>();
1314  }
1315  }
1316  return owner;
1317 }
1318 
1319 std::unique_ptr<llvm::Module> read_llvm_module_from_ir_string(
1320  const std::string& udf_ir_string,
1321  llvm::LLVMContext& ctx,
1322  bool is_gpu = false) {
1323  llvm::SMDiagnostic parse_error;
1324 
1325  auto buf = std::make_unique<llvm::MemoryBufferRef>(udf_ir_string,
1326  "Runtime UDF/UDTF LLVM/NVVM IR");
1327 
1328  auto owner = llvm::parseIR(*buf, parse_error, ctx);
1329  if (!owner) {
1330  LOG(IR) << "read_llvm_module_from_ir_string:\n"
1331  << udf_ir_string << "\nEnd of LLVM/NVVM IR";
1332  throw_parseIR_error(parse_error, "", /* is_gpu= */ is_gpu);
1333  }
1334 
1335  if (is_gpu) {
1336  llvm::Triple gpu_triple(owner->getTargetTriple());
1337  if (!gpu_triple.isNVPTX()) {
1338  LOG(IR) << "read_llvm_module_from_ir_string:\n"
 1339  << udf_ir_string << "\nEnd of NVVM IR";
1340  LOG(WARNING) << "Expected triple nvptx64-nvidia-cuda for NVVM IR but got "
1341  << gpu_triple.str()
1342  << ". Executing runtime UDF/UDTFs on GPU will be disabled.";
1343  return std::unique_ptr<llvm::Module>();
 1344 
1345  }
1346  }
1347  return owner;
1348 }
1349 
1350 namespace {
1351 
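// Rebinds a stream-position placeholder: the call to `pos_fn_name` (e.g. "pos_start")
// inside query_func is replaced with a call to the corresponding runtime implementation
// `pos_fn_name + "_impl"`, forwarding the query's error_code argument when
// use_resume_param is set. See the bind_pos_placeholders() calls in compileWorkUnit()
// below for the three placeholders bound this way.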
1352 void bind_pos_placeholders(const std::string& pos_fn_name,
1353  const bool use_resume_param,
1354  llvm::Function* query_func,
1355  llvm::Module* llvm_module) {
1356  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
1357  ++it) {
1358  if (!llvm::isa<llvm::CallInst>(*it)) {
1359  continue;
1360  }
1361  auto& pos_call = llvm::cast<llvm::CallInst>(*it);
1362  if (std::string(pos_call.getCalledFunction()->getName()) == pos_fn_name) {
1363  if (use_resume_param) {
1364  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1365  llvm::ReplaceInstWithInst(
1366  &pos_call,
1367  llvm::CallInst::Create(llvm_module->getFunction(pos_fn_name + "_impl"),
1368  error_code_arg));
1369  } else {
1370  llvm::ReplaceInstWithInst(
1371  &pos_call,
1372  llvm::CallInst::Create(llvm_module->getFunction(pos_fn_name + "_impl")));
1373  }
1374  break;
1375  }
1376  }
1377 }
1378 
1379 void set_row_func_argnames(llvm::Function* row_func,
1380  const size_t in_col_count,
1381  const size_t agg_col_count,
1382  const bool hoist_literals) {
1383  auto arg_it = row_func->arg_begin();
1384 
1385  if (agg_col_count) {
1386  for (size_t i = 0; i < agg_col_count; ++i) {
1387  arg_it->setName("out");
1388  ++arg_it;
1389  }
1390  } else {
1391  arg_it->setName("group_by_buff");
1392  ++arg_it;
1393  arg_it->setName("varlen_output_buff");
1394  ++arg_it;
1395  arg_it->setName("crt_matched");
1396  ++arg_it;
1397  arg_it->setName("total_matched");
1398  ++arg_it;
1399  arg_it->setName("old_total_matched");
1400  ++arg_it;
1401  arg_it->setName("max_matched");
1402  ++arg_it;
1403  }
1404 
1405  arg_it->setName("agg_init_val");
1406  ++arg_it;
1407 
1408  arg_it->setName("pos");
1409  ++arg_it;
1410 
1411  arg_it->setName("frag_row_off");
1412  ++arg_it;
1413 
1414  arg_it->setName("num_rows_per_scan");
1415  ++arg_it;
1416 
1417  if (hoist_literals) {
1418  arg_it->setName("literals");
1419  ++arg_it;
1420  }
1421 
1422  for (size_t i = 0; i < in_col_count; ++i) {
1423  arg_it->setName("col_buf" + std::to_string(i));
1424  ++arg_it;
1425  }
1426 
1427  arg_it->setName("join_hash_tables");
1428 }
1429 
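// For illustration, a group-by query with two input columns and hoisted literals ends
// up with a row function whose arguments are named by set_row_func_argnames() above and
// typed as in create_row_function() below, roughly:
//   i32 @row_func(i64* group_by_buff, i64* varlen_output_buff, i32* crt_matched,
//                 i32* total_matched, i32* old_total_matched, i32* max_matched,
//                 i64* agg_init_val, i64 pos, i64* frag_row_off, i64* num_rows_per_scan,
//                 i8* literals, i8* col_buf0, i8* col_buf1, i64* join_hash_tables)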
1430 llvm::Function* create_row_function(const size_t in_col_count,
1431  const size_t agg_col_count,
1432  const bool hoist_literals,
1433  llvm::Module* llvm_module,
1434  llvm::LLVMContext& context) {
1435  std::vector<llvm::Type*> row_process_arg_types;
1436 
1437  if (agg_col_count) {
1438  // output (aggregate) arguments
1439  for (size_t i = 0; i < agg_col_count; ++i) {
1440  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1441  }
1442  } else {
1443  // group by buffer
1444  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1445  // varlen output buffer
1446  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1447  // current match count
1448  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1449  // total match count passed from the caller
1450  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1451  // old total match count returned to the caller
1452  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1453  // max matched (total number of slots in the output buffer)
1454  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1455  }
1456 
1457  // aggregate init values
1458  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1459 
1460  // position argument
1461  row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));
1462 
1463  // fragment row offset argument
1464  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1465 
1466  // number of rows for each scan
1467  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1469  // literals buffer argument
1470  if (hoist_literals) {
1471  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
1472  }
1474  // column buffer arguments
1475  for (size_t i = 0; i < in_col_count; ++i) {
1476  row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
1477  }
1478 
1479  // join hash table argument
1480  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1481 
1482  // generate the function
1483  auto ft =
1484  llvm::FunctionType::get(get_int_type(32, context), row_process_arg_types, false);
1485 
1486  auto row_func = llvm::Function::Create(
1487  ft, llvm::Function::ExternalLinkage, "row_func", llvm_module);
1488 
1489  // set the row function argument names; for debugging purposes only
1490  set_row_func_argnames(row_func, in_col_count, agg_col_count, hoist_literals);
1491 
1492  return row_func;
1493 }
1494 
1495 // Iterate through multifrag_query_func, replacing calls to query_fname with query_func.
1496 void bind_query(llvm::Function* query_func,
1497  const std::string& query_fname,
1498  llvm::Function* multifrag_query_func,
1499  llvm::Module* llvm_module) {
1500  std::vector<llvm::CallInst*> query_stubs;
1501  for (auto it = llvm::inst_begin(multifrag_query_func),
1502  e = llvm::inst_end(multifrag_query_func);
1503  it != e;
1504  ++it) {
1505  if (!llvm::isa<llvm::CallInst>(*it)) {
1506  continue;
1507  }
1508  auto& query_call = llvm::cast<llvm::CallInst>(*it);
1509  if (std::string(query_call.getCalledFunction()->getName()) == query_fname) {
1510  query_stubs.push_back(&query_call);
1511  }
1512  }
1513  for (auto& S : query_stubs) {
1514  std::vector<llvm::Value*> args;
1515  for (size_t i = 0; i < S->getNumArgOperands(); ++i) {
1516  args.push_back(S->getArgOperand(i));
1517  }
1518  llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args, ""));
1519  }
1520 }
1521 
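// Maps each target expression to the runtime aggregate function name(s) it requires.
// For example, `COUNT(*), AVG(x)` with an integer column x yields
// {"agg_count", "agg_sum", "agg_count"}, since AVG is decomposed into a running sum
// and a running count.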
1522 std::vector<std::string> get_agg_fnames(const std::vector<Analyzer::Expr*>& target_exprs,
1523  const bool is_group_by) {
1524  std::vector<std::string> result;
1525  for (size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
1526  ++target_idx, ++agg_col_idx) {
1527  const auto target_expr = target_exprs[target_idx];
1528  CHECK(target_expr);
1529  const auto target_type_info = target_expr->get_type_info();
1530  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
1531  const bool is_varlen =
1532  (target_type_info.is_string() &&
1533  target_type_info.get_compression() == kENCODING_NONE) ||
1534  target_type_info.is_array(); // TODO: should it use is_varlen_array() ?
1535  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
1536  result.emplace_back(target_type_info.is_fp() ? "agg_id_double" : "agg_id");
1537  if (is_varlen) {
1538  result.emplace_back("agg_id");
1539  }
1540  if (target_type_info.is_geometry()) {
1541  result.emplace_back("agg_id");
1542  for (auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
1543  result.emplace_back("agg_id");
1544  }
1545  }
1546  continue;
1547  }
1548  const auto agg_type = agg_expr->get_aggtype();
1549  const auto& agg_type_info =
1550  agg_type != kCOUNT ? agg_expr->get_arg()->get_type_info() : target_type_info;
1551  switch (agg_type) {
1552  case kAVG: {
1553  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1554  !agg_type_info.is_fp()) {
 1555  throw std::runtime_error("AVG is only valid on integer, decimal, and floating point types");
1556  }
1557  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1558  ? "agg_sum"
1559  : "agg_sum_double");
1560  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1561  ? "agg_count"
1562  : "agg_count_double");
1563  break;
1564  }
1565  case kMIN: {
1566  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1567  agg_type_info.is_geometry()) {
1568  throw std::runtime_error(
1569  "MIN on strings, arrays or geospatial types not supported yet");
1570  }
1571  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1572  ? "agg_min"
1573  : "agg_min_double");
1574  break;
1575  }
1576  case kMAX: {
1577  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1578  agg_type_info.is_geometry()) {
1579  throw std::runtime_error(
1580  "MAX on strings, arrays or geospatial types not supported yet");
1581  }
1582  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1583  ? "agg_max"
1584  : "agg_max_double");
1585  break;
1586  }
1587  case kSUM: {
1588  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1589  !agg_type_info.is_fp()) {
 1590  throw std::runtime_error("SUM is only valid on integer, decimal, and floating point types");
1591  }
1592  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1593  ? "agg_sum"
1594  : "agg_sum_double");
1595  break;
1596  }
1597  case kCOUNT:
1598  result.emplace_back(agg_expr->get_is_distinct() ? "agg_count_distinct"
1599  : "agg_count");
1600  break;
1601  case kSINGLE_VALUE: {
1602  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1603  break;
1604  }
1605  case kSAMPLE: {
1606  // Note that varlen SAMPLE arguments are handled separately above
1607  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1608  break;
1609  }
 1610  case kAPPROX_COUNT_DISTINCT:
 1611  result.emplace_back("agg_approximate_count_distinct");
1612  break;
1613  case kAPPROX_QUANTILE:
1614  result.emplace_back("agg_approx_quantile");
1615  break;
1616  default:
1617  CHECK(false);
1618  }
1619  }
1620  return result;
1621 }
1622 
1623 } // namespace
1624 
1625 void Executor::addUdfIrToModule(const std::string& udf_ir_filename,
1626  const bool is_cuda_ir) {
1630  udf_ir_filename;
1631 }
1632 
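// Collects the set of live functions: the given roots and leaves, a couple of nop
// helpers, and any function called directly from a root. Every other defined function
// in the module is demoted to internal linkage so later optimization passes can drop it.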
1633 std::unordered_set<llvm::Function*> CodeGenerator::markDeadRuntimeFuncs(
1634  llvm::Module& llvm_module,
1635  const std::vector<llvm::Function*>& roots,
1636  const std::vector<llvm::Function*>& leaves) {
1637  auto timer = DEBUG_TIMER(__func__);
1638  std::unordered_set<llvm::Function*> live_funcs;
1639  live_funcs.insert(roots.begin(), roots.end());
1640  live_funcs.insert(leaves.begin(), leaves.end());
1641 
1642  if (auto F = llvm_module.getFunction("init_shared_mem_nop")) {
1643  live_funcs.insert(F);
1644  }
1645  if (auto F = llvm_module.getFunction("write_back_nop")) {
1646  live_funcs.insert(F);
1647  }
1648 
1649  for (const llvm::Function* F : roots) {
1650  for (const llvm::BasicBlock& BB : *F) {
1651  for (const llvm::Instruction& I : BB) {
1652  if (const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
1653  live_funcs.insert(CI->getCalledFunction());
1654  }
1655  }
1656  }
1657  }
1658 
1659  for (llvm::Function& F : llvm_module) {
1660  if (!live_funcs.count(&F) && !F.isDeclaration()) {
1661  F.setLinkage(llvm::GlobalValue::InternalLinkage);
1662  }
1663  }
1664 
1665  return live_funcs;
1666 }
1667 
1668 namespace {
 1669 // Searches for a particular variable within a specific basic block (or within all
 1670 // blocks if bb_name is empty).
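// Typical usage (see createErrorCheckControlFlow below): locate the "row_count" load in
// the ".entry" block of the generated query function, e.g.
//   find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");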
1671 template <typename InstType>
1672 llvm::Value* find_variable_in_basic_block(llvm::Function* func,
1673  std::string bb_name,
1674  std::string variable_name) {
1675  llvm::Value* result = nullptr;
1676  if (func == nullptr || variable_name.empty()) {
1677  return result;
1678  }
1679  bool is_found = false;
1680  for (auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
1681  if (!bb_name.empty() && bb_it->getName() != bb_name) {
1682  continue;
1683  }
1684  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
1685  if (llvm::isa<InstType>(*inst_it)) {
1686  if (inst_it->getName() == variable_name) {
1687  result = &*inst_it;
1688  is_found = true;
1689  break;
1690  }
1691  }
1692  }
1693  }
1694  return result;
1695 }
 1696 } // namespace
1697 
 1698 void Executor::createErrorCheckControlFlow(
 1699  llvm::Function* query_func,
1700  bool run_with_dynamic_watchdog,
1701  bool run_with_allowing_runtime_interrupt,
1702  const std::vector<JoinLoop>& join_loops,
1703  ExecutorDeviceType device_type,
1704  const std::vector<InputTableInfo>& input_table_infos) {
1705  AUTOMATIC_IR_METADATA(cgen_state_.get());
1706 
1707  // check whether the row processing was successful; currently, it can
1708  // fail by running out of group by buffer slots
1709 
1710  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
 1711  // when both the dynamic watchdog and the runtime interrupt are enabled,
 1712  // we use the dynamic watchdog
1713  run_with_allowing_runtime_interrupt = false;
1714  }
1715 
1716  {
1717  // disable injecting query interrupt checker if the session info is invalid
1718  mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
1719  if (current_query_session_.empty()) {
1720  run_with_allowing_runtime_interrupt = false;
1721  }
1722  }
1723 
1724  llvm::Value* row_count = nullptr;
1725  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1726  device_type == ExecutorDeviceType::GPU) {
1727  row_count =
1728  find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
1729  }
1730 
1731  bool done_splitting = false;
1732  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1733  ++bb_it) {
1734  llvm::Value* pos = nullptr;
1735  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1736  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1737  llvm::isa<llvm::PHINode>(*inst_it)) {
1738  if (inst_it->getName() == "pos") {
1739  pos = &*inst_it;
1740  }
1741  continue;
1742  }
1743  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1744  continue;
1745  }
1746  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
1747  if (std::string(row_func_call.getCalledFunction()->getName()) == "row_process") {
1748  auto next_inst_it = inst_it;
1749  ++next_inst_it;
1750  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1751  auto& br_instr = bb_it->back();
1752  llvm::IRBuilder<> ir_builder(&br_instr);
1753  llvm::Value* err_lv = &*inst_it;
1754  llvm::Value* err_lv_returned_from_row_func = nullptr;
1755  if (run_with_dynamic_watchdog) {
1756  CHECK(pos);
1757  llvm::Value* call_watchdog_lv = nullptr;
1758  if (device_type == ExecutorDeviceType::GPU) {
 1759  // In order to make sure all threads within a block see the same barrier,
 1760  // only blocks in which no thread has crossed the critical edge go through
 1761  // the dynamic watchdog computation
1762  CHECK(row_count);
1763  auto crit_edge_rem =
1764  (blockSize() & (blockSize() - 1))
1765  ? ir_builder.CreateSRem(
1766  row_count,
1767  cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1768  : ir_builder.CreateAnd(
1769  row_count,
1770  cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1771  auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1772  crit_edge_threshold->setName("crit_edge_threshold");
1773 
 1774  // only those threads where pos < crit_edge_threshold go through the dynamic
 1775  // watchdog call
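 // Example (assuming blockSize() == 256 and row_count == 1000): crit_edge_rem is
 // 1000 & 255 == 232, so crit_edge_threshold == 768 and only positions 0..767 --
 // i.e. the three fully populated blocks -- run the watchdog check.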
1776  call_watchdog_lv =
1777  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
1778  } else {
1779  // CPU path: run watchdog for every 64th row
1780  auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1781  call_watchdog_lv = ir_builder.CreateICmp(
1782  llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1783  }
1784  CHECK(call_watchdog_lv);
1785  auto error_check_bb = bb_it->splitBasicBlock(
1786  llvm::BasicBlock::iterator(br_instr), ".error_check");
1787  auto& watchdog_br_instr = bb_it->back();
1788 
1789  auto watchdog_check_bb = llvm::BasicBlock::Create(
1790  cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
1791  llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1792  auto detected_timeout = watchdog_ir_builder.CreateCall(
1793  cgen_state_->module_->getFunction("dynamic_watchdog"), {});
1794  auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1795  detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
1796  watchdog_ir_builder.CreateBr(error_check_bb);
1797 
1798  llvm::ReplaceInstWithInst(
1799  &watchdog_br_instr,
1800  llvm::BranchInst::Create(
1801  watchdog_check_bb, error_check_bb, call_watchdog_lv));
1802  ir_builder.SetInsertPoint(&br_instr);
1803  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1804 
1805  unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1806  unified_err_lv->addIncoming(err_lv, &*bb_it);
1807  err_lv = unified_err_lv;
1808  } else if (run_with_allowing_runtime_interrupt) {
1809  CHECK(pos);
1810  llvm::Value* call_check_interrupt_lv{nullptr};
1811  llvm::Value* interrupt_err_lv{nullptr};
1812  llvm::BasicBlock* error_check_bb{nullptr};
1813  llvm::BasicBlock* interrupt_check_bb{nullptr};
1814  llvm::Instruction* check_interrupt_br_instr{nullptr};
1815 
1816  auto has_loop_join = std::any_of(
1817  join_loops.begin(), join_loops.end(), [](const JoinLoop& join_loop) {
1818  return join_loop.isNestedLoopJoin();
1819  });
1820  auto codegen_interrupt_checker = [&]() {
1821  error_check_bb = bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr),
1822  ".error_check");
1823  check_interrupt_br_instr = &bb_it->back();
1824 
1825  interrupt_check_bb = llvm::BasicBlock::Create(
1826  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
1827  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
1828  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
1829  cgen_state_->module_->getFunction("check_interrupt"), {});
1830  interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
1831  detected_interrupt,
1832  cgen_state_->llInt(Executor::ERR_INTERRUPTED),
1833  err_lv);
1834  interrupt_checker_ir_builder.CreateBr(error_check_bb);
1835  };
1836  if (has_loop_join) {
1837  codegen_interrupt_checker();
1838  CHECK(interrupt_check_bb);
1839  CHECK(check_interrupt_br_instr);
1840  llvm::ReplaceInstWithInst(check_interrupt_br_instr,
1841  llvm::BranchInst::Create(interrupt_check_bb));
1842  ir_builder.SetInsertPoint(&br_instr);
1843  err_lv = interrupt_err_lv;
1844  } else {
1845  if (device_type == ExecutorDeviceType::GPU) {
 1846  // approximate how many times the %pos variable
 1847  // is incremented, i.e., the number of iterations;
 1848  // we derive the bit-shift amount from the grid/block/fragment sizes
 1849  // because a fixed stride (e.g., every 64th increment) can prevent some
 1850  // CUDA threads from ever reaching the interrupt-checking block when the
 1851  // fragment is small -- a thread may never process 64 rows if the outer
 1852  // table is not sufficiently large, and so could never be interrupted
1853  int32_t num_shift_by_gridDim = shared::getExpOfTwo(gridSize());
1854  int32_t num_shift_by_blockDim = shared::getExpOfTwo(blockSize());
1855  int64_t total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
1856  uint64_t interrupt_checking_freq = 32;
1857  auto freq_control_knob = g_running_query_interrupt_freq;
1858  CHECK_GT(freq_control_knob, 0);
1859  CHECK_LE(freq_control_knob, 1.0);
1860  if (!input_table_infos.empty()) {
1861  const auto& outer_table_info = *input_table_infos.begin();
1862  auto num_outer_table_tuples =
1863  outer_table_info.info.getFragmentNumTuplesUpperBound();
1864  if (num_outer_table_tuples > 0) {
 1865  // gridSize * blockSize --> pos_step (index of the next row per thread);
 1866  // we additionally multiply pos_step by two since the number of
 1867  // dispatched blocks is double the gridSize
 1868  // # tuples (of the fragment) / pos_step --> maximum # increments (K)
 1869  // we also scale K by (1 - freq_control_knob) to control the frequency:
 1870  // to check the interrupt status more frequently, make K
 1871  // smaller
1872  auto max_inc = uint64_t(
1873  floor(num_outer_table_tuples / (gridSize() * blockSize() * 2)));
1874  if (max_inc < 2) {
 1875  // `max_inc` is too small, so this correction is necessary to keep
 1876  // `interrupt_checking_freq` valid (i.e., larger than zero)
1877  max_inc = 2;
1878  }
1879  auto calibrated_inc =
1880  uint64_t(floor(max_inc * (1 - freq_control_knob)));
1881  interrupt_checking_freq =
1882  uint64_t(pow(2, shared::getExpOfTwo(calibrated_inc)));
 1883  // handle the case where interrupt_checking_freq > K (max_inc):
 1884  // some threads would then never branch to the interrupt checker,
 1885  // so fall back to a smaller frequency close to max_inc
1886  if (interrupt_checking_freq > max_inc) {
1887  interrupt_checking_freq = max_inc / 2;
1888  }
1889  if (interrupt_checking_freq < 8) {
 1890  // such a small freq would check the interrupt status too often,
 1891  // so clamp it to a reasonable minimum value
1892  interrupt_checking_freq = 8;
1893  }
1894  }
1895  }
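 // Worked example (assuming shared::getExpOfTwo(x) returns floor(log2(x)),
 // gridSize() == 64, blockSize() == 512, ~10M tuples in the outer fragment and
 // g_running_query_interrupt_freq == 0.1): pos_step covers 64 * 512 * 2 = 65536
 // rows per iteration, so max_inc = 152, calibrated_inc = 136 and
 // interrupt_checking_freq becomes 2^7 = 128, which passes both clamps above.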
1896  VLOG(1) << "Set the running query interrupt checking frequency: "
1897  << interrupt_checking_freq;
1898  // check the interrupt flag for every interrupt_checking_freq-th iteration
1899  llvm::Value* pos_shifted_per_iteration =
1900  ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
1901  auto interrupt_predicate = ir_builder.CreateAnd(pos_shifted_per_iteration,
1902  interrupt_checking_freq);
1903  call_check_interrupt_lv =
1904  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1905  interrupt_predicate,
1906  cgen_state_->llInt(int64_t(0LL)));
1907  } else {
1908  // CPU path: run interrupt checker for every 64th row
1909  auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1910  call_check_interrupt_lv =
1911  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1912  interrupt_predicate,
1913  cgen_state_->llInt(int64_t(0LL)));
1914  }
1915  codegen_interrupt_checker();
1916  CHECK(call_check_interrupt_lv);
1917  CHECK(interrupt_err_lv);
1918  CHECK(interrupt_check_bb);
1919  CHECK(error_check_bb);
1920  CHECK(check_interrupt_br_instr);
1921  llvm::ReplaceInstWithInst(
1922  check_interrupt_br_instr,
1923  llvm::BranchInst::Create(
1924  interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
1925  ir_builder.SetInsertPoint(&br_instr);
1926  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1927 
1928  unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
1929  unified_err_lv->addIncoming(err_lv, &*bb_it);
1930  err_lv = unified_err_lv;
1931  }
1932  }
1933  if (!err_lv_returned_from_row_func) {
1934  err_lv_returned_from_row_func = err_lv;
1935  }
1936  if (device_type == ExecutorDeviceType::GPU && g_enable_dynamic_watchdog) {
1937  // let kernel execution finish as expected, regardless of the observed error,
1938  // unless it is from the dynamic watchdog where all threads within that block
1939  // return together.
1940  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1941  err_lv,
1942  cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
1943  } else {
1944  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
1945  err_lv,
1946  cgen_state_->llInt(static_cast<int32_t>(0)));
1947  }
1948  auto error_bb = llvm::BasicBlock::Create(
1949  cgen_state_->context_, ".error_exit", query_func, new_bb);
1950  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1951  llvm::CallInst::Create(
1952  cgen_state_->module_->getFunction("record_error_code"),
1953  std::vector<llvm::Value*>{err_lv_returned_from_row_func, error_code_arg},
1954  "",
1955  error_bb);
1956  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
1957  llvm::ReplaceInstWithInst(&br_instr,
1958  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
1959  done_splitting = true;
1960  break;
1961  }
1962  }
1963  }
1964  CHECK(done_splitting);
1965 }
1966 
 1967 void Executor::AutoTrackBuffersInRuntimeIR() {
 1968  llvm::Module* M = cgen_state_->module_;
1969  if (M->getFunction("allocate_varlen_buffer") == nullptr)
1970  return;
1971 
1972  // read metadata
1973  bool should_track = false;
1974  auto* flag = M->getModuleFlag("manage_memory_buffer");
1975  if (auto* cnt = llvm::mdconst::extract_or_null<llvm::ConstantInt>(flag)) {
1976  if (cnt->getZExtValue() == 1) {
1977  should_track = true;
1978  }
1979  }
1980 
1981  if (!should_track) {
1982  // metadata is not present
1983  return;
1984  }
1985 
1986  LOG(INFO) << "Found 'manage_memory_buffer' metadata.";
1987  llvm::SmallVector<llvm::CallInst*, 4> calls_to_analyze;
1988 
1989  for (llvm::Function& F : *M) {
1990  for (llvm::BasicBlock& BB : F) {
1991  for (llvm::Instruction& I : BB) {
1992  if (llvm::CallInst* CI = llvm::dyn_cast<llvm::CallInst>(&I)) {
1993  // Keep track of calls to "allocate_varlen_buffer" for later processing
1994  llvm::Function* called = CI->getCalledFunction();
1995  if (called) {
1996  if (called->getName() == "allocate_varlen_buffer") {
1997  calls_to_analyze.push_back(CI);
1998  }
1999  }
2000  }
2001  }
2002  }
2003  }
2004 
2005  // for each call to "allocate_varlen_buffer", check if there's a corresponding
2006  // call to "register_buffer_with_executor_rsm". If not, add a call to it
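 // Conceptually, an unregistered allocation such as
 //   %buf = call i8* @allocate_varlen_buffer(...)
 // gains a follow-up call
 //   call void @register_buffer_with_executor_rsm(i64 <executor address>, i8* %buf)
 // right after it; the IR value names here are illustrative only.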
2007  llvm::IRBuilder<> Builder(cgen_state_->context_);
2008  auto i64 = get_int_type(64, cgen_state_->context_);
2009  auto i8p = get_int_ptr_type(8, cgen_state_->context_);
2010  auto void_ = llvm::Type::getVoidTy(cgen_state_->context_);
2011  llvm::FunctionType* fnty = llvm::FunctionType::get(void_, {i64, i8p}, false);
2012  llvm::FunctionCallee register_buffer_fn =
2013  M->getOrInsertFunction("register_buffer_with_executor_rsm", fnty, {});
2014 
2015  int64_t executor_addr = reinterpret_cast<int64_t>(this);
2016  for (llvm::CallInst* CI : calls_to_analyze) {
2017  bool found = false;
 2018  // for each user of the allocation, check whether it's a call instruction
 2019  // calling "register_buffer_with_executor_rsm";
 2020  // if no such instruction exists, add one registering the buffer
2021  for (llvm::User* U : CI->users()) {
2022  if (llvm::CallInst* call = llvm::dyn_cast<llvm::CallInst>(U)) {
2023  if (call->getCalledFunction() and
2024  call->getCalledFunction()->getName() == "register_buffer_with_executor_rsm") {
2025  found = true;
2026  break;
2027  }
2028  }
2029  }
2030  if (!found) {
2031  Builder.SetInsertPoint(CI->getNextNode());
2032  Builder.CreateCall(register_buffer_fn,
2033  {ll_int(executor_addr, cgen_state_->context_), CI});
2034  }
2035  }
2036 }
2037 
2038 std::vector<llvm::Value*> Executor::inlineHoistedLiterals() {
2039  AUTOMATIC_IR_METADATA(cgen_state_.get());
2040 
2041  std::vector<llvm::Value*> hoisted_literals;
2042 
 2043  // row_func_ uses literals whose definitions have been hoisted up to the query_func_;
 2044  // extend the row_func_ signature with extra arguments to pass these literal values.
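 // For example, if the query function hoisted one i64 and one double literal, the new
 // signature becomes roughly
 //   i32 @row_func_hoisted_literals(<original row_func args>..., i64 %arg_lit0,
 //                                  double %arg_lit1)
 // (%arg_lit0/%arg_lit1 are illustrative; the trailing names come from "arg_" plus the
 // literal load's name), and each "__placeholder__literal_*" instruction is later
 // rewired to its matching argument.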
2045  std::vector<llvm::Type*> row_process_arg_types;
2046 
2047  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
2048  E = cgen_state_->row_func_->arg_end();
2049  I != E;
2050  ++I) {
2051  row_process_arg_types.push_back(I->getType());
2052  }
2053 
2054  for (auto& element : cgen_state_->query_func_literal_loads_) {
2055  for (auto value : element.second) {
2056  row_process_arg_types.push_back(value->getType());
2057  }
2058  }
2059 
2060  auto ft = llvm::FunctionType::get(
2061  get_int_type(32, cgen_state_->context_), row_process_arg_types, false);
2062  auto row_func_with_hoisted_literals =
2063  llvm::Function::Create(ft,
2064  llvm::Function::ExternalLinkage,
2065  "row_func_hoisted_literals",
2066  cgen_state_->row_func_->getParent());
2067 
2068  auto row_func_arg_it = row_func_with_hoisted_literals->arg_begin();
2069  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
2070  E = cgen_state_->row_func_->arg_end();
2071  I != E;
2072  ++I) {
2073  if (I->hasName()) {
2074  row_func_arg_it->setName(I->getName());
2075  }
2076  ++row_func_arg_it;
2077  }
2078 
2079  decltype(row_func_with_hoisted_literals) filter_func_with_hoisted_literals{nullptr};
2080  decltype(row_func_arg_it) filter_func_arg_it{nullptr};
2081  if (cgen_state_->filter_func_) {
 2082  // filter_func_ uses literals whose definitions have been hoisted up to the row_func_;
 2083  // extend the filter_func_ signature with extra arguments to pass these literal values.
2084  std::vector<llvm::Type*> filter_func_arg_types;
2085 
2086  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2087  E = cgen_state_->filter_func_->arg_end();
2088  I != E;
2089  ++I) {
2090  filter_func_arg_types.push_back(I->getType());
2091  }
2092 
2093  for (auto& element : cgen_state_->query_func_literal_loads_) {
2094  for (auto value : element.second) {
2095  filter_func_arg_types.push_back(value->getType());
2096  }
2097  }
2098 
2099  auto ft2 = llvm::FunctionType::get(
2100  get_int_type(32, cgen_state_->context_), filter_func_arg_types, false);
2101  filter_func_with_hoisted_literals =
2102  llvm::Function::Create(ft2,
2103  llvm::Function::ExternalLinkage,
2104  "filter_func_hoisted_literals",
2105  cgen_state_->filter_func_->getParent());
2106 
2107  filter_func_arg_it = filter_func_with_hoisted_literals->arg_begin();
2108  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2109  E = cgen_state_->filter_func_->arg_end();
2110  I != E;
2111  ++I) {
2112  if (I->hasName()) {
2113  filter_func_arg_it->setName(I->getName());
2114  }
2115  ++filter_func_arg_it;
2116  }
2117  }
2118 
2119  std::unordered_map<int, std::vector<llvm::Value*>>
2120  query_func_literal_loads_function_arguments,
2121  query_func_literal_loads_function_arguments2;
2122 
2123  for (auto& element : cgen_state_->query_func_literal_loads_) {
2124  std::vector<llvm::Value*> argument_values, argument_values2;
2125 
2126  for (auto value : element.second) {
2127  hoisted_literals.push_back(value);
2128  argument_values.push_back(&*row_func_arg_it);
2129  if (cgen_state_->filter_func_) {
2130  argument_values2.push_back(&*filter_func_arg_it);
2131  cgen_state_->filter_func_args_[&*row_func_arg_it] = &*filter_func_arg_it;
2132  }
2133  if (value->hasName()) {
2134  row_func_arg_it->setName("arg_" + value->getName());
2135  if (cgen_state_->filter_func_) {
2136  filter_func_arg_it->getContext();
2137  filter_func_arg_it->setName("arg_" + value->getName());
2138  }
2139  }
2140  ++row_func_arg_it;
2141  ++filter_func_arg_it;
2142  }
2143 
2144  query_func_literal_loads_function_arguments[element.first] = argument_values;
2145  query_func_literal_loads_function_arguments2[element.first] = argument_values2;
2146  }
2147 
2148  // copy the row_func function body over
2149  // see
2150  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
2151  row_func_with_hoisted_literals->getBasicBlockList().splice(
2152  row_func_with_hoisted_literals->begin(),
2153  cgen_state_->row_func_->getBasicBlockList());
2154 
2155  // also replace row_func arguments with the arguments from row_func_hoisted_literals
2156  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
2157  E = cgen_state_->row_func_->arg_end(),
2158  I2 = row_func_with_hoisted_literals->arg_begin();
2159  I != E;
2160  ++I) {
2161  I->replaceAllUsesWith(&*I2);
2162  I2->takeName(&*I);
2163  cgen_state_->filter_func_args_.replace(&*I, &*I2);
2164  ++I2;
2165  }
2166 
2167  cgen_state_->row_func_ = row_func_with_hoisted_literals;
2168 
2169  // and finally replace literal placeholders
2170  std::vector<llvm::Instruction*> placeholders;
2171  std::string prefix("__placeholder__literal_");
2172  for (auto it = llvm::inst_begin(row_func_with_hoisted_literals),
2173  e = llvm::inst_end(row_func_with_hoisted_literals);
2174  it != e;
2175  ++it) {
2176  if (it->hasName() && it->getName().startswith(prefix)) {
2177  auto offset_and_index_entry =
2178  cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
2179  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2180 
2181  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2182  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2183 
2184  it->replaceAllUsesWith(
2185  query_func_literal_loads_function_arguments[lit_off][lit_idx]);
2186  placeholders.push_back(&*it);
2187  }
2188  }
2189  for (auto placeholder : placeholders) {
2190  placeholder->removeFromParent();
2191  }
2192 
2193  if (cgen_state_->filter_func_) {
2194  // copy the filter_func function body over
2195  // see
2196  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
2197  filter_func_with_hoisted_literals->getBasicBlockList().splice(
2198  filter_func_with_hoisted_literals->begin(),
2199  cgen_state_->filter_func_->getBasicBlockList());
2200 
2201  // also replace filter_func arguments with the arguments from
2202  // filter_func_hoisted_literals
2203  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2204  E = cgen_state_->filter_func_->arg_end(),
2205  I2 = filter_func_with_hoisted_literals->arg_begin();
2206  I != E;
2207  ++I) {
2208  I->replaceAllUsesWith(&*I2);
2209  I2->takeName(&*I);
2210  ++I2;
2211  }
2212 
2213  cgen_state_->filter_func_ = filter_func_with_hoisted_literals;
2214 
2215  // and finally replace literal placeholders
2216  std::vector<llvm::Instruction*> placeholders;
2217  std::string prefix("__placeholder__literal_");
2218  for (auto it = llvm::inst_begin(filter_func_with_hoisted_literals),
2219  e = llvm::inst_end(filter_func_with_hoisted_literals);
2220  it != e;
2221  ++it) {
2222  if (it->hasName() && it->getName().startswith(prefix)) {
2223  auto offset_and_index_entry = cgen_state_->row_func_hoisted_literals_.find(
2224  llvm::dyn_cast<llvm::Value>(&*it));
2225  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2226 
2227  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2228  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2229 
2230  it->replaceAllUsesWith(
2231  query_func_literal_loads_function_arguments2[lit_off][lit_idx]);
2232  placeholders.push_back(&*it);
2233  }
2234  }
2235  for (auto placeholder : placeholders) {
2236  placeholder->removeFromParent();
2237  }
2238  }
2239 
2240  return hoisted_literals;
2241 }
2242 
2243 namespace {
2244 
2245 size_t get_shared_memory_size(const bool shared_mem_used,
2246  const QueryMemoryDescriptor* query_mem_desc_ptr) {
2247  return shared_mem_used
2248  ? (query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount())
2249  : 0;
2250 }
2251 
2252 bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor* query_mem_desc_ptr,
2253  const RelAlgExecutionUnit& ra_exe_unit,
2254  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
2255  const ExecutorDeviceType device_type,
2256  const unsigned gpu_blocksize,
2257  const unsigned num_blocks_per_mp) {
2258  if (device_type == ExecutorDeviceType::CPU) {
2259  return false;
2260  }
 2261  CHECK(query_mem_desc_ptr);
 2262  CHECK(cuda_mgr);
 2263  if (query_mem_desc_ptr->didOutputColumnar()) {
 2264  return false;
 2265  }
2266  /*
 2267  * We only use the shared memory strategy if the GPU hardware provides native
 2268  * shared memory atomics support. From the CUDA Toolkit documentation:
 2269  * https://docs.nvidia.com/cuda/pascal-tuning-guide/index.html#atomic-ops "Like
 2270  * Maxwell, Pascal [and Volta] provides native shared memory atomic operations
 2271  * for 32-bit integer arithmetic, along with native 32 or 64-bit compare-and-swap
 2272  * (CAS)."
 2273  *
 2274  */
2275  if (!cuda_mgr->isArchMaxwellOrLaterForAll()) {
2276  return false;
2277  }
2278 
2279  if (query_mem_desc_ptr->getQueryDescriptionType() ==
2282  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty()) {
2283  // TODO: relax this, if necessary
2284  if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
2285  return false;
2286  }
 2287  // skip shared memory usage when dealing with 1) variable length targets or
 2288  // 2) aggregates other than COUNT
2289  const auto target_infos =
2290  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
2291  std::unordered_set<SQLAgg> supported_aggs{kCOUNT};
2292  if (std::find_if(target_infos.begin(),
2293  target_infos.end(),
2294  [&supported_aggs](const TargetInfo& ti) {
2295  if (ti.sql_type.is_varlen() ||
2296  !supported_aggs.count(ti.agg_kind)) {
2297  return true;
2298  } else {
2299  return false;
2300  }
2301  }) == target_infos.end()) {
2302  return true;
2303  }
2304  }
2305  if (query_mem_desc_ptr->getQueryDescriptionType() ==
2316  if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
2317  return false;
2318  }
2319 
2320  // Fundamentally, we should use shared memory whenever the output buffer
2321  // is small enough so that we can fit it in the shared memory and yet expect
2322  // good occupancy.
2323  // For now, we allow keyless, row-wise layout, and only for perfect hash
2324  // group by operations.
2325  if (query_mem_desc_ptr->hasKeylessHash() &&
2326  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty() &&
2327  !query_mem_desc_ptr->useStreamingTopN()) {
2328  const size_t shared_memory_threshold_bytes = std::min(
2329  g_gpu_smem_threshold == 0 ? SIZE_MAX : g_gpu_smem_threshold,
2330  cuda_mgr->getMinSharedMemoryPerBlockForAllDevices() / num_blocks_per_mp);
2331  const auto output_buffer_size =
2332  query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount();
2333  if (output_buffer_size > shared_memory_threshold_bytes) {
2334  return false;
2335  }
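 // Example (assuming 96KB of shared memory per block, num_blocks_per_mp == 2 and
 // g_gpu_smem_threshold == 0, i.e. no user-imposed limit): the threshold is 48KB,
 // so a 16-byte row size allows shared-memory output buffers of up to 3072 entries.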
2336 
 2337  // skip shared memory usage when dealing with 1) variable length targets or
 2338  // 2) aggregates other than the basic ones (COUNT, SUM, MIN, MAX, AVG)
2339  // TODO: relax this if necessary
2340  const auto target_infos =
2341  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
2342  std::unordered_set<SQLAgg> supported_aggs{kCOUNT};
2344  supported_aggs = {kCOUNT, kMIN, kMAX, kSUM, kAVG};
2345  }
2346  if (std::find_if(target_infos.begin(),
2347  target_infos.end(),
2348  [&supported_aggs](const TargetInfo& ti) {
2349  if (ti.sql_type.is_varlen() ||
2350  !supported_aggs.count(ti.agg_kind)) {
2351  return true;
2352  } else {
2353  return false;
2354  }
2355  }) == target_infos.end()) {
2356  return true;
2357  }
2358  }
2359  }
2360  return false;
2361 }
2362 
2363 #ifndef NDEBUG
2364 std::string serialize_llvm_metadata_footnotes(llvm::Function* query_func,
2365  CgenState* cgen_state) {
2366  std::string llvm_ir;
2367  std::unordered_set<llvm::MDNode*> md;
2368 
2369  // Loop over all instructions in the query function.
2370  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
2371  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2372  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2373  instr_it->getAllMetadata(imd);
2374  for (auto [kind, node] : imd) {
2375  md.insert(node);
2376  }
2377  }
2378  }
2379 
2380  // Loop over all instructions in the row function.
2381  for (auto bb_it = cgen_state->row_func_->begin(); bb_it != cgen_state->row_func_->end();
2382  ++bb_it) {
2383  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2384  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2385  instr_it->getAllMetadata(imd);
2386  for (auto [kind, node] : imd) {
2387  md.insert(node);
2388  }
2389  }
2390  }
2391 
2392  // Loop over all instructions in the filter function.
2393  if (cgen_state->filter_func_) {
2394  for (auto bb_it = cgen_state->filter_func_->begin();
2395  bb_it != cgen_state->filter_func_->end();
2396  ++bb_it) {
2397  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2398  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2399  instr_it->getAllMetadata(imd);
2400  for (auto [kind, node] : imd) {
2401  md.insert(node);
2402  }
2403  }
2404  }
2405  }
2406 
2407  // Sort the metadata by canonical number and convert to text.
2408  if (!md.empty()) {
2409  std::map<size_t, std::string> sorted_strings;
2410  for (auto p : md) {
2411  std::string str;
2412  llvm::raw_string_ostream os(str);
2413  p->print(os, cgen_state->module_, true);
2414  os.flush();
2415  auto fields = split(str, {}, 1);
2416  if (fields.empty() || fields[0].empty()) {
2417  continue;
2418  }
2419  sorted_strings.emplace(std::stoul(fields[0].substr(1)), str);
2420  }
2421  llvm_ir += "\n";
2422  for (auto [id, text] : sorted_strings) {
2423  llvm_ir += text;
2424  llvm_ir += "\n";
2425  }
2426  }
2427 
2428  return llvm_ir;
2429 }
2430 #endif // NDEBUG
2431 
2432 } // namespace
2433 
2434 std::tuple<CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
2435 Executor::compileWorkUnit(const std::vector<InputTableInfo>& query_infos,
2436  const PlanState::DeletedColumnsMap& deleted_cols_map,
2437  const RelAlgExecutionUnit& ra_exe_unit,
2438  const CompilationOptions& co,
2439  const ExecutionOptions& eo,
2440  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
2441  const bool allow_lazy_fetch,
2442  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
2443  const size_t max_groups_buffer_entry_guess,
2444  const int8_t crt_min_byte_width,
2445  const bool has_cardinality_estimation,
2446  ColumnCacheMap& column_cache,
2447  RenderInfo* render_info) {
2448  auto timer = DEBUG_TIMER(__func__);
2449 
 2450  if (co.device_type == ExecutorDeviceType::GPU) {
 2451  const auto cuda_mgr = data_mgr_->getCudaMgr();
2452  if (!cuda_mgr) {
2453  throw QueryMustRunOnCpu();
2454  }
2455  }
2456 
2457 #ifndef NDEBUG
2458  static std::uint64_t counter = 0;
2459  ++counter;
2460  VLOG(1) << "CODEGEN #" << counter << ":";
2461  LOG(IR) << "CODEGEN #" << counter << ":";
2462  LOG(PTX) << "CODEGEN #" << counter << ":";
2463  LOG(ASM) << "CODEGEN #" << counter << ":";
2464 #endif
2465 
 2466  // cgenstate_manager uses the RAII pattern to manage the lifetime of
2467  // CgenState instances.
2468  Executor::CgenStateManager cgenstate_manager(*this,
2469  allow_lazy_fetch,
2470  query_infos,
2471  deleted_cols_map,
2472  &ra_exe_unit); // locks compilation_mutex
2474  addTransientStringLiterals(ra_exe_unit, row_set_mem_owner);
2475 
2476  GroupByAndAggregate group_by_and_aggregate(
2477  this,
2478  co.device_type,
2479  ra_exe_unit,
2480  query_infos,
2481  row_set_mem_owner,
2482  has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
2483  : std::nullopt);
2484  auto query_mem_desc =
2485  group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
2486  max_groups_buffer_entry_guess,
2487  crt_min_byte_width,
2488  render_info,
2490 
2491  if (query_mem_desc->getQueryDescriptionType() ==
2493  !has_cardinality_estimation &&
2494  (!render_info || !render_info->isPotentialInSituRender()) && !eo.just_explain) {
2495  const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
2496  throw CardinalityEstimationRequired(col_range_info.max - col_range_info.min);
2497  }
2498 
2499  const bool output_columnar = query_mem_desc->didOutputColumnar();
2500  const bool gpu_shared_mem_optimization =
 2501  is_gpu_shared_mem_supported(query_mem_desc.get(),
 2502  ra_exe_unit,
2503  cuda_mgr,
2504  co.device_type,
2505  cuda_mgr ? this->blockSize() : 1,
2506  cuda_mgr ? this->numBlocksPerMP() : 1);
2507  if (gpu_shared_mem_optimization) {
2508  // disable interleaved bins optimization on the GPU
2509  query_mem_desc->setHasInterleavedBinsOnGpu(false);
2510  LOG(DEBUG1) << "GPU shared memory is used for the " +
2511  query_mem_desc->queryDescTypeToString() + " query(" +
2512  std::to_string(get_shared_memory_size(gpu_shared_mem_optimization,
2513  query_mem_desc.get())) +
2514  " out of " + std::to_string(g_gpu_smem_threshold) + " bytes).";
2515  }
2516 
2517  const GpuSharedMemoryContext gpu_smem_context(
2518  get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));
2519 
2521  const size_t num_count_distinct_descs =
2522  query_mem_desc->getCountDistinctDescriptorsSize();
2523  for (size_t i = 0; i < num_count_distinct_descs; i++) {
2524  const auto& count_distinct_descriptor =
2525  query_mem_desc->getCountDistinctDescriptor(i);
2526  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::UnorderedSet ||
2527  (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
2528  !co.hoist_literals)) {
2529  throw QueryMustRunOnCpu();
2530  }
2531  }
2532 
 2533  // we currently do not support varlen projection based on baseline group-by when
 2534  // 1) the target table is multi-fragmented and 2) multiple GPUs are involved in
 2535  // query processing; in this case, we punt the query to CPU to avoid a server crash
2536  for (const auto expr : ra_exe_unit.target_exprs) {
2537  if (auto gby_expr = dynamic_cast<Analyzer::AggExpr*>(expr)) {
2538  bool has_multiple_gpus = cuda_mgr ? cuda_mgr->getDeviceCount() > 1 : false;
2539  if (gby_expr->get_aggtype() == SQLAgg::kSAMPLE && has_multiple_gpus &&
2540  !g_leaf_count) {
2541  std::set<const Analyzer::ColumnVar*,
2542  bool (*)(const Analyzer::ColumnVar*, const Analyzer::ColumnVar*)>
2544  gby_expr->collect_column_var(colvar_set, true);
2545  for (const auto cv : colvar_set) {
2546  if (cv->get_type_info().is_varlen()) {
2547  const auto tbl_id = cv->get_table_id();
2548  std::for_each(query_infos.begin(),
2549  query_infos.end(),
2550  [tbl_id](const InputTableInfo& input_table_info) {
2551  if (input_table_info.table_id == tbl_id &&
2552  input_table_info.info.fragments.size() > 1) {
2553  throw QueryMustRunOnCpu();
2554  }
2555  });
2556  }
2557  }
2558  }
2559  }
2560  }
2561  }
2562 
2563  // Read the module template and target either CPU or GPU
2564  // by binding the stream position functions to the right implementation:
2565  // stride access for GPU, contiguous for CPU
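 // Roughly, the GPU implementations stride each thread across the fragment
 // (pos_start ~ blockIdx * blockDim + threadIdx, pos_step ~ gridDim * blockDim),
 // while the CPU implementations scan contiguously (start offset and step 1);
 // the binding happens via the bind_pos_placeholders() calls below.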
2566  CHECK(cgen_state_->module_ == nullptr);
2567  cgen_state_->set_module_shallow_copy(get_rt_module(), /*always_clone=*/true);
2568 
2569  auto is_gpu = co.device_type == ExecutorDeviceType::GPU;
2570  if (is_gpu) {
2571  cgen_state_->module_->setDataLayout(get_gpu_data_layout());
2572  cgen_state_->module_->setTargetTriple(get_gpu_target_triple_string());
2573  }
2574  if (has_udf_module(/*is_gpu=*/is_gpu)) {
2576  get_udf_module(/*is_gpu=*/is_gpu), *cgen_state_->module_, cgen_state_.get());
2577  }
2578  if (has_rt_udf_module(/*is_gpu=*/is_gpu)) {
2580  get_rt_udf_module(/*is_gpu=*/is_gpu), *cgen_state_->module_, cgen_state_.get());
2581  }
2582 
2583  AUTOMATIC_IR_METADATA(cgen_state_.get());
2584 
2585  auto agg_fnames =
2586  get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
2587 
2588  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();
2589 
2590  const bool is_group_by{query_mem_desc->isGroupBy()};
2591  auto [query_func, row_func_call] = is_group_by
2592  ? query_group_by_template(cgen_state_->module_,
2593  co.hoist_literals,
2594  *query_mem_desc,
2595  co.device_type,
2596  ra_exe_unit.scan_limit,
2597  gpu_smem_context)
2598  : query_template(cgen_state_->module_,
2599  agg_slot_count,
2600  co.hoist_literals,
2601  !!ra_exe_unit.estimator,
2602  gpu_smem_context);
2603  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
2604  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
2605  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);
2606 
2607  cgen_state_->query_func_ = query_func;
2608  cgen_state_->row_func_call_ = row_func_call;
2609  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
2610  &query_func->getEntryBlock().front());
2611 
 2612  // Generate the function signature and column head fetches so that
 2613  // double indirection isn't needed in the inner loop
2614  auto& fetch_bb = query_func->front();
2615  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
2616  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
2617  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
2618  query_func->args().begin(),
2619  fetch_ir_builder,
2620  cgen_state_->context_);
2621  CHECK_EQ(ra_exe_unit.input_col_descs.size(), col_heads.size());
2622 
2623  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
2624  is_group_by ? 0 : agg_slot_count,
2625  co.hoist_literals,
2626  cgen_state_->module_,
2627  cgen_state_->context_);
2628  CHECK(cgen_state_->row_func_);
2629  cgen_state_->row_func_bb_ =
2630  llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
2631 
2633  auto filter_func_ft =
2634  llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
2635  cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
2636  llvm::Function::ExternalLinkage,
2637  "filter_func",
2638  cgen_state_->module_);
2639  CHECK(cgen_state_->filter_func_);
2640  cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
2641  cgen_state_->context_, "entry", cgen_state_->filter_func_);
2642  }
2643 
2644  cgen_state_->current_func_ = cgen_state_->row_func_;
2645  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2646 
2647  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
2648  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
2649  const auto join_loops =
2650  buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
2651 
2652  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
2653  for (auto& simple_qual : ra_exe_unit.simple_quals) {
2654  plan_state_->addSimpleQual(simple_qual);
2655  }
2656  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
2657  if (is_not_deleted_bb) {
2658  cgen_state_->row_func_bb_ = is_not_deleted_bb;
2659  }
2660  if (!join_loops.empty()) {
2661  codegenJoinLoops(join_loops,
2662  body_execution_unit,
2663  group_by_and_aggregate,
2664  query_func,
2665  cgen_state_->row_func_bb_,
2666  *(query_mem_desc.get()),
2667  co,
2668  eo);
2669  } else {
2670  const bool can_return_error = compileBody(
2671  ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
2672  if (can_return_error || cgen_state_->needs_error_check_ || eo.with_dynamic_watchdog ||
2674  createErrorCheckControlFlow(query_func,
2677  join_loops,
2678  co.device_type,
2679  group_by_and_aggregate.query_infos_);
2680  }
2681  }
2682  std::vector<llvm::Value*> hoisted_literals;
2683 
2684  if (co.hoist_literals) {
2685  VLOG(1) << "number of hoisted literals: "
2686  << cgen_state_->query_func_literal_loads_.size()
2687  << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
2688  << " bytes";
2689  }
2690 
2691  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
2692  // we have some hoisted literals...
2693  hoisted_literals = inlineHoistedLiterals();
2694  }
2695 
2696  // replace the row func placeholder call with the call to the actual row func
2697  std::vector<llvm::Value*> row_func_args;
2698  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumArgOperands(); ++i) {
2699  row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
2700  }
2701  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
2702  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
2703  // push hoisted literals arguments, if any
2704  row_func_args.insert(
2705  row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
2706  llvm::ReplaceInstWithInst(
2707  cgen_state_->row_func_call_,
2708  llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));
2709 
2710  // replace the filter func placeholder call with the call to the actual filter func
2711  if (cgen_state_->filter_func_) {
2712  std::vector<llvm::Value*> filter_func_args;
2713  for (auto arg_it = cgen_state_->filter_func_args_.begin();
2714  arg_it != cgen_state_->filter_func_args_.end();
2715  ++arg_it) {
2716  filter_func_args.push_back(arg_it->first);
2717  }
2718  llvm::ReplaceInstWithInst(
2719  cgen_state_->filter_func_call_,
2720  llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
2721  }
2722 
2723  // Aggregate
2724  plan_state_->init_agg_vals_ =
2725  init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);
2726 
2727  /*
 2728  * If we have decided to use GPU shared memory (the decision is not made here), then
 2729  * we generate the extra components it needs (buffer initialization and GPU reduction
 2730  * from shared memory to global memory). We then splice these functions into the
 2731  * already generated query_func, replacing the two placeholders write_back_nop and
 2732  * init_smem_nop. The rest of the code stays as before (row_func, etc.).
2733  */
2734  if (gpu_smem_context.isSharedMemoryUsed()) {
2735  if (query_mem_desc->getQueryDescriptionType() ==
2737  GpuSharedMemCodeBuilder gpu_smem_code(
2738  cgen_state_->module_,
2739  cgen_state_->context_,
2740  *query_mem_desc,
2742  plan_state_->init_agg_vals_,
2743  executor_id_);
2744  gpu_smem_code.codegen();
2745  gpu_smem_code.injectFunctionsInto(query_func);
2746 
2747  // helper functions are used for caching purposes later
2748  cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
2749  cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
2750  LOG(IR) << gpu_smem_code.toString();
2751  }
2752  }
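  // In the shared-memory path above, gpu_smem_code.codegen() emits the extra buffer
  // initialization and shared-to-global reduction functions, and injectFunctionsInto()
  // wires them into query_func in place of the write_back_nop / init_smem_nop
  // placeholders mentioned in the comment above.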
2753 
2754  auto multifrag_query_func = cgen_state_->module_->getFunction(
2755  "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
2756  CHECK(multifrag_query_func);
2757 
2759  insertErrorCodeChecker(
2760  multifrag_query_func, co.hoist_literals, eo.allow_runtime_query_interrupt);
2761  }
2762 
2763  bind_query(query_func,
2764  "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
2765  multifrag_query_func,
2766  cgen_state_->module_);
2767 
2768  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
2769  if (cgen_state_->filter_func_) {
2770  root_funcs.push_back(cgen_state_->filter_func_);
2771  }
2772  auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
2773  *cgen_state_->module_, root_funcs, {multifrag_query_func});
2774 
2775  // Always inline the row function and the filter function.
2776  // We don't want register spills in the inner loops.
2777  // LLVM seems to correctly free up alloca instructions
2778  // in these functions even when they are inlined.
2779  mark_function_always_inline(cgen_state_->row_func_);
2780  if (cgen_state_->filter_func_) {
2781  mark_function_always_inline(cgen_state_->filter_func_);
2782  }
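  // Presumably mark_function_always_inline tags these functions with the AlwaysInline
  // attribute so the inliner run inside optimize_ir() folds them into query_func,
  // avoiding a call (and potential register spills) per row.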
2783 
2784 #ifndef NDEBUG
2785  // Add helpful metadata to the LLVM IR for debugging.
2786  AUTOMATIC_IR_METADATA_DONE();
2787 #endif
2788 
2789  // Serialize the important LLVM IR functions to text for SQL EXPLAIN.
2790  std::string llvm_ir;
2791  if (eo.just_explain) {
2792  if (co.explain_type == ExecutorExplainType::Optimized) {
2793 #ifdef WITH_JIT_DEBUG
2794  throw std::runtime_error(
2795  "Explain optimized not available when JIT runtime debug symbols are enabled");
2796 #else
2797  // Note that we don't run the NVVM reflect pass here. Use LOG(IR) to get the
2798  // optimized IR after NVVM reflect
2799  llvm::legacy::PassManager pass_manager;
2800  optimize_ir(query_func,
2801  cgen_state_->module_,
2802  pass_manager,
2803  live_funcs,
2804  gpu_smem_context.isSharedMemoryUsed(),
2805  co);
2806 #endif // WITH_JIT_DEBUG
2807  }
2808  llvm_ir =
2809  serialize_llvm_object(multifrag_query_func) + serialize_llvm_object(query_func) +
2810  serialize_llvm_object(cgen_state_->row_func_) +
2811  (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2812  : "");
2813 
2814 #ifndef NDEBUG
2815  llvm_ir += serialize_llvm_metadata_footnotes(query_func, cgen_state_.get());
2816 #endif
2817  }
2818 
2819  LOG(IR) << "\n\n" << query_mem_desc->toString() << "\n";
2820  LOG(IR) << "IR for the "
2821  << (co.device_type == ExecutorDeviceType::CPU ? "CPU:\n" : "GPU:\n");
2822 #ifdef NDEBUG
2823  LOG(IR) << serialize_llvm_object(query_func)
2824  << serialize_llvm_object(cgen_state_->row_func_)
2825  << (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2826  : "")
2827  << "\nEnd of IR";
2828 #else
2829  LOG(IR) << serialize_llvm_object(cgen_state_->module_) << "\nEnd of IR";
2830 #endif
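  // Release (NDEBUG) builds log only the top-level query/row/filter functions; debug
  // builds dump the entire module.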
2831 
2832  // Insert calls to "register_buffer_with_executor_rsm" for allocations made
2833  // in runtime functions (i.e. from RBC) that do not already register their buffers
2834  AutoTrackBuffersInRuntimeIR();
2835 
2836  // Run some basic validation checks on the LLVM IR before code is generated below.
2837  verify_function_ir(cgen_state_->row_func_);
2838  if (cgen_state_->filter_func_) {
2839  verify_function_ir(cgen_state_->filter_func_);
2840  }
2841 
2842  // Generate final native code from the LLVM IR.
2843  return std::make_tuple(
2844  CompilationResult{
2845  co.device_type == ExecutorDeviceType::CPU
2846  ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
2847  : optimizeAndCodegenGPU(query_func,
2848  multifrag_query_func,
2849  live_funcs,
2850  is_group_by || ra_exe_unit.estimator,
2851  cuda_mgr,
2852  gpu_smem_context.isSharedMemoryUsed(),
2853  co),
2854  cgen_state_->getLiterals(),
2855  output_columnar,
2856  llvm_ir,
2857  std::move(gpu_smem_context)},
2858  std::move(query_mem_desc));
2859 }
2860 
2861 void Executor::insertErrorCodeChecker(llvm::Function* query_func,
2862  bool hoist_literals,
2863  bool allow_runtime_query_interrupt) {
2864  auto query_stub_func_name =
2865  "query_stub" + std::string(hoist_literals ? "_hoisted_literals" : "");
2866  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
2867  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
2868  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
2869  continue;
2870  }
2871  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
2872  if (std::string(row_func_call.getCalledFunction()->getName()) ==
2873  query_stub_func_name) {
2874  auto next_inst_it = inst_it;
2875  ++next_inst_it;
2876  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
2877  auto& br_instr = bb_it->back();
2878  llvm::IRBuilder<> ir_builder(&br_instr);
2879  llvm::Value* err_lv = &*inst_it;
2880  auto error_check_bb =
2881  bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr), ".error_check");
2882  llvm::Value* error_code_arg = nullptr;
2883  auto arg_cnt = 0;
2884  for (auto arg_it = query_func->arg_begin(); arg_it != query_func->arg_end();
2885  arg_it++, ++arg_cnt) {
2886  // the multifrag_* func has anonymous arguments, so we use the argument offset
2887  // explicitly to capture the "error_code" argument in the func's argument list
2888  if (hoist_literals) {
2889  if (arg_cnt == 9) {
2890  error_code_arg = &*arg_it;
2891  break;
2892  }
2893  } else {
2894  if (arg_cnt == 8) {
2895  error_code_arg = &*arg_it;
2896  break;
2897  }
2898  }
2899  }
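  // With hoisted literals the multifrag wrapper takes one extra argument (presumably
  // the hoisted literal buffer), which shifts "error_code" from index 8 to index 9.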
2900  CHECK(error_code_arg);
2901  llvm::Value* err_code = nullptr;
2902  if (allow_runtime_query_interrupt) {
2903  // decide the final error code, taking the interrupt status into account
2904  auto& check_interrupt_br_instr = bb_it->back();
2905  auto interrupt_check_bb = llvm::BasicBlock::Create(
2906  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
2907  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
2908  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
2909  cgen_state_->module_->getFunction("check_interrupt"), {});
2910  auto detected_error = interrupt_checker_ir_builder.CreateCall(
2911  cgen_state_->module_->getFunction("get_error_code"),
2912  std::vector<llvm::Value*>{error_code_arg});
2913  err_code = interrupt_checker_ir_builder.CreateSelect(
2914  detected_interrupt,
2915  cgen_state_->llInt(Executor::ERR_INTERRUPTED),
2916  detected_error);
2917  interrupt_checker_ir_builder.CreateBr(error_check_bb);
2918  llvm::ReplaceInstWithInst(&check_interrupt_br_instr,
2919  llvm::BranchInst::Create(interrupt_check_bb));
2920  ir_builder.SetInsertPoint(&br_instr);
2921  } else {
2922  // use the error code returned from row_func and skip the interrupt status check
2923  ir_builder.SetInsertPoint(&br_instr);
2924  err_code =
2925  ir_builder.CreateCall(cgen_state_->module_->getFunction("get_error_code"),
2926  std::vector<llvm::Value*>{error_code_arg});
2927  }
2928  err_lv = ir_builder.CreateICmp(
2929  llvm::ICmpInst::ICMP_NE, err_code, cgen_state_->llInt(0));
2930  auto error_bb = llvm::BasicBlock::Create(
2931  cgen_state_->context_, ".error_exit", query_func, new_bb);
2932  llvm::CallInst::Create(cgen_state_->module_->getFunction("record_error_code"),
2933  std::vector<llvm::Value*>{err_code, error_code_arg},
2934  "",
2935  error_bb);
2936  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
2937  llvm::ReplaceInstWithInst(&br_instr,
2938  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
2939  break;
2940  }
2941  }
2942  }
2943 }
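// After this rewrite, the control flow following the query_stub call is roughly:
//   [call query_stub] -> (.interrupt_check, only when runtime interrupts are allowed)
//                     -> .error_check: err_code != 0 ?
//                     -> .error_exit (record_error_code + return) : continue in new_bb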
2944 
2945 llvm::BasicBlock* Executor::codegenSkipDeletedOuterTableRow(
2946  const RelAlgExecutionUnit& ra_exe_unit,
2947  const CompilationOptions& co) {
2948  AUTOMATIC_IR_METADATA(cgen_state_.get());
2949  if (!co.filter_on_deleted_column) {
2950  return nullptr;
2951  }
2952  CHECK(!ra_exe_unit.input_descs.empty());
2953  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
2954  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
2955  return nullptr;
2956  }
2957  const auto deleted_cd =
2958  plan_state_->getDeletedColForTable(outer_input_desc.getTableId());
2959  if (!deleted_cd) {
2960  return nullptr;
2961  }
2962  CHECK(deleted_cd->columnType.is_boolean());
2963  const auto deleted_expr =
2964  makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
2965  outer_input_desc.getTableId(),
2966  deleted_cd->columnId,
2967  outer_input_desc.getNestLevel());
2968  CodeGenerator code_generator(this);
2969  const auto is_deleted =
2970  code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
2971  const auto is_deleted_bb = llvm::BasicBlock::Create(
2972  cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
2973  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
2974  cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
2975  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
2976  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
2977  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2978  cgen_state_->ir_builder_.SetInsertPoint(bb);
2979  return bb;
2980 }
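// The branch generated above makes the row function return 0 immediately for rows whose
// "deleted" column is set; codegen for surviving rows continues in the "is_not_deleted"
// block returned to the caller.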
2981 
2982 bool Executor::compileBody(const RelAlgExecutionUnit& ra_exe_unit,
2983  GroupByAndAggregate& group_by_and_aggregate,
2984  QueryMemoryDescriptor& query_mem_desc,
2985  const CompilationOptions& co,
2986  const GpuSharedMemoryContext& gpu_smem_context) {
2987  AUTOMATIC_IR_METADATA(cgen_state_.get());
2988 
2989  // Switch the code generation into a separate filter function if enabled.
2990  // Note that accesses to function arguments are still codegenned from the
2991  // row function's arguments, then later automatically forwarded and
2992  // remapped into filter function arguments by redeclareFilterFunction().
2993  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
2994  llvm::Value* loop_done{nullptr};
2995  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
2996  if (cgen_state_->filter_func_) {
2997  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2998  auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
2999  cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
3000  row_func_entry_bb->begin());
3001  loop_done = cgen_state_->ir_builder_.CreateAlloca(
3002  get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
3003  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
3004  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
3005  }
3006  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
3007  cgen_state_->current_func_ = cgen_state_->filter_func_;
3008  fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
3009  }
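  // loop_done is set to true before entering the filter function and cleared at the end
  // of its body; if the filter function returns early (leaving it true), the row
  // function propagates that return value in the epilogue generated further below.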
3010 
3011  // generate the code for the filter
3012  std::vector<Analyzer::Expr*> primary_quals;
3013  std::vector<Analyzer::Expr*> deferred_quals;
3014  bool short_circuited = CodeGenerator::prioritizeQuals(
3015  ra_exe_unit, primary_quals, deferred_quals, plan_state_->hoisted_filters_);
3016  if (short_circuited) {
3017  VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
3018  << "short-circuited and deferred " << std::to_string(deferred_quals.size())
3019  << " quals";
3020  }
3021  llvm::Value* filter_lv = cgen_state_->llBool(true);
3022  CodeGenerator code_generator(this);
3023  for (auto expr : primary_quals) {
3024  // Generate the filter for primary quals
3025  auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
3026  filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
3027  }
3028  CHECK(filter_lv->getType()->isIntegerTy(1));
3029  llvm::BasicBlock* sc_false{nullptr};
3030  if (!deferred_quals.empty()) {
3031  auto sc_true = llvm::BasicBlock::Create(
3032  cgen_state_->context_, "sc_true", cgen_state_->current_func_);
3033  sc_false = llvm::BasicBlock::Create(
3034  cgen_state_->context_, "sc_false", cgen_state_->current_func_);
3035  cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
3036  cgen_state_->ir_builder_.SetInsertPoint(sc_false);
3037  if (ra_exe_unit.join_quals.empty()) {
3038  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
3039  }
3040  cgen_state_->ir_builder_.SetInsertPoint(sc_true);
3041  filter_lv = cgen_state_->llBool(true);
3042  }
3043  for (auto expr : deferred_quals) {
3044  filter_lv = cgen_state_->ir_builder_.CreateAnd(
3045  filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
3046  }
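  // Deferred quals are evaluated only on the sc_true path, i.e. only for rows that have
  // already passed the cheaper primary quals.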
3047 
3048  CHECK(filter_lv->getType()->isIntegerTy(1));
3049  auto ret = group_by_and_aggregate.codegen(
3050  filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);
3051 
3052  // Switch the code generation back to the row function if a filter
3053  // function was enabled.
3054  if (cgen_state_->filter_func_) {
3055  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3056  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
3057  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
3058  }
3059 
3060  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
3061  cgen_state_->current_func_ = cgen_state_->row_func_;
3062  cgen_state_->filter_func_call_ =
3063  cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});
3064 
3065  // Create real filter function declaration after placeholder call
3066  // is emitted.
3067  redeclareFilterFunction();
3068 
3069  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
3070  auto loop_done_true = llvm::BasicBlock::Create(
3071  cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
3072  auto loop_done_false = llvm::BasicBlock::Create(
3073  cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
3074  auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(
3075  loop_done->getType()->getPointerElementType(), loop_done);
3076  cgen_state_->ir_builder_.CreateCondBr(
3077  loop_done_flag, loop_done_true, loop_done_false);
3078  cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
3079  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
3080  cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
3081  } else {
3082  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
3083  }
3084  }
3085  return ret;
3086 }
3087 
3088 std::vector<llvm::Value*> generate_column_heads_load(const int num_columns,
3089  llvm::Value* byte_stream_arg,
3090  llvm::IRBuilder<>& ir_builder,
3091  llvm::LLVMContext& ctx) {
3092  CHECK(byte_stream_arg);
3093  const auto max_col_local_id = num_columns - 1;
3094 
3095  std::vector<llvm::Value*> col_heads;
3096  for (int col_id = 0; col_id <= max_col_local_id; ++col_id) {
3097  auto* gep = ir_builder.CreateGEP(
3098  byte_stream_arg->getType()->getScalarType()->getPointerElementType(),
3099  byte_stream_arg,
3100  llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), col_id));
3101  col_heads.emplace_back(
3102  ir_builder.CreateLoad(gep->getType()->getPointerElementType(), gep));
3103  }
3104  return col_heads;
3105 }
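// Each element of col_heads is the base pointer of one input column, loaded from the
// byte_stream argument; these pointers are appended to the row function's argument list
// by the caller above.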
3106 