OmniSciDB  fe05a0c208
NativeCodegen.cpp
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CodeGenerator.h"
18 #include "Execute.h"
20 #include "GpuSharedMemoryUtils.h"
23 #include "QueryTemplateGenerator.h"
24 
25 #include "CudaMgr/CudaMgr.h"
28 #include "Shared/MathUtils.h"
29 #include "StreamingTopN.h"
30 
31 #if LLVM_VERSION_MAJOR < 9
32 static_assert(false, "LLVM Version >= 9 is required.");
33 #endif
34 
35 #include <llvm/Bitcode/BitcodeReader.h>
36 #include <llvm/Bitcode/BitcodeWriter.h>
37 #include <llvm/ExecutionEngine/MCJIT.h>
38 #include <llvm/IR/Attributes.h>
39 #include <llvm/IR/GlobalValue.h>
40 #include <llvm/IR/InstIterator.h>
41 #include <llvm/IR/IntrinsicInst.h>
42 #include <llvm/IR/Intrinsics.h>
43 #include <llvm/IR/LegacyPassManager.h>
44 #include <llvm/IR/Verifier.h>
45 #include <llvm/IRReader/IRReader.h>
46 #include <llvm/Linker/Linker.h>
47 #include <llvm/Support/Casting.h>
48 #include <llvm/Support/FileSystem.h>
49 #include <llvm/Support/FormattedStream.h>
50 #include <llvm/Support/MemoryBuffer.h>
51 #include <llvm/Support/SourceMgr.h>
52 #include <llvm/Support/TargetRegistry.h>
53 #include <llvm/Support/TargetSelect.h>
54 #include <llvm/Support/raw_os_ostream.h>
55 #include <llvm/Support/raw_ostream.h>
56 #include <llvm/Transforms/IPO.h>
57 #include <llvm/Transforms/IPO/AlwaysInliner.h>
58 #include <llvm/Transforms/IPO/PassManagerBuilder.h>
59 #include <llvm/Transforms/InstCombine/InstCombine.h>
60 #include <llvm/Transforms/Instrumentation.h>
61 #include <llvm/Transforms/Scalar.h>
62 #include <llvm/Transforms/Scalar/InstSimplifyPass.h>
63 #include <llvm/Transforms/Utils.h>
64 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
65 #include <llvm/Transforms/Utils/Cloning.h>
66 
67 #if LLVM_VERSION_MAJOR >= 11
68 #include <llvm/Support/Host.h>
69 #endif
70 
72 
73 std::unique_ptr<llvm::Module> udf_gpu_module;
74 std::unique_ptr<llvm::Module> udf_cpu_module;
75 std::unique_ptr<llvm::Module> rt_udf_gpu_module;
76 std::unique_ptr<llvm::Module> rt_udf_cpu_module;
77 
78 extern std::unique_ptr<llvm::Module> g_rt_module;
79 
80 #ifdef HAVE_CUDA
81 extern std::unique_ptr<llvm::Module> g_rt_libdevice_module;
82 #endif
83 
84 #ifdef ENABLE_GEOS
85 extern std::unique_ptr<llvm::Module> g_rt_geos_module;
86 
87 #include <llvm/Support/DynamicLibrary.h>
88 
89 #ifndef GEOS_LIBRARY_FILENAME
90 #error Configuration should include GEOS library file name
91 #endif
92 std::unique_ptr<std::string> g_libgeos_so_filename(
93  new std::string(GEOS_LIBRARY_FILENAME));
94 static llvm::sys::DynamicLibrary geos_dynamic_library;
95 static std::mutex geos_init_mutex;
96 
97 namespace {
98 
99 void load_geos_dynamic_library() {
100  std::lock_guard<std::mutex> guard(geos_init_mutex);
101 
102  if (!geos_dynamic_library.isValid()) {
103  if (!g_libgeos_so_filename || g_libgeos_so_filename->empty()) {
104  LOG(WARNING) << "Misconfigured GEOS library file name, trying 'libgeos_c.so'";
105  g_libgeos_so_filename.reset(new std::string("libgeos_c.so"));
106  }
107  auto filename = *g_libgeos_so_filename;
108  std::string error_message;
109  geos_dynamic_library =
110  llvm::sys::DynamicLibrary::getPermanentLibrary(filename.c_str(), &error_message);
111  if (!geos_dynamic_library.isValid()) {
112  LOG(ERROR) << "Failed to load GEOS library '" + filename + "'";
113  std::string exception_message = "Failed to load GEOS library: " + error_message;
114  throw std::runtime_error(exception_message.c_str());
115  } else {
116  LOG(INFO) << "Loaded GEOS library '" + filename + "'";
117  }
118  }
119 }
120 
121 } // namespace
122 #endif
123 
124 namespace {
125 
126 void throw_parseIR_error(const llvm::SMDiagnostic& parse_error,
127  std::string src = "",
128  const bool is_gpu = false) {
129  std::string excname = (is_gpu ? "NVVM IR ParseError: " : "LLVM IR ParseError: ");
130  llvm::raw_string_ostream ss(excname);
131  parse_error.print(src.c_str(), ss, false, false);
132  throw ParseIRError(ss.str());
133 }
134 
135 /* SHOW_DEFINED(<llvm::Module instance>) prints the function names
136  that are defined in the given LLVM Module instance.
137 
138  SHOW_FUNCTIONS(<llvm::Module instance>) prints the function names
139  of all used functions in the given LLVM Module
140  instance. Declarations are marked with `[decl]` as a name suffix.
141 
142  Useful for debugging.
143 */
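/* Illustrative usage (not part of the original file): both macros can be
   dropped into any routine that has an llvm::Module (or a pointer/unique_ptr
   to one) in scope, e.g.

     SHOW_DEFINED(module);                  // prints "<func>#<line>: module defines: ..."
     SHOW_FUNCTIONS(cgen_state_->module_);

   The names `module` and `cgen_state_->module_` are just examples of
   variables used elsewhere in this file.
*/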
144 
145 #define SHOW_DEFINED(MODULE) \
146  { \
147  std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
148  ::show_defined(MODULE); \
149  }
150 
151 #define SHOW_FUNCTIONS(MODULE) \
152  { \
153  std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
154  ::show_functions(MODULE); \
155  }
156 
157 template <typename T = void>
158 void show_defined(llvm::Module& module) {
159  std::cout << "defines: ";
160  for (auto& f : module.getFunctionList()) {
161  if (!f.isDeclaration()) {
162  std::cout << f.getName().str() << ", ";
163  }
164  }
165  std::cout << std::endl;
166 }
167 
168 template <typename T = void>
169 void show_defined(llvm::Module* module) {
170  if (module == nullptr) {
171  std::cout << "is null" << std::endl;
172  } else {
173  show_defined(*module);
174  }
175 }
176 
177 template <typename T = void>
178 void show_defined(std::unique_ptr<llvm::Module>& module) {
179  show_defined(module.get());
180 }
181 
182 /*
183  scan_function_calls(module, defined, undefined, ignored) computes
184  defined and undefined sets of function names:
185 
186  - defined functions are those that are defined in the given module
187 
188  - undefined functions are those that are called by defined functions
189  but that are not defined in the given module
190 
191  - ignored functions are functions that may be undefined but will not
192  be listed in the set of undefined functions.
193 
194  Useful for debugging.
195 */
196 template <typename T = void>
197 void scan_function_calls(llvm::Function& F,
198  std::unordered_set<std::string>& defined,
199  std::unordered_set<std::string>& undefined,
200  const std::unordered_set<std::string>& ignored) {
201  for (llvm::inst_iterator I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
202  if (auto* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
203  auto* F2 = CI->getCalledFunction();
204  if (F2 != nullptr) {
205  auto F2name = F2->getName().str();
206  if (F2->isDeclaration()) {
207  if (F2name.rfind("__", 0) !=
208  0 // assume symbols with double underscore are defined
209  && F2name.rfind("llvm.", 0) !=
210  0 // TODO: this may give false positive for NVVM intrinsics
211  && ignored.find(F2name) == ignored.end() // not in ignored list
212  ) {
213  undefined.emplace(F2name);
214  }
215  } else {
216  if (defined.find(F2name) == defined.end()) {
217  defined.emplace(F2name);
218  scan_function_calls<T>(*F2, defined, undefined, ignored);
219  }
220  }
221  }
222  }
223  }
224 }
225 
226 template <typename T = void>
227 void scan_function_calls(llvm::Module& module,
228  std::unordered_set<std::string>& defined,
229  std::unordered_set<std::string>& undefined,
230  const std::unordered_set<std::string>& ignored) {
231  for (auto& F : module) {
232  if (!F.isDeclaration()) {
233  scan_function_calls(F, defined, undefined, ignored);
234  }
235  }
236 }
237 
238 template <typename T = void>
239 std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>>
240 scan_function_calls(llvm::Module& module,
241  const std::unordered_set<std::string>& ignored = {}) {
242  std::unordered_set<std::string> defined, undefined;
243  scan_function_calls(module, defined, undefined, ignored);
244  return std::make_tuple(defined, undefined);
245 }
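/* Illustrative usage (not part of the original file): the tuple-returning
   overload is the convenient entry point for ad-hoc debugging, e.g.

     auto [defined, undefined] = scan_function_calls(*module);
     for (const auto& fname : undefined) {
       std::cout << "undefined: " << fname << std::endl;
     }
*/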
246 
247 #if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
248 void eliminate_dead_self_recursive_funcs(
249  llvm::Module& M,
250  const std::unordered_set<llvm::Function*>& live_funcs) {
251  std::vector<llvm::Function*> dead_funcs;
252  for (auto& F : M) {
253  bool bAlive = false;
254  if (live_funcs.count(&F)) {
255  continue;
256  }
257  for (auto U : F.users()) {
258  auto* C = llvm::dyn_cast<const llvm::CallInst>(U);
259  if (!C || C->getParent()->getParent() != &F) {
260  bAlive = true;
261  break;
262  }
263  }
264  if (!bAlive) {
265  dead_funcs.push_back(&F);
266  }
267  }
268  for (auto pFn : dead_funcs) {
269  pFn->eraseFromParent();
270  }
271 }
272 
273 #ifdef HAVE_CUDA
274 
275 // check if linking with libdevice is required
276 // libdevice functions have a __nv_* prefix
277 bool check_module_requires_libdevice(llvm::Module* module) {
278  for (llvm::Function& F : *module) {
279  if (F.hasName() && F.getName().startswith("__nv_")) {
280  LOG(INFO) << "Module requires linking with libdevice: " << std::string(F.getName());
281  return true;
282  }
283  }
284  LOG(DEBUG1) << "module does not require linking against libdevice";
285  return false;
286 }
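// For example, a UDF that calls a libdevice math routine such as __nv_pow or
// __nv_sin leaves an undefined "__nv_*" declaration behind, which is what
// check_module_requires_libdevice() detects; those names are illustrative,
// any "__nv_" prefix triggers the libdevice link step.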
287 
288 // Adds any missing intrinsic declarations to the given module
289 void add_intrinsics_to_module(llvm::Module* module) {
290  for (llvm::Function& F : *module) {
291  for (llvm::Instruction& I : instructions(F)) {
292  if (llvm::IntrinsicInst* ii = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
293  if (llvm::Intrinsic::isOverloaded(ii->getIntrinsicID())) {
294  llvm::Type* Tys[] = {ii->getFunctionType()->getReturnType()};
295  llvm::Function& decl_fn =
296  *llvm::Intrinsic::getDeclaration(module, ii->getIntrinsicID(), Tys);
297  ii->setCalledFunction(&decl_fn);
298  } else {
299  // inserts the declaration into the module if not present
300  llvm::Intrinsic::getDeclaration(module, ii->getIntrinsicID());
301  }
302  }
303  }
304  }
305 }
306 
307 #endif
308 
309 void optimize_ir(llvm::Function* query_func,
310  llvm::Module* module,
311  llvm::legacy::PassManager& pass_manager,
312  const std::unordered_set<llvm::Function*>& live_funcs,
313  const CompilationOptions& co) {
314  pass_manager.add(llvm::createAlwaysInlinerLegacyPass());
315  pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
316  pass_manager.add(llvm::createInstSimplifyLegacyPass());
317  pass_manager.add(llvm::createInstructionCombiningPass());
318  pass_manager.add(llvm::createGlobalOptimizerPass());
319 
320  pass_manager.add(llvm::createLICMPass());
321  if (co.opt_level == ExecutorOptLevel::LoopStrengthReduction) {
322  pass_manager.add(llvm::createLoopStrengthReducePass());
323  }
324  pass_manager.run(*module);
325 
326  eliminate_dead_self_recursive_funcs(*module, live_funcs);
327 }
328 #endif
329 
330 } // namespace
331 
332 ExecutionEngineWrapper::ExecutionEngineWrapper() {}
333 
334 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine)
335  : execution_engine_(execution_engine) {}
336 
337 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine,
338  const CompilationOptions& co)
339  : execution_engine_(execution_engine) {
340  if (execution_engine_) {
341  if (co.register_intel_jit_listener) {
342 #ifdef ENABLE_INTEL_JIT_LISTENER
343  intel_jit_listener_.reset(llvm::JITEventListener::createIntelJITEventListener());
345  execution_engine_->RegisterJITEventListener(intel_jit_listener_.get());
346  LOG(INFO) << "Registered IntelJITEventListener";
347 #else
348  LOG(WARNING) << "This build is not Intel JIT Listener enabled. Ignoring Intel JIT "
349  "listener configuration parameter.";
350 #endif // ENABLE_INTEL_JIT_LISTENER
351  }
352  }
353 }
354 
355 ExecutionEngineWrapper& ExecutionEngineWrapper::operator=(
356  llvm::ExecutionEngine* execution_engine) {
357  execution_engine_.reset(execution_engine);
358  intel_jit_listener_ = nullptr;
359  return *this;
360 }
361 
362 void verify_function_ir(const llvm::Function* func) {
363  std::stringstream err_ss;
364  llvm::raw_os_ostream err_os(err_ss);
365  err_os << "\n-----\n";
366  if (llvm::verifyFunction(*func, &err_os)) {
367  err_os << "\n-----\n";
368  func->print(err_os, nullptr);
369  err_os << "\n-----\n";
370  LOG(FATAL) << err_ss.str();
371  }
372 }
373 
374 std::shared_ptr<CompilationContext> Executor::getCodeFromCache(const CodeCacheKey& key,
375  const CodeCache& cache) {
376  auto it = cache.find(key);
377  if (it != cache.cend()) {
378  delete cgen_state_->module_;
379  cgen_state_->module_ = it->second.second;
380  return it->second.first;
381  }
382  return {};
383 }
384 
385 void Executor::addCodeToCache(const CodeCacheKey& key,
386  std::shared_ptr<CompilationContext> compilation_context,
387  llvm::Module* module,
388  CodeCache& cache) {
389  cache.put(key,
390  std::make_pair<std::shared_ptr<CompilationContext>, decltype(module)>(
391  std::move(compilation_context), std::move(module)));
392 }
393 
394 namespace {
395 
396 std::string assemblyForCPU(ExecutionEngineWrapper& execution_engine,
397  llvm::Module* module) {
398  llvm::legacy::PassManager pass_manager;
399  auto cpu_target_machine = execution_engine->getTargetMachine();
400  CHECK(cpu_target_machine);
401  llvm::SmallString<256> code_str;
402  llvm::raw_svector_ostream os(code_str);
403 #if LLVM_VERSION_MAJOR >= 10
404  cpu_target_machine->addPassesToEmitFile(
405  pass_manager, os, nullptr, llvm::CGFT_AssemblyFile);
406 #else
407  cpu_target_machine->addPassesToEmitFile(
408  pass_manager, os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
409 #endif
410  pass_manager.run(*module);
411  return "Assembly for the CPU:\n" + std::string(code_str.str()) + "\nEnd of assembly";
412 }
413 
414 } // namespace
415 
416 ExecutionEngineWrapper CodeGenerator::generateNativeCPUCode(
417  llvm::Function* func,
418  const std::unordered_set<llvm::Function*>& live_funcs,
419  const CompilationOptions& co) {
420  auto module = func->getParent();
421  // run optimizations
422 #ifndef WITH_JIT_DEBUG
423  llvm::legacy::PassManager pass_manager;
424  optimize_ir(func, module, pass_manager, live_funcs, co);
425 #endif // WITH_JIT_DEBUG
426 
427  auto init_err = llvm::InitializeNativeTarget();
428  CHECK(!init_err);
429 
430  llvm::InitializeAllTargetMCs();
431  llvm::InitializeNativeTargetAsmPrinter();
432  llvm::InitializeNativeTargetAsmParser();
433 
434  std::string err_str;
435  std::unique_ptr<llvm::Module> owner(module);
436  llvm::EngineBuilder eb(std::move(owner));
437  eb.setErrorStr(&err_str);
438  eb.setEngineKind(llvm::EngineKind::JIT);
439  llvm::TargetOptions to;
440  to.EnableFastISel = true;
441  eb.setTargetOptions(to);
442  if (co.opt_level == ExecutorOptLevel::ReductionJIT) {
443  eb.setOptLevel(llvm::CodeGenOpt::None);
444  }
445 
446 #ifdef _WIN32
447  // TODO: workaround for data layout mismatch crash for now
448  auto target_machine = eb.selectTarget();
449  CHECK(target_machine);
450  module->setDataLayout(target_machine->createDataLayout());
451 #endif
452 
453  ExecutionEngineWrapper execution_engine(eb.create(), co);
454  CHECK(execution_engine.get());
455  LOG(ASM) << assemblyForCPU(execution_engine, module);
456 
457  execution_engine->finalizeObject();
458  return execution_engine;
459 }
460 
461 std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenCPU(
462  llvm::Function* query_func,
463  llvm::Function* multifrag_query_func,
464  const std::unordered_set<llvm::Function*>& live_funcs,
465  const CompilationOptions& co) {
466  auto module = multifrag_query_func->getParent();
467  CodeCacheKey key{serialize_llvm_object(query_func),
468  serialize_llvm_object(cgen_state_->row_func_)};
469  if (cgen_state_->filter_func_) {
470  key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
471  }
472  for (const auto helper : cgen_state_->helper_functions_) {
473  key.push_back(serialize_llvm_object(helper));
474  }
475  auto cached_code = getCodeFromCache(key, cpu_code_cache_);
476  if (cached_code) {
477  return cached_code;
478  }
479 
480  if (cgen_state_->needs_geos_) {
481 #ifdef ENABLE_GEOS
482  load_geos_dynamic_library();
483 
484  // Read geos runtime module and bind GEOS API function references to GEOS library
485  auto rt_geos_module_copy = llvm::CloneModule(
486  *g_rt_geos_module.get(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
487  auto func = llvm::dyn_cast<llvm::Function>(gv);
488  if (!func) {
489  return true;
490  }
491  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
492  func->getLinkage() ==
493  llvm::GlobalValue::LinkageTypes::InternalLinkage ||
494  func->getLinkage() == llvm::GlobalValue::LinkageTypes::ExternalLinkage);
495  });
496  CodeGenerator::link_udf_module(rt_geos_module_copy,
497  *module,
498  cgen_state_.get(),
499  llvm::Linker::Flags::LinkOnlyNeeded);
500 #else
501  throw std::runtime_error("GEOS is disabled in this build");
502 #endif
503  }
504 
505  auto execution_engine =
506  CodeGenerator::generateNativeCPUCode(query_func, live_funcs, co);
507  auto cpu_compilation_context =
508  std::make_shared<CpuCompilationContext>(std::move(execution_engine));
509  cpu_compilation_context->setFunctionPointer(multifrag_query_func);
510  addCodeToCache(key, cpu_compilation_context, module, cpu_code_cache_);
511  return cpu_compilation_context;
512 }
513 
514 void CodeGenerator::link_udf_module(const std::unique_ptr<llvm::Module>& udf_module,
515  llvm::Module& module,
516  CgenState* cgen_state,
517  llvm::Linker::Flags flags) {
518  // Throw a runtime error if the target module contains functions
519  // with the same names as functions defined in the UDF module.
520  for (auto& f : *udf_module.get()) {
521  auto func = module.getFunction(f.getName());
522  if (!(func == nullptr) && !f.isDeclaration() && flags == llvm::Linker::Flags::None) {
523  LOG(ERROR) << " Attempt to overwrite " << f.getName().str() << " in "
524  << module.getModuleIdentifier() << " from `"
525  << udf_module->getModuleIdentifier() << "`" << std::endl;
526  throw std::runtime_error(
527  "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
528  "function ***");
529  } else {
530  VLOG(1) << " Adding " << f.getName().str() << " to "
531  << module.getModuleIdentifier() << " from `"
532  << udf_module->getModuleIdentifier() << "`" << std::endl;
533  }
534  }
535 
536  std::unique_ptr<llvm::Module> udf_module_copy;
537 
538  udf_module_copy = llvm::CloneModule(*udf_module.get(), cgen_state->vmap_);
539 
540  udf_module_copy->setDataLayout(module.getDataLayout());
541  udf_module_copy->setTargetTriple(module.getTargetTriple());
542 
543  // Initialize linker with module for RuntimeFunctions.bc
544  llvm::Linker ld(module);
545  bool link_error = false;
546 
547  link_error = ld.linkInModule(std::move(udf_module_copy), flags);
548 
549  if (link_error) {
550  throw std::runtime_error("link_udf_module: *** error linking module ***");
551  }
552 }
553 
554 namespace {
555 
556 std::string cpp_to_llvm_name(const std::string& s) {
557  if (s == "int8_t") {
558  return "i8";
559  }
560  if (s == "int16_t") {
561  return "i16";
562  }
563  if (s == "int32_t") {
564  return "i32";
565  }
566  if (s == "int64_t") {
567  return "i64";
568  }
569  CHECK(s == "float" || s == "double");
570  return s;
571 }
572 
573 std::string gen_array_any_all_sigs() {
574  std::string result;
575  for (const std::string any_or_all : {"any", "all"}) {
576  for (const std::string elem_type :
577  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
578  for (const std::string needle_type :
579  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
580  for (const std::string op_name : {"eq", "ne", "lt", "le", "gt", "ge"}) {
581  result += ("declare i1 @array_" + any_or_all + "_" + op_name + "_" + elem_type +
582  "_" + needle_type + "(i8*, i64, " + cpp_to_llvm_name(needle_type) +
583  ", " + cpp_to_llvm_name(elem_type) + ");\n");
584  }
585  }
586  }
587  }
588  return result;
589 }
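// For example, the any/eq combination with int32_t elements and a double
// needle expands to:
//   declare i1 @array_any_eq_int32_t_double(i8*, i64, double, i32);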
590 
591 std::string gen_translate_null_key_sigs() {
592  std::string result;
593  for (const std::string key_type : {"int8_t", "int16_t", "int32_t", "int64_t"}) {
594  const auto key_llvm_type = cpp_to_llvm_name(key_type);
595  result += "declare i64 @translate_null_key_" + key_type + "(" + key_llvm_type + ", " +
596  key_llvm_type + ", i64);\n";
597  }
598  return result;
599 }
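// For example, the int32_t entry of the loop above expands to:
//   declare i64 @translate_null_key_int32_t(i32, i32, i64);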
600 
601 const std::string cuda_rt_decls =
602  R"( declare void @llvm.dbg.declare(metadata, metadata, metadata) declare void @llvm.dbg.value(metadata, metadata, metadata) declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind declare i64 @get_thread_index(); declare i64 @get_block_index(); declare i32 @pos_start_impl(i32*); declare i32 @group_buff_idx_impl(); declare i32 @pos_step_impl(); declare i8 @thread_warp_idx(i8); declare i64* @init_shared_mem(i64*, i32); declare i64* @init_shared_mem_nop(i64*, i32); declare i64* @declare_dynamic_shared_memory(); declare void @write_back_nop(i64*, i64*, i32); declare void @write_back_non_grouped_agg(i64*, i64*, i32); declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8); declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32); declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32); declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32); declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32); declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32); declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32); declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64); declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64); declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64); declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64); declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64); declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double); declare i64 @get_bucket_key_for_range_double(i8*, i64, double); declare i32 @get_num_buckets_for_bounds(i8*, i32, double, double); declare i64 @get_candidate_rows(i32*, i32, i8*, i32, double, double, i32, i64, i64*, i64, i64, i64); declare i64 @agg_count_shared(i64*, i64); declare i64 @agg_count_skip_val_shared(i64*, i64, i64); declare i32 @agg_count_int32_shared(i32*, i32); declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32); declare i64 @agg_count_double_shared(i64*, double); declare i64 @agg_count_double_skip_val_shared(i64*, double, double); declare i32 @agg_count_float_shared(i32*, float); declare i32 @agg_count_float_skip_val_shared(i32*, float, float); declare i64 @agg_sum_shared(i64*, i64); declare i64 @agg_sum_skip_val_shared(i64*, i64, i64); declare i32 @agg_sum_int32_shared(i32*, i32); declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32); declare void @agg_sum_double_shared(i64*, double); declare void @agg_sum_double_skip_val_shared(i64*, double, double); declare void @agg_sum_float_shared(i32*, float); declare void @agg_sum_float_skip_val_shared(i32*, float, float); declare void @agg_max_shared(i64*, i64); declare void @agg_max_skip_val_shared(i64*, i64, i64); declare void @agg_max_int32_shared(i32*, i32); declare void @agg_max_int32_skip_val_shared(i32*, i32, i32); declare void @agg_max_int16_shared(i16*, i16); declare void @agg_max_int16_skip_val_shared(i16*, i16, i16); declare void @agg_max_int8_shared(i8*, i8); declare void @agg_max_int8_skip_val_shared(i8*, i8, i8); declare void @agg_max_double_shared(i64*, double); declare void @agg_max_double_skip_val_shared(i64*, double, double); declare void @agg_max_float_shared(i32*, float); declare void @agg_max_float_skip_val_shared(i32*, float, float); declare void @agg_min_shared(i64*, i64); declare void 
@agg_min_skip_val_shared(i64*, i64, i64); declare void @agg_min_int32_shared(i32*, i32); declare void @agg_min_int32_skip_val_shared(i32*, i32, i32); declare void @agg_min_int16_shared(i16*, i16); declare void @agg_min_int16_skip_val_shared(i16*, i16, i16); declare void @agg_min_int8_shared(i8*, i8); declare void @agg_min_int8_skip_val_shared(i8*, i8, i8); declare void @agg_min_double_shared(i64*, double); declare void @agg_min_double_skip_val_shared(i64*, double, double); declare void @agg_min_float_shared(i32*, float); declare void @agg_min_float_skip_val_shared(i32*, float, float); declare void @agg_id_shared(i64*, i64); declare void @agg_id_int32_shared(i32*, i32); declare void @agg_id_int16_shared(i16*, i16); declare void @agg_id_int8_shared(i8*, i8); declare void @agg_id_double_shared(i64*, double); declare void @agg_id_double_shared_slow(i64*, double*); declare void @agg_id_float_shared(i32*, float); declare i32 @checked_single_agg_id_shared(i64*, i64, i64); declare i32 @checked_single_agg_id_double_shared(i64*, double, double); declare i32 @checked_single_agg_id_double_shared_slow(i64*, double*, double); declare i32 @checked_single_agg_id_float_shared(i32*, float, float); declare i1 @slotEmptyKeyCAS(i64*, i64, i64); declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32); declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16); declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8); declare i64 @datetrunc_century(i64); declare i64 @datetrunc_day(i64); declare i64 @datetrunc_decade(i64); declare i64 @datetrunc_hour(i64); declare i64 @datetrunc_millennium(i64); declare i64 @datetrunc_minute(i64); declare i64 @datetrunc_month(i64); declare i64 @datetrunc_quarter(i64); declare i64 @datetrunc_quarterday(i64); declare i64 @datetrunc_week_monday(i64); declare i64 @datetrunc_week_sunday(i64); declare i64 @datetrunc_week_saturday(i64); declare i64 @datetrunc_year(i64); declare i64 @extract_epoch(i64); declare i64 @extract_dateepoch(i64); declare i64 @extract_quarterday(i64); declare i64 @extract_hour(i64); declare i64 @extract_minute(i64); declare i64 @extract_second(i64); declare i64 @extract_millisecond(i64); declare i64 @extract_microsecond(i64); declare i64 @extract_nanosecond(i64); declare i64 @extract_dow(i64); declare i64 @extract_isodow(i64); declare i64 @extract_day(i64); declare i64 @extract_week_monday(i64); declare i64 @extract_week_sunday(i64); declare i64 @extract_week_saturday(i64); declare i64 @extract_day_of_year(i64); declare i64 @extract_month(i64); declare i64 @extract_quarter(i64); declare i64 @extract_year(i64); declare i64 @DateTruncateHighPrecisionToDate(i64, i64); declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64); declare i64 @DateDiff(i32, i64, i64); declare i64 @DateDiffNullable(i32, i64, i64, i64); declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i32); declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i32, i64); declare i64 @DateAdd(i32, i64, i64); declare i64 @DateAddNullable(i32, i64, i64, i64); declare i64 @DateAddHighPrecision(i32, i64, i64, i32); declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i32, i64); declare i64 @string_decode(i8*, i64); declare i32 @array_size(i8*, i64, i32); declare i32 @array_size_nullable(i8*, i64, i32, i32); declare i32 @fast_fixlen_array_size(i8*, i32); declare i1 @array_is_null(i8*, i64); declare i1 @point_coord_array_is_null(i8*, i64); declare i8* @array_buff(i8*, i64); declare i8* @fast_fixlen_array_buff(i8*, i64); declare i8 @array_at_int8_t(i8*, i64, i32); declare i16 @array_at_int16_t(i8*, i64, i32); 
declare i32 @array_at_int32_t(i8*, i64, i32); declare i64 @array_at_int64_t(i8*, i64, i32); declare float @array_at_float(i8*, i64, i32); declare double @array_at_double(i8*, i64, i32); declare i8 @varlen_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_array_at_int64_t(i8*, i64, i32); declare float @varlen_array_at_float(i8*, i64, i32); declare double @varlen_array_at_double(i8*, i64, i32); declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32); declare float @varlen_notnull_array_at_float(i8*, i64, i32); declare double @varlen_notnull_array_at_double(i8*, i64, i32); declare i8 @array_at_int8_t_checked(i8*, i64, i64, i8); declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16); declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32); declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64); declare float @array_at_float_checked(i8*, i64, i64, float); declare double @array_at_double_checked(i8*, i64, i64, double); declare i32 @char_length(i8*, i32); declare i32 @char_length_nullable(i8*, i32, i32); declare i32 @char_length_encoded(i8*, i32); declare i32 @char_length_encoded_nullable(i8*, i32, i32); declare i32 @key_for_string_encoded(i32); declare i1 @sample_ratio(double, i64); declare i1 @string_like(i8*, i32, i8*, i32, i8); declare i1 @string_ilike(i8*, i32, i8*, i32, i8); declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8); declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8); declare i1 @string_like_simple(i8*, i32, i8*, i32); declare i1 @string_ilike_simple(i8*, i32, i8*, i32); declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8); declare i1 @string_lt(i8*, i32, i8*, i32); declare i1 @string_le(i8*, i32, i8*, i32); declare i1 @string_gt(i8*, i32, i8*, i32); declare i1 @string_ge(i8*, i32, i8*, i32); declare i1 @string_eq(i8*, i32, i8*, i32); declare i1 @string_ne(i8*, i32, i8*, i32); declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8); declare i1 @regexp_like(i8*, i32, i8*, i32, i8); declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8); declare void @linear_probabilistic_count(i8*, i32, i8*, i32); declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, i64); declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64); declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64); declare void @record_error_code(i32, i32*); declare i32 @get_error_code(i32*); declare i1 @dynamic_watchdog(); declare i1 @check_interrupt(); declare void @force_sync(); declare void @sync_warp(); declare void @sync_warp_protected(i64, i64); declare void @sync_threadblock(); declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32); declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64); declare i64* @get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, float, float); declare i64* 
@get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double); declare double @decompress_x_coord_geoint(i32); declare double @decompress_y_coord_geoint(i32); )" + gen_array_any_all_sigs() +
603  gen_translate_null_key_sigs();
604 
605 #ifdef HAVE_CUDA
606 std::string extension_function_decls(const std::unordered_set<std::string>& udf_decls) {
607  const auto decls =
608  ExtensionFunctionsWhitelist::getLLVMDeclarations(udf_decls, /*is_gpu=*/true);
609  return boost::algorithm::join(decls, "\n");
610 }
611 
612 void legalize_nvvm_ir(llvm::Function* query_func) {
613  // optimizations might add attributes to the function
614  // and NVPTX doesn't understand all of them; play it
615  // safe and clear all attributes
616  clear_function_attributes(query_func);
617  verify_function_ir(query_func);
618 
619  std::vector<llvm::Instruction*> stackrestore_intrinsics;
620  std::vector<llvm::Instruction*> stacksave_intrinsics;
621  for (auto& BB : *query_func) {
622  for (llvm::Instruction& I : BB) {
623  if (const llvm::IntrinsicInst* II = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
624  if (II->getIntrinsicID() == llvm::Intrinsic::stacksave) {
625  stacksave_intrinsics.push_back(&I);
626  } else if (II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
627  stackrestore_intrinsics.push_back(&I);
628  }
629  }
630  }
631  }
632 
633  // stacksave and stackrestore intrinsics appear together, and
634  // stackrestore uses the stacksave result as its argument,
635  // so the stackrestore intrinsics must be removed first.
636  for (auto& II : stackrestore_intrinsics) {
637  II->eraseFromParent();
638  }
639  for (auto& II : stacksave_intrinsics) {
640  II->eraseFromParent();
641  }
642 }
643 #endif // HAVE_CUDA
644 
645 } // namespace
646 
647 llvm::StringRef get_gpu_target_triple_string() {
648  return llvm::StringRef("nvptx64-nvidia-cuda");
649 }
650 
651 llvm::StringRef get_gpu_data_layout() {
652  return llvm::StringRef(
653  "e-p:64:64:64-i1:8:8-i8:8:8-"
654  "i16:16:16-i32:32:32-i64:64:64-"
655  "f32:32:32-f64:64:64-v16:16:16-"
656  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
657 }
658 
659 std::map<std::string, std::string> get_device_parameters(bool cpu_only) {
660  std::map<std::string, std::string> result;
661 
662  result.insert(std::make_pair("cpu_name", llvm::sys::getHostCPUName()));
663  result.insert(std::make_pair("cpu_triple", llvm::sys::getProcessTriple()));
664  result.insert(
665  std::make_pair("cpu_cores", std::to_string(llvm::sys::getHostNumPhysicalCores())));
666  result.insert(std::make_pair("cpu_threads", std::to_string(cpu_threads())));
667 
668  // https://en.cppreference.com/w/cpp/language/types
669  std::string sizeof_types;
670  sizeof_types += "bool:" + std::to_string(sizeof(bool)) + ";";
671  sizeof_types += "size_t:" + std::to_string(sizeof(size_t)) + ";";
672  sizeof_types += "ssize_t:" + std::to_string(sizeof(ssize_t)) + ";";
673  sizeof_types += "char:" + std::to_string(sizeof(char)) + ";";
674  sizeof_types += "uchar:" + std::to_string(sizeof(unsigned char)) + ";";
675  sizeof_types += "short:" + std::to_string(sizeof(short)) + ";";
676  sizeof_types += "ushort:" + std::to_string(sizeof(unsigned short int)) + ";";
677  sizeof_types += "int:" + std::to_string(sizeof(int)) + ";";
678  sizeof_types += "uint:" + std::to_string(sizeof(unsigned int)) + ";";
679  sizeof_types += "long:" + std::to_string(sizeof(long int)) + ";";
680  sizeof_types += "ulong:" + std::to_string(sizeof(unsigned long int)) + ";";
681  sizeof_types += "longlong:" + std::to_string(sizeof(long long int)) + ";";
682  sizeof_types += "ulonglong:" + std::to_string(sizeof(unsigned long long int)) + ";";
683  sizeof_types += "float:" + std::to_string(sizeof(float)) + ";";
684  sizeof_types += "double:" + std::to_string(sizeof(double)) + ";";
685  sizeof_types += "longdouble:" + std::to_string(sizeof(long double)) + ";";
686  sizeof_types += "voidptr:" + std::to_string(sizeof(void*)) + ";";
687 
688  result.insert(std::make_pair("type_sizeof", sizeof_types));
689 
690  std::string null_values;
691  null_values += "boolean1:" + std::to_string(serialized_null_value<bool>()) + ";";
692  null_values += "boolean8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
693  null_values += "int8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
694  null_values += "int16:" + std::to_string(serialized_null_value<int16_t>()) + ";";
695  null_values += "int32:" + std::to_string(serialized_null_value<int32_t>()) + ";";
696  null_values += "int64:" + std::to_string(serialized_null_value<int64_t>()) + ";";
697  null_values += "uint8:" + std::to_string(serialized_null_value<uint8_t>()) + ";";
698  null_values += "uint16:" + std::to_string(serialized_null_value<uint16_t>()) + ";";
699  null_values += "uint32:" + std::to_string(serialized_null_value<uint32_t>()) + ";";
700  null_values += "uint64:" + std::to_string(serialized_null_value<uint64_t>()) + ";";
701  null_values += "float32:" + std::to_string(serialized_null_value<float>()) + ";";
702  null_values += "float64:" + std::to_string(serialized_null_value<double>()) + ";";
703  null_values +=
704  "Array<boolean8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
705  null_values +=
706  "Array<int8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
707  null_values +=
708  "Array<int16>:" + std::to_string(serialized_null_value<int16_t, true>()) + ";";
709  null_values +=
710  "Array<int32>:" + std::to_string(serialized_null_value<int32_t, true>()) + ";";
711  null_values +=
712  "Array<int64>:" + std::to_string(serialized_null_value<int64_t, true>()) + ";";
713  null_values +=
714  "Array<float32>:" + std::to_string(serialized_null_value<float, true>()) + ";";
715  null_values +=
716  "Array<float64>:" + std::to_string(serialized_null_value<double, true>()) + ";";
717 
718  result.insert(std::make_pair("null_values", null_values));
719 
720  llvm::StringMap<bool> cpu_features;
721  if (llvm::sys::getHostCPUFeatures(cpu_features)) {
722  std::string features_str = "";
723  for (auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
724  features_str += (it->getValue() ? " +" : " -");
725  features_str += it->getKey().str();
726  }
727  result.insert(std::make_pair("cpu_features", features_str));
728  }
729 
730  result.insert(std::make_pair("llvm_version",
731  std::to_string(LLVM_VERSION_MAJOR) + "." +
732  std::to_string(LLVM_VERSION_MINOR) + "." +
733  std::to_string(LLVM_VERSION_PATCH)));
734 
735 #ifdef HAVE_CUDA
736  if (!cpu_only) {
737  int device_count = 0;
738  checkCudaErrors(cuDeviceGetCount(&device_count));
739  if (device_count) {
740  CUdevice device{};
741  char device_name[256];
742  int major = 0, minor = 0;
743  int driver_version;
744  checkCudaErrors(cuDeviceGet(&device, 0)); // assuming homogeneous multi-GPU system
745  checkCudaErrors(cuDeviceGetName(device_name, 256, device));
746  checkCudaErrors(cuDeviceGetAttribute(
747  &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
748  checkCudaErrors(cuDeviceGetAttribute(
749  &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
750  checkCudaErrors(cuDriverGetVersion(&driver_version));
751 
752  result.insert(std::make_pair("gpu_name", device_name));
753  result.insert(std::make_pair("gpu_count", std::to_string(device_count)));
754  result.insert(std::make_pair("gpu_compute_capability",
755  std::to_string(major) + "." + std::to_string(minor)));
756  result.insert(std::make_pair("gpu_triple", get_gpu_target_triple_string()));
757  result.insert(std::make_pair("gpu_datalayout", get_gpu_data_layout()));
758  result.insert(std::make_pair("gpu_driver",
759  "CUDA " + std::to_string(driver_version / 1000) + "." +
760  std::to_string((driver_version % 1000) / 10)));
761  }
762  }
763 #endif
764 
765  return result;
766 }
767 
768 std::shared_ptr<GpuCompilationContext> CodeGenerator::generateNativeGPUCode(
769  llvm::Function* func,
770  llvm::Function* wrapper_func,
771  const std::unordered_set<llvm::Function*>& live_funcs,
772  const CompilationOptions& co,
773  const GPUTarget& gpu_target) {
774 #ifdef HAVE_CUDA
775  auto module = func->getParent();
776  /*
777  `func` is one of the following generated functions:
778  - `call_table_function(i8** %input_col_buffers, i64*
779  %input_row_count, i64** %output_buffers, i64* %output_row_count)`
780  that wraps the user-defined table function.
781  - `multifrag_query`
782  - `multifrag_query_hoisted_literals`
783  - ...
784 
785  `wrapper_func` is table_func_kernel(i32*, i8**, i64*, i64**,
786  i64*) that wraps `call_table_function`.
787 
788  `module` is from `build/QueryEngine/RuntimeFunctions.bc` and it
789  contains `func` and `wrapper_func`. `module` should also contain
790  the definitions of user-defined table functions.
791 
792  `live_funcs` contains table_func_kernel and call_table_function
793 
794  `gpu_target.cgen_state->module_` appears to be the same as `module`
795  */
796  CHECK(gpu_target.cgen_state->module_ == module);
797  module->setDataLayout(
798  "e-p:64:64:64-i1:8:8-i8:8:8-"
799  "i16:16:16-i32:32:32-i64:64:64-"
800  "f32:32:32-f64:64:64-v16:16:16-"
801  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
802  module->setTargetTriple("nvptx64-nvidia-cuda");
803  CHECK(gpu_target.nvptx_target_machine);
804  auto pass_manager_builder = llvm::PassManagerBuilder();
805 
806  pass_manager_builder.OptLevel = 0;
807  llvm::legacy::PassManager module_pass_manager;
808  pass_manager_builder.populateModulePassManager(module_pass_manager);
809 
810  bool requires_libdevice = check_module_requires_libdevice(module);
811 
812  if (requires_libdevice) {
813  // add nvvm reflect pass replacing any NVVM conditionals with constants
814  gpu_target.nvptx_target_machine->adjustPassManager(pass_manager_builder);
815  llvm::legacy::FunctionPassManager FPM(module);
816  pass_manager_builder.populateFunctionPassManager(FPM);
817 
818  // Run the NVVMReflectPass here rather than inside optimize_ir
819  FPM.doInitialization();
820  for (auto& F : *module) {
821  FPM.run(F);
822  }
823  FPM.doFinalization();
824  }
825 
826  // run optimizations
827  optimize_ir(func, module, module_pass_manager, live_funcs, co);
828  legalize_nvvm_ir(func);
829 
830  std::stringstream ss;
831  llvm::raw_os_ostream os(ss);
832 
833  llvm::LLVMContext& ctx = module->getContext();
834  // Get "nvvm.annotations" metadata node
835  llvm::NamedMDNode* md = module->getOrInsertNamedMetadata("nvvm.annotations");
836 
837  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
838  llvm::MDString::get(ctx, "kernel"),
839  llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
840  llvm::Type::getInt32Ty(ctx), 1))};
841 
842  // Append metadata to nvvm.annotations
843  md->addOperand(llvm::MDNode::get(ctx, md_vals));
844 
845  std::unordered_set<llvm::Function*> roots{wrapper_func, func};
846  if (gpu_target.row_func_not_inlined) {
847  clear_function_attributes(gpu_target.cgen_state->row_func_);
848  roots.insert(gpu_target.cgen_state->row_func_);
849  if (gpu_target.cgen_state->filter_func_) {
850  roots.insert(gpu_target.cgen_state->filter_func_);
851  }
852  }
853 
854  // prevent helper functions from being removed
855  for (auto f : gpu_target.cgen_state->helper_functions_) {
856  roots.insert(f);
857  }
858 
859  // Prevent the udf function(s) from being removed the way the runtime functions are
860  std::unordered_set<std::string> udf_declarations;
861  if (is_udf_module_present()) {
862  for (auto& f : udf_gpu_module->getFunctionList()) {
863  llvm::Function* udf_function = module->getFunction(f.getName());
864 
865  if (udf_function) {
866  legalize_nvvm_ir(udf_function);
867  roots.insert(udf_function);
869  // If we have a udf that declares an external function,
870  // note it so we can avoid duplicate declarations
871  if (f.isDeclaration()) {
872  udf_declarations.insert(f.getName().str());
873  }
874  }
875  }
876  }
877 
878  if (is_rt_udf_module_present()) {
879  for (auto& f : rt_udf_gpu_module->getFunctionList()) {
880  llvm::Function* udf_function = module->getFunction(f.getName());
881  if (udf_function) {
882  legalize_nvvm_ir(udf_function);
883  roots.insert(udf_function);
884 
885  // If we have a udf that declares an external function,
886  // note it so we can avoid duplicate declarations
887  if (f.isDeclaration()) {
888  udf_declarations.insert(f.getName().str());
889  }
890  }
891  }
892  }
893 
894  std::vector<llvm::Function*> rt_funcs;
895  for (auto& Fn : *module) {
896  if (roots.count(&Fn)) {
897  continue;
898  }
899  rt_funcs.push_back(&Fn);
900  }
901  for (auto& pFn : rt_funcs) {
902  pFn->removeFromParent();
903  }
904 
905  if (requires_libdevice) {
906  add_intrinsics_to_module(module);
907  }
908 
909  module->print(os, nullptr);
910  os.flush();
911 
912  for (auto& pFn : rt_funcs) {
913  module->getFunctionList().push_back(pFn);
914  }
915  module->eraseNamedMetadata(md);
916 
917  auto cuda_llir = ss.str() + cuda_rt_decls + extension_function_decls(udf_declarations);
918  std::string ptx;
919  try {
920  ptx = generatePTX(
921  cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state->context_);
922  } catch (ParseIRError& e) {
923  LOG(WARNING) << "Failed to generate PTX: " << e.what()
924  << ". Switching to CPU execution target.";
925  throw QueryMustRunOnCpu();
926  }
927  LOG(PTX) << "PTX for the GPU:\n" << ptx << "\nEnd of PTX";
928 
929  auto cubin_result = ptx_to_cubin(ptx, gpu_target.block_size, gpu_target.cuda_mgr);
930  auto& option_keys = cubin_result.option_keys;
931  auto& option_values = cubin_result.option_values;
932  auto cubin = cubin_result.cubin;
933  auto link_state = cubin_result.link_state;
934  const auto num_options = option_keys.size();
935 
936  auto func_name = wrapper_func->getName().str();
937  auto gpu_compilation_context = std::make_shared<GpuCompilationContext>();
938  for (int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
939  ++device_id) {
940  gpu_compilation_context->addDeviceCode(
941  std::make_unique<GpuDeviceCompilationContext>(cubin,
942  func_name,
943  device_id,
944  gpu_target.cuda_mgr,
945  num_options,
946  &option_keys[0],
947  &option_values[0]));
948  }
949 
950  checkCudaErrors(cuLinkDestroy(link_state));
951  return gpu_compilation_context;
952 #else
953  return {};
954 #endif
955 }
956 
957 std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenGPU(
958  llvm::Function* query_func,
959  llvm::Function* multifrag_query_func,
960  std::unordered_set<llvm::Function*>& live_funcs,
961  const bool no_inline,
962  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
963  const CompilationOptions& co) {
964 #ifdef HAVE_CUDA
965  auto module = multifrag_query_func->getParent();
966 
967  CHECK(cuda_mgr);
968  CodeCacheKey key{serialize_llvm_object(query_func),
969  serialize_llvm_object(cgen_state_->row_func_)};
970  if (cgen_state_->filter_func_) {
971  key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
972  }
973  for (const auto helper : cgen_state_->helper_functions_) {
974  key.push_back(serialize_llvm_object(helper));
975  }
976  auto cached_code = getCodeFromCache(key, gpu_code_cache_);
977  if (cached_code) {
978  return cached_code;
979  }
980 
981  bool row_func_not_inlined = false;
982  if (no_inline) {
983  for (auto it = llvm::inst_begin(cgen_state_->row_func_),
984  e = llvm::inst_end(cgen_state_->row_func_);
985  it != e;
986  ++it) {
987  if (llvm::isa<llvm::CallInst>(*it)) {
988  auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
989  if (get_gv_call.getCalledFunction()->getName() == "array_size" ||
990  get_gv_call.getCalledFunction()->getName() == "linear_probabilistic_count") {
991  mark_function_never_inline(cgen_state_->row_func_);
992  row_func_not_inlined = true;
993  break;
994  }
995  }
996  }
997  }
998 
999  initializeNVPTXBackend();
1000  CodeGenerator::GPUTarget gpu_target{nvptx_target_machine_.get(),
1001  cuda_mgr,
1002  blockSize(),
1003  cgen_state_.get(),
1004  row_func_not_inlined};
1005  std::shared_ptr<GpuCompilationContext> compilation_context;
1006 
1007  if (check_module_requires_libdevice(module)) {
1008  if (g_rt_libdevice_module == nullptr) {
1009  // raise error
1010  throw std::runtime_error(
1011  "libdevice library is not available but required by the UDF module");
1012  }
1013 
1014  // Bind libdevice to the current module
1015  CodeGenerator::link_udf_module(g_rt_libdevice_module,
1016  *module,
1017  cgen_state_.get(),
1018  llvm::Linker::Flags::OverrideFromSrc);
1019 
1020  // activate nvvm-reflect-ftz flag on the module
1021  module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", (int)1);
1022  for (llvm::Function& fn : *module) {
1023  fn.addFnAttr("nvptx-f32ftz", "true");
1024  }
1025  }
1026 
1027  try {
1028  compilation_context = CodeGenerator::generateNativeGPUCode(
1029  query_func, multifrag_query_func, live_funcs, co, gpu_target);
1030  addCodeToCache(key, compilation_context, module, gpu_code_cache_);
1031  } catch (CudaMgr_Namespace::CudaErrorException& cuda_error) {
1032  if (cuda_error.getStatus() == CUDA_ERROR_OUT_OF_MEMORY) {
1033  // Thrown if memory could not be allocated on the GPU.
1034  // Retry once after evicting a portion of the code cache.
1035  LOG(WARNING) << "Failed to allocate GPU memory for generated code. Evicting "
1036  << g_fraction_code_cache_to_evict * 100.
1037  << "% of GPU code cache and re-trying.";
1038  gpu_code_cache_.evictFractionEntries(g_fraction_code_cache_to_evict);
1039  compilation_context = CodeGenerator::generateNativeGPUCode(
1040  query_func, multifrag_query_func, live_funcs, co, gpu_target);
1041  addCodeToCache(key, compilation_context, module, gpu_code_cache_);
1042  } else {
1043  throw;
1044  }
1045  }
1046  CHECK(compilation_context);
1047  return compilation_context;
1048 #else
1049  return nullptr;
1050 #endif
1051 }
1052 
1053 std::string CodeGenerator::generatePTX(const std::string& cuda_llir,
1054  llvm::TargetMachine* nvptx_target_machine,
1055  llvm::LLVMContext& context) {
1056  auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir, "", false);
1057 
1058  llvm::SMDiagnostic parse_error;
1059 
1060  auto module = llvm::parseIR(mem_buff->getMemBufferRef(), parse_error, context);
1061  if (!module) {
1062  LOG(IR) << "CodeGenerator::generatePTX:NVVM IR:\n" << cuda_llir << "\nEnd of NVVM IR";
1063  throw_parseIR_error(parse_error, "generatePTX", /* is_gpu= */ true);
1064  }
1065 
1066  llvm::SmallString<256> code_str;
1067  llvm::raw_svector_ostream formatted_os(code_str);
1068  CHECK(nvptx_target_machine);
1069  {
1070  llvm::legacy::PassManager ptxgen_pm;
1071  module->setDataLayout(nvptx_target_machine->createDataLayout());
1072 
1073 #if LLVM_VERSION_MAJOR >= 10
1074  nvptx_target_machine->addPassesToEmitFile(
1075  ptxgen_pm, formatted_os, nullptr, llvm::CGFT_AssemblyFile);
1076 #else
1077  nvptx_target_machine->addPassesToEmitFile(
1078  ptxgen_pm, formatted_os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
1079 #endif
1080  ptxgen_pm.run(*module);
1081  }
1082 
1083 #if LLVM_VERSION_MAJOR >= 11
1084  return std::string(code_str);
1085 #else
1086  return code_str.str();
1087 #endif
1088 }
1089 
1090 std::unique_ptr<llvm::TargetMachine> CodeGenerator::initializeNVPTXBackend(
1091  const CudaMgr_Namespace::NvidiaDeviceArch arch) {
1092  llvm::InitializeAllTargets();
1093  llvm::InitializeAllTargetMCs();
1094  llvm::InitializeAllAsmPrinters();
1095  std::string err;
1096  auto target = llvm::TargetRegistry::lookupTarget("nvptx64", err);
1097  if (!target) {
1098  LOG(FATAL) << err;
1099  }
1100  return std::unique_ptr<llvm::TargetMachine>(
1101  target->createTargetMachine("nvptx64-nvidia-cuda",
1103  "",
1104  llvm::TargetOptions(),
1105  llvm::Reloc::Static));
1106 }
1107 
1108 std::string Executor::generatePTX(const std::string& cuda_llir) const {
1109  return CodeGenerator::generatePTX(
1110  cuda_llir, nvptx_target_machine_.get(), cgen_state_->context_);
1111 }
1112 
1113 void Executor::initializeNVPTXBackend() const {
1114  if (nvptx_target_machine_) {
1115  return;
1116  }
1117  const auto cuda_mgr = catalog_->getDataMgr().getCudaMgr();
1118  LOG_IF(FATAL, cuda_mgr == nullptr) << "No CudaMgr instantiated, unable to check device "
1119  "architecture or generate code for nvidia GPUs.";
1120  const auto arch = cuda_mgr->getDeviceArch();
1121  nvptx_target_machine_ = CodeGenerator::initializeNVPTXBackend(arch);
1122 }
1123 
1124 // A small number of runtime functions don't get through CgenState::emitCall. List them
1125 // explicitly here and always clone their implementation from the runtime module.
1126 bool CodeGenerator::alwaysCloneRuntimeFunction(const llvm::Function* func) {
1127  return func->getName() == "query_stub_hoisted_literals" ||
1128  func->getName() == "multifrag_query_hoisted_literals" ||
1129  func->getName() == "query_stub" || func->getName() == "multifrag_query" ||
1130  func->getName() == "fixed_width_int_decode" ||
1131  func->getName() == "fixed_width_unsigned_decode" ||
1132  func->getName() == "diff_fixed_width_int_decode" ||
1133  func->getName() == "fixed_width_double_decode" ||
1134  func->getName() == "fixed_width_float_decode" ||
1135  func->getName() == "fixed_width_small_date_decode" ||
1136  func->getName() == "record_error_code" || func->getName() == "get_error_code" ||
1137  func->getName() == "pos_start_impl" || func->getName() == "pos_step_impl" ||
1138  func->getName() == "group_buff_idx_impl" ||
1139  func->getName() == "init_shared_mem" ||
1140  func->getName() == "init_shared_mem_nop" || func->getName() == "write_back_nop";
1141 }
1142 
1143 llvm::Module* read_template_module(llvm::LLVMContext& context) {
1144  llvm::SMDiagnostic err;
1145 
1146  auto buffer_or_error = llvm::MemoryBuffer::getFile(omnisci::get_root_abs_path() +
1147  "/QueryEngine/RuntimeFunctions.bc");
1148  CHECK(!buffer_or_error.getError()) << "root path=" << omnisci::get_root_abs_path();
1149  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1150 
1151  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1152  CHECK(!owner.takeError());
1153  auto module = owner.get().release();
1154  CHECK(module);
1155 
1156  return module;
1157 }
1158 
1159 #ifdef HAVE_CUDA
1160 llvm::Module* read_libdevice_module(llvm::LLVMContext& context) {
1161  llvm::SMDiagnostic err;
1162  const auto env = get_cuda_home();
1163 
1164  boost::filesystem::path cuda_path{env};
1165  cuda_path /= "nvvm";
1166  cuda_path /= "libdevice";
1167  cuda_path /= "libdevice.10.bc";
1168 
1169  if (!boost::filesystem::exists(cuda_path)) {
1170  LOG(WARNING) << "Could not find CUDA libdevice; support for some UDF "
1171  "functions might not be available.";
1172  return nullptr;
1173  }
1174 
1175  auto buffer_or_error = llvm::MemoryBuffer::getFile(cuda_path.c_str());
1176  CHECK(!buffer_or_error.getError()) << "cuda_path=" << cuda_path.c_str();
1177  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1178 
1179  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1180  CHECK(!owner.takeError());
1181  auto module = owner.get().release();
1182  CHECK(module);
1183 
1184  return module;
1185 }
1186 #endif
1187 
1188 #ifdef ENABLE_GEOS
1189 llvm::Module* read_geos_module(llvm::LLVMContext& context) {
1190  llvm::SMDiagnostic err;
1191 
1192  auto buffer_or_error = llvm::MemoryBuffer::getFile(omnisci::get_root_abs_path() +
1193  "/QueryEngine/GeosRuntime.bc");
1194  CHECK(!buffer_or_error.getError()) << "root path=" << omnisci::get_root_abs_path();
1195  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1196 
1197  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1198  CHECK(!owner.takeError());
1199  auto module = owner.get().release();
1200  CHECK(module);
1201 
1202  return module;
1203 }
1204 #endif
1205 
1206 namespace {
1207 
1208 void bind_pos_placeholders(const std::string& pos_fn_name,
1209  const bool use_resume_param,
1210  llvm::Function* query_func,
1211  llvm::Module* module) {
1212  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
1213  ++it) {
1214  if (!llvm::isa<llvm::CallInst>(*it)) {
1215  continue;
1216  }
1217  auto& pos_call = llvm::cast<llvm::CallInst>(*it);
1218  if (std::string(pos_call.getCalledFunction()->getName()) == pos_fn_name) {
1219  if (use_resume_param) {
1220  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1221  llvm::ReplaceInstWithInst(
1222  &pos_call,
1223  llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl"),
1224  error_code_arg));
1225  } else {
1226  llvm::ReplaceInstWithInst(
1227  &pos_call,
1228  llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl")));
1229  }
1230  break;
1231  }
1232  }
1233 }
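// For example (illustrative call): bind_pos_placeholders("pos_start", true,
// query_func, module) rewrites a call to @pos_start(...) inside query_func
// into @pos_start_impl(i32* %error_code), matching the declarations in
// cuda_rt_decls above.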
1234 
1235 void set_row_func_argnames(llvm::Function* row_func,
1236  const size_t in_col_count,
1237  const size_t agg_col_count,
1238  const bool hoist_literals) {
1239  auto arg_it = row_func->arg_begin();
1240 
1241  if (agg_col_count) {
1242  for (size_t i = 0; i < agg_col_count; ++i) {
1243  arg_it->setName("out");
1244  ++arg_it;
1245  }
1246  } else {
1247  arg_it->setName("group_by_buff");
1248  ++arg_it;
1249  arg_it->setName("crt_matched");
1250  ++arg_it;
1251  arg_it->setName("total_matched");
1252  ++arg_it;
1253  arg_it->setName("old_total_matched");
1254  ++arg_it;
1255  arg_it->setName("max_matched");
1256  ++arg_it;
1257  }
1258 
1259  arg_it->setName("agg_init_val");
1260  ++arg_it;
1261 
1262  arg_it->setName("pos");
1263  ++arg_it;
1264 
1265  arg_it->setName("frag_row_off");
1266  ++arg_it;
1267 
1268  arg_it->setName("num_rows_per_scan");
1269  ++arg_it;
1270 
1271  if (hoist_literals) {
1272  arg_it->setName("literals");
1273  ++arg_it;
1274  }
1275 
1276  for (size_t i = 0; i < in_col_count; ++i) {
1277  arg_it->setName("col_buf" + std::to_string(i));
1278  ++arg_it;
1279  }
1280 
1281  arg_it->setName("join_hash_tables");
1282 }
1283 
1284 llvm::Function* create_row_function(const size_t in_col_count,
1285  const size_t agg_col_count,
1286  const bool hoist_literals,
1287  llvm::Module* module,
1288  llvm::LLVMContext& context) {
1289  std::vector<llvm::Type*> row_process_arg_types;
1290 
1291  if (agg_col_count) {
1292  // output (aggregate) arguments
1293  for (size_t i = 0; i < agg_col_count; ++i) {
1294  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1295  }
1296  } else {
1297  // group by buffer
1298  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1299  // current match count
1300  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1301  // total match count passed from the caller
1302  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1303  // old total match count returned to the caller
1304  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1305  // max matched (total number of slots in the output buffer)
1306  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1307  }
1308 
1309  // aggregate init values
1310  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1311 
1312  // position argument
1313  row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));
1314 
1315  // fragment row offset argument
1316  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1318  // number of rows for each scan
1319  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1320 
1321  // literals buffer argument
1322  if (hoist_literals) {
1323  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
1324  }
1325 
1326  // column buffer arguments
1327  for (size_t i = 0; i < in_col_count; ++i) {
1328  row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
1329  }
1330 
1331  // join hash table argument
1332  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1333 
1334  // generate the function
1335  auto ft =
1336  llvm::FunctionType::get(get_int_type(32, context), row_process_arg_types, false);
1337 
1338  auto row_func =
1339  llvm::Function::Create(ft, llvm::Function::ExternalLinkage, "row_func", module);
1340 
1341  // set the row function argument names; for debugging purposes only
1342  set_row_func_argnames(row_func, in_col_count, agg_col_count, hoist_literals);
1343 
1344  return row_func;
1345 }
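// For illustration only: a call like
//   create_row_function(/*in_col_count=*/2, /*agg_col_count=*/1, /*hoist_literals=*/true, ...)
// yields a declaration roughly equivalent to
//   i32 @row_func(i64* %out, i64* %agg_init_val, i64 %pos, i64* %frag_row_off,
//                 i64* %num_rows_per_scan, i8* %literals, i8* %col_buf0,
//                 i8* %col_buf1, i64* %join_hash_tables)
// with the argument names supplied by set_row_func_argnames above.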
1346 
1347 // Iterate through multifrag_query_func, replacing calls to query_fname with query_func.
1348 void bind_query(llvm::Function* query_func,
1349  const std::string& query_fname,
1350  llvm::Function* multifrag_query_func,
1351  llvm::Module* module) {
1352  std::vector<llvm::CallInst*> query_stubs;
1353  for (auto it = llvm::inst_begin(multifrag_query_func),
1354  e = llvm::inst_end(multifrag_query_func);
1355  it != e;
1356  ++it) {
1357  if (!llvm::isa<llvm::CallInst>(*it)) {
1358  continue;
1359  }
1360  auto& query_call = llvm::cast<llvm::CallInst>(*it);
1361  if (std::string(query_call.getCalledFunction()->getName()) == query_fname) {
1362  query_stubs.push_back(&query_call);
1363  }
1364  }
1365  for (auto& S : query_stubs) {
1366  std::vector<llvm::Value*> args;
1367  for (size_t i = 0; i < S->getNumArgOperands(); ++i) {
1368  args.push_back(S->getArgOperand(i));
1369  }
1370  llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args, ""));
1371  }
1372 }
1373 
1374 std::vector<std::string> get_agg_fnames(const std::vector<Analyzer::Expr*>& target_exprs,
1375  const bool is_group_by) {
1376  std::vector<std::string> result;
1377  for (size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
1378  ++target_idx, ++agg_col_idx) {
1379  const auto target_expr = target_exprs[target_idx];
1380  CHECK(target_expr);
1381  const auto target_type_info = target_expr->get_type_info();
1382  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
1383  const bool is_varlen =
1384  (target_type_info.is_string() &&
1385  target_type_info.get_compression() == kENCODING_NONE) ||
1386  target_type_info.is_array(); // TODO: should it use is_varlen_array() ?
1387  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
1388  result.emplace_back(target_type_info.is_fp() ? "agg_id_double" : "agg_id");
1389  if (is_varlen) {
1390  result.emplace_back("agg_id");
1391  }
1392  if (target_type_info.is_geometry()) {
1393  result.emplace_back("agg_id");
1394  for (auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
1395  result.emplace_back("agg_id");
1396  }
1397  }
1398  continue;
1399  }
1400  const auto agg_type = agg_expr->get_aggtype();
1401  const auto& agg_type_info =
1402  agg_type != kCOUNT ? agg_expr->get_arg()->get_type_info() : target_type_info;
1403  switch (agg_type) {
1404  case kAVG: {
1405  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1406  !agg_type_info.is_fp()) {
1407  throw std::runtime_error("AVG is only valid on integer and floating point");
1408  }
1409  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1410  ? "agg_sum"
1411  : "agg_sum_double");
1412  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1413  ? "agg_count"
1414  : "agg_count_double");
1415  break;
1416  }
1417  case kMIN: {
1418  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1419  agg_type_info.is_geometry()) {
1420  throw std::runtime_error(
1421  "MIN on strings, arrays or geospatial types not supported yet");
1422  }
1423  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1424  ? "agg_min"
1425  : "agg_min_double");
1426  break;
1427  }
1428  case kMAX: {
1429  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1430  agg_type_info.is_geometry()) {
1431  throw std::runtime_error(
1432  "MAX on strings, arrays or geospatial types not supported yet");
1433  }
1434  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1435  ? "agg_max"
1436  : "agg_max_double");
1437  break;
1438  }
1439  case kSUM: {
1440  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1441  !agg_type_info.is_fp()) {
1442  throw std::runtime_error("SUM is only valid on integer and floating point");
1443  }
1444  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1445  ? "agg_sum"
1446  : "agg_sum_double");
1447  break;
1448  }
1449  case kCOUNT:
1450  result.emplace_back(agg_expr->get_is_distinct() ? "agg_count_distinct"
1451  : "agg_count");
1452  break;
1453  case kSINGLE_VALUE: {
1454  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1455  break;
1456  }
1457  case kSAMPLE: {
1458  // Note that varlen SAMPLE arguments are handled separately above
1459  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1460  break;
1461  }
 1462  case kAPPROX_COUNT_DISTINCT:
 1463  result.emplace_back("agg_approximate_count_distinct");
1464  break;
1465  case kAPPROX_MEDIAN:
1466  result.emplace_back("agg_approx_median");
1467  break;
1468  default:
1469  CHECK(false);
1470  }
1471  }
1472  return result;
1473 }
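// For illustration only: for target expressions equivalent to COUNT(*) and AVG(x) over
// an integer column x, get_agg_fnames would return {"agg_count", "agg_sum", "agg_count"},
// since AVG is lowered to a SUM/COUNT pair of aggregate functions.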
1474 
1475 } // namespace
1476 
1477 std::unique_ptr<llvm::Module> g_rt_module(read_template_module(getGlobalLLVMContext()));
1478 
1479 #ifdef ENABLE_GEOS
1480 std::unique_ptr<llvm::Module> g_rt_geos_module(read_geos_module(getGlobalLLVMContext()));
1481 #endif
1482 
1483 #ifdef HAVE_CUDA
1484 std::unique_ptr<llvm::Module> g_rt_libdevice_module(
1485  read_libdevice_module(getGlobalLLVMContext()));
1486 #endif
1487 
1488 bool is_udf_module_present(bool cpu_only) {
1489  return (cpu_only || udf_gpu_module != nullptr) && (udf_cpu_module != nullptr);
1490 }
1491 
1492 bool is_rt_udf_module_present(bool cpu_only) {
1493  return (cpu_only || rt_udf_gpu_module != nullptr) && (rt_udf_cpu_module != nullptr);
1494 }
1495 
1496 void read_udf_gpu_module(const std::string& udf_ir_filename) {
1497  llvm::SMDiagnostic parse_error;
1498 
1499  llvm::StringRef file_name_arg(udf_ir_filename);
1500  udf_gpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
1501 
1502  if (!udf_gpu_module) {
1503  throw_parseIR_error(parse_error, udf_ir_filename, /* is_gpu= */ true);
1504  }
1505 
1506  llvm::Triple gpu_triple(udf_gpu_module->getTargetTriple());
1507  if (!gpu_triple.isNVPTX()) {
1508  LOG(WARNING)
1509  << "Expected triple nvptx64-nvidia-cuda for NVVM IR of loadtime UDFs but got "
1510  << gpu_triple.str() << ". Disabling the NVVM IR module.";
1511  udf_gpu_module = nullptr;
1512  }
1513 }
1514 
1515 void read_udf_cpu_module(const std::string& udf_ir_filename) {
1516  llvm::SMDiagnostic parse_error;
1517 
1518  llvm::StringRef file_name_arg(udf_ir_filename);
1519 
1520  udf_cpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
1521  if (!udf_cpu_module) {
1522  throw_parseIR_error(parse_error, udf_ir_filename);
1523  }
1524 }
1525 
1526 void read_rt_udf_gpu_module(const std::string& udf_ir_string) {
1527  llvm::SMDiagnostic parse_error;
1528 
1529  auto buf =
1530  std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for GPU");
1531 
1532  rt_udf_gpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
1533  if (!rt_udf_gpu_module) {
 1534  LOG(IR) << "read_rt_udf_gpu_module:NVVM IR:\n" << udf_ir_string << "\nEnd of NVVM IR";
1535  throw_parseIR_error(parse_error, "", /* is_gpu= */ true);
1536  }
1537 
1538  llvm::Triple gpu_triple(rt_udf_gpu_module->getTargetTriple());
1539  if (!gpu_triple.isNVPTX()) {
 1540  LOG(IR) << "read_rt_udf_gpu_module:NVVM IR:\n" << udf_ir_string << "\nEnd of NVVM IR";
1541  LOG(WARNING) << "Expected triple nvptx64-nvidia-cuda for NVVM IR but got "
1542  << gpu_triple.str()
1543  << ". Executing runtime UDFs on GPU will be disabled.";
1544  rt_udf_gpu_module = nullptr;
1545  return;
1546  }
1547 }
1548 
1549 void read_rt_udf_cpu_module(const std::string& udf_ir_string) {
1550  llvm::SMDiagnostic parse_error;
1551 
1552  auto buf =
1553  std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for CPU");
1554 
1555  rt_udf_cpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
1556  if (!rt_udf_cpu_module) {
1557  LOG(IR) << "read_rt_udf_cpu_module:LLVM IR:\n" << udf_ir_string << "\nEnd of LLVM IR";
1558  throw_parseIR_error(parse_error);
1559  }
1560 }
1561 
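// Collects the set of functions considered live: the given roots and leaves, the
// shared-memory no-op stubs, and every function directly called from a root. All other
// defined functions in the module are demoted to internal linkage so that subsequent
// optimization passes are free to drop them.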
1562 std::unordered_set<llvm::Function*> CodeGenerator::markDeadRuntimeFuncs(
1563  llvm::Module& module,
1564  const std::vector<llvm::Function*>& roots,
1565  const std::vector<llvm::Function*>& leaves) {
1566  std::unordered_set<llvm::Function*> live_funcs;
1567  live_funcs.insert(roots.begin(), roots.end());
1568  live_funcs.insert(leaves.begin(), leaves.end());
1569 
1570  if (auto F = module.getFunction("init_shared_mem_nop")) {
1571  live_funcs.insert(F);
1572  }
1573  if (auto F = module.getFunction("write_back_nop")) {
1574  live_funcs.insert(F);
1575  }
1576 
1577  for (const llvm::Function* F : roots) {
1578  for (const llvm::BasicBlock& BB : *F) {
1579  for (const llvm::Instruction& I : BB) {
1580  if (const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
1581  live_funcs.insert(CI->getCalledFunction());
1582  }
1583  }
1584  }
1585  }
1586 
1587  for (llvm::Function& F : module) {
1588  if (!live_funcs.count(&F) && !F.isDeclaration()) {
1589  F.setLinkage(llvm::GlobalValue::InternalLinkage);
1590  }
1591  }
1592 
1593  return live_funcs;
1594 }
1595 
1596 namespace {
1597 // searches for a particular variable within a specific basic block (or all if bb_name is
1598 // empty)
1599 template <typename InstType>
1600 llvm::Value* find_variable_in_basic_block(llvm::Function* func,
1601  std::string bb_name,
1602  std::string variable_name) {
1603  llvm::Value* result = nullptr;
1604  if (func == nullptr || variable_name.empty()) {
1605  return result;
1606  }
1607  bool is_found = false;
1608  for (auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
1609  if (!bb_name.empty() && bb_it->getName() != bb_name) {
1610  continue;
1611  }
1612  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
1613  if (llvm::isa<InstType>(*inst_it)) {
1614  if (inst_it->getName() == variable_name) {
1615  result = &*inst_it;
1616  is_found = true;
1617  break;
1618  }
1619  }
1620  }
1621  }
1622  return result;
1623 }
1624 }; // namespace
1625 
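// Splits the basic block around the row_process call, optionally injects a dynamic
// watchdog or runtime interrupt check (the watchdog takes precedence when both are
// requested), and routes detected errors to an ".error_exit" block that records the
// error code via record_error_code before returning.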
 1626 void Executor::createErrorCheckControlFlow(
 1627  llvm::Function* query_func,
1628  bool run_with_dynamic_watchdog,
1629  bool run_with_allowing_runtime_interrupt,
1630  ExecutorDeviceType device_type,
1631  const std::vector<InputTableInfo>& input_table_infos) {
1632  AUTOMATIC_IR_METADATA(cgen_state_.get());
1633 
1634  // check whether the row processing was successful; currently, it can
1635  // fail by running out of group by buffer slots
1636 
1637  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
 1638  // when both the dynamic watchdog and the runtime interrupt are enabled,
 1639  // we use the dynamic watchdog
1640  run_with_allowing_runtime_interrupt = false;
1641  }
1642 
1643  {
1644  // disable injecting query interrupt checker if the session info is invalid
1645  mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
1646  if (current_query_session_.empty()) {
1647  run_with_allowing_runtime_interrupt = false;
1648  }
1649  }
1650 
1651  llvm::Value* row_count = nullptr;
1652  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1653  device_type == ExecutorDeviceType::GPU) {
1654  row_count =
1655  find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
1656  }
1657 
1658  bool done_splitting = false;
1659  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1660  ++bb_it) {
1661  llvm::Value* pos = nullptr;
1662  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1663  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1664  llvm::isa<llvm::PHINode>(*inst_it)) {
1665  if (inst_it->getName() == "pos") {
1666  pos = &*inst_it;
1667  }
1668  continue;
1669  }
1670  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1671  continue;
1672  }
1673  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
1674  if (std::string(row_func_call.getCalledFunction()->getName()) == "row_process") {
1675  auto next_inst_it = inst_it;
1676  ++next_inst_it;
1677  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1678  auto& br_instr = bb_it->back();
1679  llvm::IRBuilder<> ir_builder(&br_instr);
1680  llvm::Value* err_lv = &*inst_it;
1681  llvm::Value* err_lv_returned_from_row_func = nullptr;
1682  if (run_with_dynamic_watchdog) {
1683  CHECK(pos);
1684  llvm::Value* call_watchdog_lv = nullptr;
1685  if (device_type == ExecutorDeviceType::GPU) {
 1686  // In order to make sure all threads within a block see the same barrier,
 1687  // only those blocks in which none of the threads have crossed the critical
 1688  // edge will go through the dynamic watchdog computation
1689  CHECK(row_count);
1690  auto crit_edge_rem =
1691  (blockSize() & (blockSize() - 1))
1692  ? ir_builder.CreateSRem(
1693  row_count,
1694  cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1695  : ir_builder.CreateAnd(
1696  row_count,
1697  cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1698  auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1699  crit_edge_threshold->setName("crit_edge_threshold");
1700 
 1701  // only those threads where pos < crit_edge_threshold go through the dynamic
 1702  // watchdog call
1703  call_watchdog_lv =
1704  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
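 // For illustration only: with blockSize() = 1024 (a power of two, so the bitwise AND
 // path is taken) and row_count = 10000, crit_edge_rem = 10000 & 1023 = 784 and
 // crit_edge_threshold = 9216, so only threads with pos < 9216 evaluate the watchdog.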
1705  } else {
1706  // CPU path: run watchdog for every 64th row
1707  auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1708  call_watchdog_lv = ir_builder.CreateICmp(
1709  llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1710  }
1711  CHECK(call_watchdog_lv);
1712  auto error_check_bb = bb_it->splitBasicBlock(
1713  llvm::BasicBlock::iterator(br_instr), ".error_check");
1714  auto& watchdog_br_instr = bb_it->back();
1715 
1716  auto watchdog_check_bb = llvm::BasicBlock::Create(
1717  cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
1718  llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1719  auto detected_timeout = watchdog_ir_builder.CreateCall(
1720  cgen_state_->module_->getFunction("dynamic_watchdog"), {});
1721  auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1722  detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
1723  watchdog_ir_builder.CreateBr(error_check_bb);
1725  llvm::ReplaceInstWithInst(
1726  &watchdog_br_instr,
1727  llvm::BranchInst::Create(
1728  watchdog_check_bb, error_check_bb, call_watchdog_lv));
1729  ir_builder.SetInsertPoint(&br_instr);
1730  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1731 
1732  unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1733  unified_err_lv->addIncoming(err_lv, &*bb_it);
1734  err_lv = unified_err_lv;
1735  } else if (run_with_allowing_runtime_interrupt) {
1736  CHECK(pos);
1737  llvm::Value* call_check_interrupt_lv = nullptr;
1738  if (device_type == ExecutorDeviceType::GPU) {
 1739  // approximate how many times the %pos variable is incremented, i.e., the
 1740  // number of iterations a thread performs;
 1741  // we derive the # bit shift from the grid/block/fragment sizes because
 1742  // a fixed stride (i.e., checking every 64th increment)
 1743  // could prevent some CUDA threads from entering the interrupt checking block:
 1744  // if an outer table is not sufficiently large, a thread may never reach 64
 1745  // increments and so could never be interrupted
1746  int32_t num_shift_by_gridDim = shared::getExpOfTwo(gridSize());
1747  int32_t num_shift_by_blockDim = shared::getExpOfTwo(blockSize());
1748  int total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
1749  uint64_t interrupt_checking_freq = 32;
1750  auto freq_control_knob = g_running_query_interrupt_freq;
1751  CHECK_GT(freq_control_knob, 0);
1752  CHECK_LE(freq_control_knob, 1.0);
1753  if (!input_table_infos.empty()) {
1754  const auto& outer_table_info = *input_table_infos.begin();
1755  auto num_outer_table_tuples = outer_table_info.info.getNumTuples();
1756  if (outer_table_info.table_id < 0) {
1757  auto* rs = (*outer_table_info.info.fragments.begin()).resultSet;
1758  CHECK(rs);
1759  num_outer_table_tuples = rs->entryCount();
1760  } else {
1761  auto num_frags = outer_table_info.info.fragments.size();
1762  if (num_frags > 0) {
1763  num_outer_table_tuples =
1764  outer_table_info.info.fragments.begin()->getNumTuples();
1765  }
1766  }
1767  if (num_outer_table_tuples > 0) {
 1768  // gridSize * blockSize --> pos_step (stride to the next row per thread)
 1769  // we additionally multiply pos_step by two since the number of
 1770  // dispatched blocks is double the gridSize
 1771  // # tuples (of fragment) / pos_step --> maximum # increments (K)
 1772  // we also scale K by (1 - freq_control_knob) to control the frequency:
 1773  // to check the interrupt status more frequently, make K smaller
1774  auto max_inc = uint64_t(
1775  floor(num_outer_table_tuples / (gridSize() * blockSize() * 2)));
1776  if (max_inc < 2) {
 1777  // `max_inc` is too small, so this correction is necessary to keep
 1778  // `interrupt_checking_freq` valid (i.e., larger than zero)
1779  max_inc = 2;
1780  }
1781  auto calibrated_inc = uint64_t(floor(max_inc * (1 - freq_control_knob)));
1782  interrupt_checking_freq =
1783  uint64_t(pow(2, shared::getExpOfTwo(calibrated_inc)));
 1784  // cover the case where interrupt_checking_freq > K:
 1785  // some threads would then never branch to the interrupt checker,
 1786  // so we fall back to a frequency smaller than, but close to, max_inc
1787  if (interrupt_checking_freq > max_inc) {
1788  interrupt_checking_freq = max_inc / 2;
1789  }
1790  if (interrupt_checking_freq < 8) {
 1791  // such a small freq would check the interrupt status too frequently,
 1792  // so we clamp it to a reasonable minimum value
1793  interrupt_checking_freq = 8;
1794  }
1795  }
1796  }
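// For illustration only (assuming shared::getExpOfTwo(x) returns floor(log2(x))): with
// gridSize() = 64, blockSize() = 1024, 100M outer tuples and freq_control_knob = 0.1,
// pos_step = 64 * 1024 * 2 = 131072, max_inc = floor(1e8 / 131072) = 762,
// calibrated_inc = floor(762 * 0.9) = 685, and interrupt_checking_freq = 2^9 = 512,
// which passes both the "> max_inc" and "< 8" corrections unchanged.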
1797  VLOG(1) << "Set the running query interrupt checking frequency: "
1798  << interrupt_checking_freq;
1799  // check the interrupt flag for every interrupt_checking_freq-th iteration
1800  llvm::Value* pos_shifted_per_iteration =
1801  ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
1802  auto interrupt_predicate =
1803  ir_builder.CreateAnd(pos_shifted_per_iteration, interrupt_checking_freq);
1804  call_check_interrupt_lv =
1805  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1806  interrupt_predicate,
1807  cgen_state_->llInt(int64_t(0LL)));
1808  } else {
1809  // CPU path: run interrupt checker for every 64th row
1810  auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1811  call_check_interrupt_lv =
1812  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1813  interrupt_predicate,
1814  cgen_state_->llInt(int64_t(0LL)));
1815  }
1816  CHECK(call_check_interrupt_lv);
1817  auto error_check_bb = bb_it->splitBasicBlock(
1818  llvm::BasicBlock::iterator(br_instr), ".error_check");
1819  auto& check_interrupt_br_instr = bb_it->back();
1820 
1821  auto interrupt_check_bb = llvm::BasicBlock::Create(
1822  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
1823  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
1824  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
1825  cgen_state_->module_->getFunction("check_interrupt"), {});
1826  auto interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
1827  detected_interrupt, cgen_state_->llInt(Executor::ERR_INTERRUPTED), err_lv);
1828  interrupt_checker_ir_builder.CreateBr(error_check_bb);
1829 
1830  llvm::ReplaceInstWithInst(
1831  &check_interrupt_br_instr,
1832  llvm::BranchInst::Create(
1833  interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
1834  ir_builder.SetInsertPoint(&br_instr);
1835  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1836 
1837  unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
1838  unified_err_lv->addIncoming(err_lv, &*bb_it);
1839  err_lv = unified_err_lv;
1840  }
1841  if (!err_lv_returned_from_row_func) {
1842  err_lv_returned_from_row_func = err_lv;
1843  }
1844  if (device_type == ExecutorDeviceType::GPU && g_enable_dynamic_watchdog) {
1845  // let kernel execution finish as expected, regardless of the observed error,
1846  // unless it is from the dynamic watchdog where all threads within that block
1847  // return together.
1848  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1849  err_lv,
1850  cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
1851  } else {
1852  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
1853  err_lv,
1854  cgen_state_->llInt(static_cast<int32_t>(0)));
1855  }
1856  auto error_bb = llvm::BasicBlock::Create(
1857  cgen_state_->context_, ".error_exit", query_func, new_bb);
1858  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1859  llvm::CallInst::Create(
1860  cgen_state_->module_->getFunction("record_error_code"),
1861  std::vector<llvm::Value*>{err_lv_returned_from_row_func, error_code_arg},
1862  "",
1863  error_bb);
1864  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
1865  llvm::ReplaceInstWithInst(&br_instr,
1866  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
1867  done_splitting = true;
1868  break;
1869  }
1870  }
1871  }
1872  CHECK(done_splitting);
1873 }
1874 
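// Rewrites row_func_ (and filter_func_, when present) into *_hoisted_literals variants
// whose extra trailing arguments carry the literal values loaded in query_func, splices
// the original function bodies over, and replaces the "__placeholder__literal_*"
// instructions with the matching new arguments. Returns the literal load values so the
// caller can append them to the row function call.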
1875 std::vector<llvm::Value*> Executor::inlineHoistedLiterals() {
1876  AUTOMATIC_IR_METADATA(cgen_state_.get());
1877 
1878  std::vector<llvm::Value*> hoisted_literals;
1879 
 1880  // row_func_ is using literals whose defs have been hoisted up to the query_func_;
 1881  // extend the row_func_ signature to include extra args to pass these literal values.
1882  std::vector<llvm::Type*> row_process_arg_types;
1883 
1884  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1885  E = cgen_state_->row_func_->arg_end();
1886  I != E;
1887  ++I) {
1888  row_process_arg_types.push_back(I->getType());
1889  }
1890 
1891  for (auto& element : cgen_state_->query_func_literal_loads_) {
1892  for (auto value : element.second) {
1893  row_process_arg_types.push_back(value->getType());
1894  }
1895  }
1896 
1897  auto ft = llvm::FunctionType::get(
1898  get_int_type(32, cgen_state_->context_), row_process_arg_types, false);
1899  auto row_func_with_hoisted_literals =
1900  llvm::Function::Create(ft,
1901  llvm::Function::ExternalLinkage,
1902  "row_func_hoisted_literals",
1903  cgen_state_->row_func_->getParent());
1904 
1905  auto row_func_arg_it = row_func_with_hoisted_literals->arg_begin();
1906  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1907  E = cgen_state_->row_func_->arg_end();
1908  I != E;
1909  ++I) {
1910  if (I->hasName()) {
1911  row_func_arg_it->setName(I->getName());
1912  }
1913  ++row_func_arg_it;
1914  }
1915 
1916  decltype(row_func_with_hoisted_literals) filter_func_with_hoisted_literals{nullptr};
1917  decltype(row_func_arg_it) filter_func_arg_it{nullptr};
1918  if (cgen_state_->filter_func_) {
 1919  // filter_func_ is using literals whose defs have been hoisted up to the row_func_;
 1920  // extend the filter_func_ signature to include extra args to pass these literal values.
1921  std::vector<llvm::Type*> filter_func_arg_types;
1922 
1923  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
1924  E = cgen_state_->filter_func_->arg_end();
1925  I != E;
1926  ++I) {
1927  filter_func_arg_types.push_back(I->getType());
1928  }
1929 
1930  for (auto& element : cgen_state_->query_func_literal_loads_) {
1931  for (auto value : element.second) {
1932  filter_func_arg_types.push_back(value->getType());
1933  }
1934  }
1935 
1936  auto ft2 = llvm::FunctionType::get(
1937  get_int_type(32, cgen_state_->context_), filter_func_arg_types, false);
1938  filter_func_with_hoisted_literals =
1939  llvm::Function::Create(ft2,
1940  llvm::Function::ExternalLinkage,
1941  "filter_func_hoisted_literals",
1942  cgen_state_->filter_func_->getParent());
1943 
1944  filter_func_arg_it = filter_func_with_hoisted_literals->arg_begin();
1945  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
1946  E = cgen_state_->filter_func_->arg_end();
1947  I != E;
1948  ++I) {
1949  if (I->hasName()) {
1950  filter_func_arg_it->setName(I->getName());
1951  }
1952  ++filter_func_arg_it;
1953  }
1954  }
1955 
1956  std::unordered_map<int, std::vector<llvm::Value*>>
1957  query_func_literal_loads_function_arguments,
1958  query_func_literal_loads_function_arguments2;
1959 
1960  for (auto& element : cgen_state_->query_func_literal_loads_) {
1961  std::vector<llvm::Value*> argument_values, argument_values2;
1962 
1963  for (auto value : element.second) {
1964  hoisted_literals.push_back(value);
1965  argument_values.push_back(&*row_func_arg_it);
1966  if (cgen_state_->filter_func_) {
1967  argument_values2.push_back(&*filter_func_arg_it);
1968  cgen_state_->filter_func_args_[&*row_func_arg_it] = &*filter_func_arg_it;
1969  }
1970  if (value->hasName()) {
1971  row_func_arg_it->setName("arg_" + value->getName());
1972  if (cgen_state_->filter_func_) {
1973  filter_func_arg_it->getContext();
1974  filter_func_arg_it->setName("arg_" + value->getName());
1975  }
1976  }
1977  ++row_func_arg_it;
1978  ++filter_func_arg_it;
1979  }
1980 
1981  query_func_literal_loads_function_arguments[element.first] = argument_values;
1982  query_func_literal_loads_function_arguments2[element.first] = argument_values2;
1983  }
1984 
1985  // copy the row_func function body over
1986  // see
1987  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
1988  row_func_with_hoisted_literals->getBasicBlockList().splice(
1989  row_func_with_hoisted_literals->begin(),
1990  cgen_state_->row_func_->getBasicBlockList());
1991 
1992  // also replace row_func arguments with the arguments from row_func_hoisted_literals
1993  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1994  E = cgen_state_->row_func_->arg_end(),
1995  I2 = row_func_with_hoisted_literals->arg_begin();
1996  I != E;
1997  ++I) {
1998  I->replaceAllUsesWith(&*I2);
1999  I2->takeName(&*I);
2000  cgen_state_->filter_func_args_.replace(&*I, &*I2);
2001  ++I2;
2002  }
2003 
2004  cgen_state_->row_func_ = row_func_with_hoisted_literals;
2005 
2006  // and finally replace literal placeholders
2007  std::vector<llvm::Instruction*> placeholders;
2008  std::string prefix("__placeholder__literal_");
2009  for (auto it = llvm::inst_begin(row_func_with_hoisted_literals),
2010  e = llvm::inst_end(row_func_with_hoisted_literals);
2011  it != e;
2012  ++it) {
2013  if (it->hasName() && it->getName().startswith(prefix)) {
2014  auto offset_and_index_entry =
2015  cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
2016  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2017 
2018  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2019  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2020 
2021  it->replaceAllUsesWith(
2022  query_func_literal_loads_function_arguments[lit_off][lit_idx]);
2023  placeholders.push_back(&*it);
2024  }
2025  }
2026  for (auto placeholder : placeholders) {
2027  placeholder->removeFromParent();
2028  }
2029 
2030  if (cgen_state_->filter_func_) {
2031  // copy the filter_func function body over
2032  // see
2033  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
2034  filter_func_with_hoisted_literals->getBasicBlockList().splice(
2035  filter_func_with_hoisted_literals->begin(),
2036  cgen_state_->filter_func_->getBasicBlockList());
2037 
2038  // also replace filter_func arguments with the arguments from
2039  // filter_func_hoisted_literals
2040  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2041  E = cgen_state_->filter_func_->arg_end(),
2042  I2 = filter_func_with_hoisted_literals->arg_begin();
2043  I != E;
2044  ++I) {
2045  I->replaceAllUsesWith(&*I2);
2046  I2->takeName(&*I);
2047  ++I2;
2048  }
2049 
2050  cgen_state_->filter_func_ = filter_func_with_hoisted_literals;
2051 
2052  // and finally replace literal placeholders
2053  std::vector<llvm::Instruction*> placeholders;
2054  std::string prefix("__placeholder__literal_");
2055  for (auto it = llvm::inst_begin(filter_func_with_hoisted_literals),
2056  e = llvm::inst_end(filter_func_with_hoisted_literals);
2057  it != e;
2058  ++it) {
2059  if (it->hasName() && it->getName().startswith(prefix)) {
2060  auto offset_and_index_entry = cgen_state_->row_func_hoisted_literals_.find(
2061  llvm::dyn_cast<llvm::Value>(&*it));
2062  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2063 
2064  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2065  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2066 
2067  it->replaceAllUsesWith(
2068  query_func_literal_loads_function_arguments2[lit_off][lit_idx]);
2069  placeholders.push_back(&*it);
2070  }
2071  }
2072  for (auto placeholder : placeholders) {
2073  placeholder->removeFromParent();
2074  }
2075  }
2076 
2077  return hoisted_literals;
2078 }
2079 
2080 namespace {
2081 
2082 size_t get_shared_memory_size(const bool shared_mem_used,
2083  const QueryMemoryDescriptor* query_mem_desc_ptr) {
2084  return shared_mem_used
2085  ? (query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount())
2086  : 0;
2087 }
2088 
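// Gates the GPU shared-memory output path. Based on the checks below, it requires GPU
// execution, a non-columnar output layout, Maxwell-or-later shared-memory atomics, a GPU
// block size that covers the output entry count, no count-distinct descriptors, and only
// aggregate targets from a small supported set on non-varlen types; group-by queries
// additionally need a keyless hash layout, no streaming top-N, and an output buffer small
// enough for the per-block shared memory budget.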
2089 bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor* query_mem_desc_ptr,
2090  const RelAlgExecutionUnit& ra_exe_unit,
2091  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
2092  const ExecutorDeviceType device_type,
2093  const unsigned gpu_blocksize,
2094  const unsigned num_blocks_per_mp) {
2095  if (device_type == ExecutorDeviceType::CPU) {
2096  return false;
2097  }
2098  if (query_mem_desc_ptr->didOutputColumnar()) {
2099  return false;
2100  }
2101  CHECK(query_mem_desc_ptr);
2102  CHECK(cuda_mgr);
2103  /*
2104  * We only use shared memory strategy if GPU hardware provides native shared
2105  * memory atomics support. From CUDA Toolkit documentation:
2106  * https://docs.nvidia.com/cuda/pascal-tuning-guide/index.html#atomic-ops "Like
2107  * Maxwell, Pascal [and Volta] provides native shared memory atomic operations
2108  * for 32-bit integer arithmetic, along with native 32 or 64-bit compare-and-swap
2109  * (CAS)."
2110  *
2111  **/
2112  if (!cuda_mgr->isArchMaxwellOrLaterForAll()) {
2113  return false;
2114  }
2115 
2116  if (query_mem_desc_ptr->getQueryDescriptionType() ==
2119  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty()) {
2120  // TODO: relax this, if necessary
2121  if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
2122  return false;
2123  }
 2124  // skip shared memory usage when dealing with 1) variable length targets, or
 2125  // 2) aggregates other than COUNT
2126  const auto target_infos =
2127  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
2128  std::unordered_set<SQLAgg> supported_aggs{kCOUNT};
2129  if (std::find_if(target_infos.begin(),
2130  target_infos.end(),
2131  [&supported_aggs](const TargetInfo& ti) {
2132  if (ti.sql_type.is_varlen() ||
2133  !supported_aggs.count(ti.agg_kind)) {
2134  return true;
2135  } else {
2136  return false;
2137  }
2138  }) == target_infos.end()) {
2139  return true;
2140  }
2141  }
2142  if (query_mem_desc_ptr->getQueryDescriptionType() ==
2153  if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
2154  return false;
2155  }
2156 
2157  // Fundamentally, we should use shared memory whenever the output buffer
2158  // is small enough so that we can fit it in the shared memory and yet expect
2159  // good occupancy.
2160  // For now, we allow keyless, row-wise layout, and only for perfect hash
2161  // group by operations.
2162  if (query_mem_desc_ptr->hasKeylessHash() &&
2163  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty() &&
2164  !query_mem_desc_ptr->useStreamingTopN()) {
2165  const size_t shared_memory_threshold_bytes = std::min(
2167  cuda_mgr->getMinSharedMemoryPerBlockForAllDevices() / num_blocks_per_mp);
2168  const auto output_buffer_size =
2169  query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount();
2170  if (output_buffer_size > shared_memory_threshold_bytes) {
2171  return false;
2172  }
2173 
 2174  // skip shared memory usage when dealing with 1) variable length targets, or
 2175  // 2) aggregates other than the basic ones (COUNT, SUM, MIN, MAX, AVG)
2176  // TODO: relax this if necessary
2177  const auto target_infos =
2178  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
2179  std::unordered_set<SQLAgg> supported_aggs{kCOUNT};
2181  supported_aggs = {kCOUNT, kMIN, kMAX, kSUM, kAVG};
2182  }
2183  if (std::find_if(target_infos.begin(),
2184  target_infos.end(),
2185  [&supported_aggs](const TargetInfo& ti) {
2186  if (ti.sql_type.is_varlen() ||
2187  !supported_aggs.count(ti.agg_kind)) {
2188  return true;
2189  } else {
2190  return false;
2191  }
2192  }) == target_infos.end()) {
2193  return true;
2194  }
2195  }
2196  }
2197  return false;
2198 }
2199 
2200 #ifndef NDEBUG
2201 std::string serialize_llvm_metadata_footnotes(llvm::Function* query_func,
2202  CgenState* cgen_state) {
2203  std::string llvm_ir;
2204  std::unordered_set<llvm::MDNode*> md;
2205 
2206  // Loop over all instructions in the query function.
2207  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
2208  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2209  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2210  instr_it->getAllMetadata(imd);
2211  for (auto [kind, node] : imd) {
2212  md.insert(node);
2213  }
2214  }
2215  }
2216 
2217  // Loop over all instructions in the row function.
2218  for (auto bb_it = cgen_state->row_func_->begin(); bb_it != cgen_state->row_func_->end();
2219  ++bb_it) {
2220  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2221  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2222  instr_it->getAllMetadata(imd);
2223  for (auto [kind, node] : imd) {
2224  md.insert(node);
2225  }
2226  }
2227  }
2228 
2229  // Loop over all instructions in the filter function.
2230  if (cgen_state->filter_func_) {
2231  for (auto bb_it = cgen_state->filter_func_->begin();
2232  bb_it != cgen_state->filter_func_->end();
2233  ++bb_it) {
2234  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2235  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2236  instr_it->getAllMetadata(imd);
2237  for (auto [kind, node] : imd) {
2238  md.insert(node);
2239  }
2240  }
2241  }
2242  }
2243 
2244  // Sort the metadata by canonical number and convert to text.
2245  if (!md.empty()) {
2246  std::map<size_t, std::string> sorted_strings;
2247  for (auto p : md) {
2248  std::string str;
2249  llvm::raw_string_ostream os(str);
2250  p->print(os, cgen_state->module_, true);
2251  os.flush();
2252  auto fields = split(str, {}, 1);
2253  if (fields.empty() || fields[0].empty()) {
2254  continue;
2255  }
2256  sorted_strings.emplace(std::stoul(fields[0].substr(1)), str);
2257  }
2258  llvm_ir += "\n";
2259  for (auto [id, text] : sorted_strings) {
2260  llvm_ir += text;
2261  llvm_ir += "\n";
2262  }
2263  }
2264 
2265  return llvm_ir;
2266 }
2267 #endif // NDEBUG
2268 
2269 } // namespace
2270 
2271 std::tuple<CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
2272 Executor::compileWorkUnit(const std::vector<InputTableInfo>& query_infos,
2273  const PlanState::DeletedColumnsMap& deleted_cols_map,
2274  const RelAlgExecutionUnit& ra_exe_unit,
2275  const CompilationOptions& co,
2276  const ExecutionOptions& eo,
2277  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
2278  const bool allow_lazy_fetch,
2279  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
2280  const size_t max_groups_buffer_entry_guess,
2281  const int8_t crt_min_byte_width,
2282  const bool has_cardinality_estimation,
2283  ColumnCacheMap& column_cache,
2284  RenderInfo* render_info) {
2285  auto timer = DEBUG_TIMER(__func__);
2286 
 2287  if (co.device_type == ExecutorDeviceType::GPU) {
 2288  const auto cuda_mgr = catalog_->getDataMgr().getCudaMgr();
2289  if (!cuda_mgr) {
2290  throw QueryMustRunOnCpu();
2291  }
2292  }
2293 
2294 #ifndef NDEBUG
2295  static std::uint64_t counter = 0;
2296  ++counter;
2297  VLOG(1) << "CODEGEN #" << counter << ":";
2298  LOG(IR) << "CODEGEN #" << counter << ":";
2299  LOG(PTX) << "CODEGEN #" << counter << ":";
2300  LOG(ASM) << "CODEGEN #" << counter << ":";
2301 #endif
2302 
2303  nukeOldState(allow_lazy_fetch, query_infos, deleted_cols_map, &ra_exe_unit);
2304 
2305  addTransientStringLiterals(ra_exe_unit, row_set_mem_owner);
2306 
2307  GroupByAndAggregate group_by_and_aggregate(
2308  this,
2309  co.device_type,
2310  ra_exe_unit,
2311  query_infos,
2312  row_set_mem_owner,
2313  has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
2314  : std::nullopt);
2315  auto query_mem_desc =
2316  group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
2317  max_groups_buffer_entry_guess,
2318  crt_min_byte_width,
2319  render_info,
2321 
2322  if (query_mem_desc->getQueryDescriptionType() ==
2324  !has_cardinality_estimation &&
2325  (!render_info || !render_info->isPotentialInSituRender()) && !eo.just_explain) {
2326  const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
2327  throw CardinalityEstimationRequired(col_range_info.max - col_range_info.min);
2328  }
2329 
2330  const bool output_columnar = query_mem_desc->didOutputColumnar();
2331  const bool gpu_shared_mem_optimization =
 2332  is_gpu_shared_mem_supported(query_mem_desc.get(),
 2333  ra_exe_unit,
2334  cuda_mgr,
2335  co.device_type,
2336  cuda_mgr ? this->blockSize() : 1,
2337  cuda_mgr ? this->numBlocksPerMP() : 1);
2338  if (gpu_shared_mem_optimization) {
2339  // disable interleaved bins optimization on the GPU
2340  query_mem_desc->setHasInterleavedBinsOnGpu(false);
2341  LOG(DEBUG1) << "GPU shared memory is used for the " +
2342  query_mem_desc->queryDescTypeToString() + " query(" +
2343  std::to_string(get_shared_memory_size(gpu_shared_mem_optimization,
2344  query_mem_desc.get())) +
2345  " out of " + std::to_string(g_gpu_smem_threshold) + " bytes).";
2346  }
2347 
2348  const GpuSharedMemoryContext gpu_smem_context(
2349  get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));
2350 
 2351  if (co.device_type == ExecutorDeviceType::GPU) {
 2352  const size_t num_count_distinct_descs =
2353  query_mem_desc->getCountDistinctDescriptorsSize();
2354  for (size_t i = 0; i < num_count_distinct_descs; i++) {
2355  const auto& count_distinct_descriptor =
2356  query_mem_desc->getCountDistinctDescriptor(i);
2357  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::StdSet ||
2358  (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
2359  !co.hoist_literals)) {
2360  throw QueryMustRunOnCpu();
2361  }
2362  }
2363  }
2364 
2365  // Read the module template and target either CPU or GPU
2366  // by binding the stream position functions to the right implementation:
2367  // stride access for GPU, contiguous for CPU
2368  auto rt_module_copy = llvm::CloneModule(
2369  *g_rt_module.get(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
2370  auto func = llvm::dyn_cast<llvm::Function>(gv);
2371  if (!func) {
2372  return true;
2373  }
2374  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
2375  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage ||
2377  });
 2378  if (co.device_type == ExecutorDeviceType::CPU) {
 2379  if (is_udf_module_present(true)) {
2380  CodeGenerator::link_udf_module(udf_cpu_module, *rt_module_copy, cgen_state_.get());
2381  }
2382  if (is_rt_udf_module_present(true)) {
 2383  CodeGenerator::link_udf_module(
 2384  rt_udf_cpu_module, *rt_module_copy, cgen_state_.get());
2385  }
2386  } else {
2387  rt_module_copy->setDataLayout(get_gpu_data_layout());
2388  rt_module_copy->setTargetTriple(get_gpu_target_triple_string());
2389  if (is_udf_module_present()) {
2390  CodeGenerator::link_udf_module(udf_gpu_module, *rt_module_copy, cgen_state_.get());
2391  }
2392  if (is_rt_udf_module_present()) {
 2393  CodeGenerator::link_udf_module(
 2394  rt_udf_gpu_module, *rt_module_copy, cgen_state_.get());
2395  }
2396  }
2397 
2398  cgen_state_->module_ = rt_module_copy.release();
2399  AUTOMATIC_IR_METADATA(cgen_state_.get());
2400 
2401  auto agg_fnames =
2402  get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
2403 
2404  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();
2405 
2406  const bool is_group_by{query_mem_desc->isGroupBy()};
2407  auto [query_func, row_func_call] = is_group_by
2408  ? query_group_by_template(cgen_state_->module_,
2409  co.hoist_literals,
2411  co.device_type,
2412  ra_exe_unit.scan_limit,
2413  gpu_smem_context)
2414  : query_template(cgen_state_->module_,
2415  agg_slot_count,
2416  co.hoist_literals,
2417  !!ra_exe_unit.estimator,
2418  gpu_smem_context);
2419  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
2420  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
2421  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);
2422 
2423  cgen_state_->query_func_ = query_func;
2424  cgen_state_->row_func_call_ = row_func_call;
2425  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
2426  &query_func->getEntryBlock().front());
2427 
2428  // Generate the function signature and column head fetches s.t.
2429  // double indirection isn't needed in the inner loop
2430  auto& fetch_bb = query_func->front();
2431  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
2432  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
2433  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
2434  query_func->args().begin(),
2435  fetch_ir_builder,
2436  cgen_state_->context_);
2437  CHECK_EQ(ra_exe_unit.input_col_descs.size(), col_heads.size());
2438 
2439  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
2440  is_group_by ? 0 : agg_slot_count,
2441  co.hoist_literals,
2442  cgen_state_->module_,
2443  cgen_state_->context_);
2444  CHECK(cgen_state_->row_func_);
2445  cgen_state_->row_func_bb_ =
2446  llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
2447 
2449  auto filter_func_ft =
2450  llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
2451  cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
2452  llvm::Function::ExternalLinkage,
2453  "filter_func",
2454  cgen_state_->module_);
2455  CHECK(cgen_state_->filter_func_);
2456  cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
2457  cgen_state_->context_, "entry", cgen_state_->filter_func_);
2458  }
2459 
2460  cgen_state_->current_func_ = cgen_state_->row_func_;
2461  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2462 
2463  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
2464  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
2465  const auto join_loops =
2466  buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
2467 
2468  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
2469  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
2470  if (is_not_deleted_bb) {
2471  cgen_state_->row_func_bb_ = is_not_deleted_bb;
2472  }
2473  if (!join_loops.empty()) {
2474  codegenJoinLoops(join_loops,
2475  body_execution_unit,
2476  group_by_and_aggregate,
2477  query_func,
2478  cgen_state_->row_func_bb_,
2479  *(query_mem_desc.get()),
2480  co,
2481  eo);
2482  } else {
2483  const bool can_return_error = compileBody(
2484  ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
 2485  if (can_return_error || cgen_state_->needs_error_check_ || eo.with_dynamic_watchdog ||
 2486  eo.allow_runtime_query_interrupt) {
 2487  createErrorCheckControlFlow(query_func,
 2488  eo.with_dynamic_watchdog,
 2489  eo.allow_runtime_query_interrupt,
 2490  co.device_type,
2491  group_by_and_aggregate.query_infos_);
2492  }
2493  }
2494  std::vector<llvm::Value*> hoisted_literals;
2495 
2496  if (co.hoist_literals) {
2497  VLOG(1) << "number of hoisted literals: "
2498  << cgen_state_->query_func_literal_loads_.size()
2499  << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
2500  << " bytes";
2501  }
2502 
2503  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
2504  // we have some hoisted literals...
2505  hoisted_literals = inlineHoistedLiterals();
2506  }
2507 
2508  // replace the row func placeholder call with the call to the actual row func
2509  std::vector<llvm::Value*> row_func_args;
2510  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumArgOperands(); ++i) {
2511  row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
2512  }
2513  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
2514  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
2515  // push hoisted literals arguments, if any
2516  row_func_args.insert(
2517  row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
2518  llvm::ReplaceInstWithInst(
2519  cgen_state_->row_func_call_,
2520  llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));
2521 
2522  // replace the filter func placeholder call with the call to the actual filter func
2523  if (cgen_state_->filter_func_) {
2524  std::vector<llvm::Value*> filter_func_args;
2525  for (auto arg_it = cgen_state_->filter_func_args_.begin();
2526  arg_it != cgen_state_->filter_func_args_.end();
2527  ++arg_it) {
2528  filter_func_args.push_back(arg_it->first);
2529  }
2530  llvm::ReplaceInstWithInst(
2531  cgen_state_->filter_func_call_,
2532  llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
2533  }
2534 
2535  // Aggregate
2536  plan_state_->init_agg_vals_ =
2537  init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);
2538 
2539  /*
2540  * If we have decided to use GPU shared memory (decision is not made here), then
2541  * we generate proper code for extra components that it needs (buffer initialization and
 2542  * gpu reduction from shared memory to global memory). We then inject these functions
 2543  * into the already generated query_func (replacing two placeholders, write_back_nop and
 2544  * init_smem_nop). The rest of the code remains as before (row_func, etc.).
2545  */
2546  if (gpu_smem_context.isSharedMemoryUsed()) {
2547  if (query_mem_desc->getQueryDescriptionType() ==
2549  GpuSharedMemCodeBuilder gpu_smem_code(
2550  cgen_state_->module_,
2551  cgen_state_->context_,
2552  *query_mem_desc,
2554  plan_state_->init_agg_vals_);
2555  gpu_smem_code.codegen();
2556  gpu_smem_code.injectFunctionsInto(query_func);
2557 
2558  // helper functions are used for caching purposes later
2559  cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
2560  cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
2561  LOG(IR) << gpu_smem_code.toString();
2562  }
2563  }
2564 
2565  auto multifrag_query_func = cgen_state_->module_->getFunction(
2566  "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
2567  CHECK(multifrag_query_func);
2568 
2570  insertErrorCodeChecker(
2571  multifrag_query_func, co.hoist_literals, eo.allow_runtime_query_interrupt);
2572  }
2573 
2574  bind_query(query_func,
2575  "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
2576  multifrag_query_func,
2577  cgen_state_->module_);
2578 
2579  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
2580  if (cgen_state_->filter_func_) {
2581  root_funcs.push_back(cgen_state_->filter_func_);
2582  }
2583  auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
2584  *cgen_state_->module_, root_funcs, {multifrag_query_func});
2585 
2586  // Always inline the row function and the filter function.
2587  // We don't want register spills in the inner loops.
2588  // LLVM seems to correctly free up alloca instructions
2589  // in these functions even when they are inlined.
2590  mark_function_always_inline(cgen_state_->row_func_);
2591  if (cgen_state_->filter_func_) {
2592  mark_function_always_inline(cgen_state_->filter_func_);
2593  }
2594 
2595 #ifndef NDEBUG
2596  // Add helpful metadata to the LLVM IR for debugging.
2598 #endif
2599 
2600  // Serialize the important LLVM IR functions to text for SQL EXPLAIN.
2601  std::string llvm_ir;
2602  if (eo.just_explain) {
2604 #ifdef WITH_JIT_DEBUG
2605  throw std::runtime_error(
2606  "Explain optimized not available when JIT runtime debug symbols are enabled");
2607 #else
2608  // Note that we don't run the NVVM reflect pass here. Use LOG(IR) to get the
2609  // optimized IR after NVVM reflect
2610  llvm::legacy::PassManager pass_manager;
2611  optimize_ir(query_func, cgen_state_->module_, pass_manager, live_funcs, co);
2612 #endif // WITH_JIT_DEBUG
2613  }
2614  llvm_ir =
2615  serialize_llvm_object(multifrag_query_func) + serialize_llvm_object(query_func) +
2616  serialize_llvm_object(cgen_state_->row_func_) +
2617  (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2618  : "");
2619 
2620 #ifndef NDEBUG
2621  llvm_ir += serialize_llvm_metadata_footnotes(query_func, cgen_state_.get());
2622 #endif
2623  }
2624 
2625  LOG(IR) << "\n\n" << query_mem_desc->toString() << "\n";
2626  LOG(IR) << "IR for the "
2627  << (co.device_type == ExecutorDeviceType::CPU ? "CPU:\n" : "GPU:\n");
2628 #ifdef NDEBUG
2629  LOG(IR) << serialize_llvm_object(query_func)
2630  << serialize_llvm_object(cgen_state_->row_func_)
2631  << (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2632  : "")
2633  << "\nEnd of IR";
2634 #else
2635  LOG(IR) << serialize_llvm_object(cgen_state_->module_) << "\nEnd of IR";
2636 #endif
2637 
2638  // Run some basic validation checks on the LLVM IR before code is generated below.
2639  verify_function_ir(cgen_state_->row_func_);
2640  if (cgen_state_->filter_func_) {
2641  verify_function_ir(cgen_state_->filter_func_);
2642  }
2643 
2644  // Generate final native code from the LLVM IR.
2645  return std::make_tuple(
 2646  CompilationResult{
 2647  co.device_type == ExecutorDeviceType::CPU
 2648  ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
2649  : optimizeAndCodegenGPU(query_func,
2650  multifrag_query_func,
2651  live_funcs,
2652  is_group_by || ra_exe_unit.estimator,
2653  cuda_mgr,
2654  co),
2655  cgen_state_->getLiterals(),
2656  output_columnar,
2657  llvm_ir,
2658  std::move(gpu_smem_context)},
2659  std::move(query_mem_desc));
2660 }
2661 
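// Inserts an error check after the call to the query stub inside the multifrag query
// function: reads the error code via get_error_code (optionally overridden with
// ERR_INTERRUPTED when check_interrupt fires) and branches to an ".error_exit" block
// that records it through record_error_code.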
2662 void Executor::insertErrorCodeChecker(llvm::Function* query_func,
2663  bool hoist_literals,
2664  bool allow_runtime_query_interrupt) {
2665  auto query_stub_func_name =
2666  "query_stub" + std::string(hoist_literals ? "_hoisted_literals" : "");
2667  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
2668  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
2669  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
2670  continue;
2671  }
2672  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
2673  if (std::string(row_func_call.getCalledFunction()->getName()) ==
2674  query_stub_func_name) {
2675  auto next_inst_it = inst_it;
2676  ++next_inst_it;
2677  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
2678  auto& br_instr = bb_it->back();
2679  llvm::IRBuilder<> ir_builder(&br_instr);
2680  llvm::Value* err_lv = &*inst_it;
2681  auto error_check_bb =
2682  bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr), ".error_check");
2683  llvm::Value* error_code_arg = nullptr;
2684  auto arg_cnt = 0;
2685  for (auto arg_it = query_func->arg_begin(); arg_it != query_func->arg_end();
2686  arg_it++, ++arg_cnt) {
 2687  // the multifrag_query_* func has anonymous arguments, so we use the argument
 2688  // offset explicitly to locate the "error_code" argument in its argument list
2689  if (hoist_literals) {
2690  if (arg_cnt == 9) {
2691  error_code_arg = &*arg_it;
2692  break;
2693  }
2694  } else {
2695  if (arg_cnt == 8) {
2696  error_code_arg = &*arg_it;
2697  break;
2698  }
2699  }
2700  }
2701  CHECK(error_code_arg);
2702  llvm::Value* err_code = nullptr;
2703  if (allow_runtime_query_interrupt) {
 2704  // decide the final error code, taking the interrupt status into account
2705  auto& check_interrupt_br_instr = bb_it->back();
2706  auto interrupt_check_bb = llvm::BasicBlock::Create(
2707  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
2708  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
2709  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
2710  cgen_state_->module_->getFunction("check_interrupt"), {});
2711  auto detected_error = interrupt_checker_ir_builder.CreateCall(
2712  cgen_state_->module_->getFunction("get_error_code"),
2713  std::vector<llvm::Value*>{error_code_arg});
2714  err_code = interrupt_checker_ir_builder.CreateSelect(
2715  detected_interrupt,
2716  cgen_state_->llInt(Executor::ERR_INTERRUPTED),
2717  detected_error);
2718  interrupt_checker_ir_builder.CreateBr(error_check_bb);
2719  llvm::ReplaceInstWithInst(&check_interrupt_br_instr,
2720  llvm::BranchInst::Create(interrupt_check_bb));
2721  ir_builder.SetInsertPoint(&br_instr);
2722  } else {
 2723  // use the error code returned from row_func and skip the interrupt status check
2724  ir_builder.SetInsertPoint(&br_instr);
2725  err_code =
2726  ir_builder.CreateCall(cgen_state_->module_->getFunction("get_error_code"),
2727  std::vector<llvm::Value*>{error_code_arg});
2728  }
2729  err_lv = ir_builder.CreateICmp(
2730  llvm::ICmpInst::ICMP_NE, err_code, cgen_state_->llInt(0));
2731  auto error_bb = llvm::BasicBlock::Create(
2732  cgen_state_->context_, ".error_exit", query_func, new_bb);
2733  llvm::CallInst::Create(cgen_state_->module_->getFunction("record_error_code"),
2734  std::vector<llvm::Value*>{err_code, error_code_arg},
2735  "",
2736  error_bb);
2737  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
2738  llvm::ReplaceInstWithInst(&br_instr,
2739  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
2740  break;
2741  }
2742  }
2743  }
2744 }
2745 
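// When filter_on_deleted_column is enabled and the outer table has a delete column,
// emits a branch that returns 0 from the row function for deleted rows and returns the
// "is_not_deleted" block where code generation continues; returns nullptr when no check
// is needed.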
 2746 llvm::BasicBlock* Executor::codegenSkipDeletedOuterTableRow(
 2747  const RelAlgExecutionUnit& ra_exe_unit,
2748  const CompilationOptions& co) {
2749  AUTOMATIC_IR_METADATA(cgen_state_.get());
2750  if (!co.filter_on_deleted_column) {
2751  return nullptr;
2752  }
2753  CHECK(!ra_exe_unit.input_descs.empty());
2754  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
2755  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
2756  return nullptr;
2757  }
2758  const auto deleted_cd =
2759  plan_state_->getDeletedColForTable(outer_input_desc.getTableId());
2760  if (!deleted_cd) {
2761  return nullptr;
2762  }
2763  CHECK(deleted_cd->columnType.is_boolean());
2764  const auto deleted_expr =
2765  makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
2766  outer_input_desc.getTableId(),
2767  deleted_cd->columnId,
2768  outer_input_desc.getNestLevel());
2769  CodeGenerator code_generator(this);
2770  const auto is_deleted =
2771  code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
2772  const auto is_deleted_bb = llvm::BasicBlock::Create(
2773  cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
2774  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
2775  cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
2776  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
2777  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
2778  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2779  cgen_state_->ir_builder_.SetInsertPoint(bb);
2780  return bb;
2781 }
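 // Editorial sketch (not part of the original source): the guard generated
 // above gives the row function roughly this shape:
 //
 //   %deleted = <i1 result of codegen for the table's $deleted column>
 //   br i1 %deleted, label %is_deleted, label %is_not_deleted
 // is_deleted:
 //   ret i32 0                        ; skip the deleted row, report no error
 // is_not_deleted:
 //   ...                              ; remaining row-function code is emitted here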
2782 
2783 bool Executor::compileBody(const RelAlgExecutionUnit& ra_exe_unit,
2784  GroupByAndAggregate& group_by_and_aggregate,
2785  const QueryMemoryDescriptor& query_mem_desc,
2786  const CompilationOptions& co,
2787  const GpuSharedMemoryContext& gpu_smem_context) {
2788  AUTOMATIC_IR_METADATA(cgen_state_.get());
2789 
2790  // Switch the code generation into a separate filter function if enabled.
2791  // Note that accesses to function arguments are still codegenned from the
2792  // row function's arguments, then later automatically forwarded and
2793  // remapped into filter function arguments by redeclareFilterFunction().
2794  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
2795  llvm::Value* loop_done{nullptr};
2796  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
2797  if (cgen_state_->filter_func_) {
2798  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2799  auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
2800  cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
2801  row_func_entry_bb->begin());
2802  loop_done = cgen_state_->ir_builder_.CreateAlloca(
2803  get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
2804  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2805  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
2806  }
2807  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
2808  cgen_state_->current_func_ = cgen_state_->filter_func_;
2809  fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
2810  }
2811 
2812  // generate the code for the filter
2813  std::vector<Analyzer::Expr*> primary_quals;
2814  std::vector<Analyzer::Expr*> deferred_quals;
2815  bool short_circuited = CodeGenerator::prioritizeQuals(
2816  ra_exe_unit, primary_quals, deferred_quals, plan_state_->hoisted_filters_);
2817  if (short_circuited) {
2818  VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
2819  << "short-circuited and deferred " << std::to_string(deferred_quals.size())
2820  << " quals";
2821  }
2822  llvm::Value* filter_lv = cgen_state_->llBool(true);
2823  CodeGenerator code_generator(this);
2824  for (auto expr : primary_quals) {
2825  // Generate the filter for primary quals
2826  auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
2827  filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
2828  }
2829  CHECK(filter_lv->getType()->isIntegerTy(1));
2830  llvm::BasicBlock* sc_false{nullptr};
2831  if (!deferred_quals.empty()) {
2832  auto sc_true = llvm::BasicBlock::Create(
2833  cgen_state_->context_, "sc_true", cgen_state_->current_func_);
2834  sc_false = llvm::BasicBlock::Create(
2835  cgen_state_->context_, "sc_false", cgen_state_->current_func_);
2836  cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
2837  cgen_state_->ir_builder_.SetInsertPoint(sc_false);
2838  if (ra_exe_unit.join_quals.empty()) {
2839  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
2840  }
2841  cgen_state_->ir_builder_.SetInsertPoint(sc_true);
2842  filter_lv = cgen_state_->llBool(true);
2843  }
2844  for (auto expr : deferred_quals) {
2845  filter_lv = cgen_state_->ir_builder_.CreateAnd(
2846  filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
2847  }
2848 
2849  CHECK(filter_lv->getType()->isIntegerTy(1));
2850  auto ret = group_by_and_aggregate.codegen(
2851  filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);
2852 
2853  // Switch the code generation back to the row function if a filter
2854  // function was enabled.
2855  if (cgen_state_->filter_func_) {
2856  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2857  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
2858  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2859  }
2860 
2861  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2862  cgen_state_->current_func_ = cgen_state_->row_func_;
2863  cgen_state_->filter_func_call_ =
2864  cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});
2865 
2866  // Create the real filter function declaration after the placeholder call
2867  // has been emitted.
2868  redeclareFilterFunction();
2869 
2870  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2871  auto loop_done_true = llvm::BasicBlock::Create(
2872  cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
2873  auto loop_done_false = llvm::BasicBlock::Create(
2874  cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
2875  auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(loop_done);
2876  cgen_state_->ir_builder_.CreateCondBr(
2877  loop_done_flag, loop_done_true, loop_done_false);
2878  cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
2879  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
2880  cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
2881  } else {
2882  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
2883  }
2884  }
2885  return ret;
2886 }
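 // Editorial sketch (not part of the original source): when deferred quals
 // exist and there are no join quals, the filter emitted above is laid out so
 // the cheap primary quals can reject a row before the deferred quals run:
 //
 //   %filter  = <i1 conjunction of the primary quals>
 //   br i1 %filter, label %sc_true, label %sc_false
 // sc_false:
 //   ret i32 0                        ; row rejected cheaply, deferred quals skipped
 // sc_true:
 //   %filter2 = <i1 conjunction of the deferred quals>
 //   ; %filter2 and the sc_false block are handed to GroupByAndAggregate::codegen()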
2887 
2888 std::unique_ptr<llvm::Module> runtime_module_shallow_copy(CgenState* cgen_state) {
2889  return llvm::CloneModule(
2890  *g_rt_module.get(), cgen_state->vmap_, [](const llvm::GlobalValue* gv) {
2891  auto func = llvm::dyn_cast<llvm::Function>(gv);
2892  if (!func) {
2893  return true;
2894  }
2895  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
2896  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage);
2897  });
2898 }
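 // Editorial note (not part of the original source): the third argument to
 // llvm::CloneModule is a ShouldCloneDefinition predicate. Returning true for
 // non-function globals and for private/internal functions copies their bodies
 // into the new module; for every other function only a declaration is cloned,
 // which keeps the per-query module small and lets those definitions be linked
 // in from the shared runtime module later.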
2899 
2900 std::vector<llvm::Value*> generate_column_heads_load(const int num_columns,
2901  llvm::Value* byte_stream_arg,
2902  llvm::IRBuilder<>& ir_builder,
2903  llvm::LLVMContext& ctx) {
2904  CHECK(byte_stream_arg);
2905  const auto max_col_local_id = num_columns - 1;
2906 
2907  std::vector<llvm::Value*> col_heads;
2908  for (int col_id = 0; col_id <= max_col_local_id; ++col_id) {
2909  col_heads.emplace_back(ir_builder.CreateLoad(ir_builder.CreateGEP(
2910  byte_stream_arg, llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), col_id))));
2911  }
2912  return col_heads;
2913 }
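 // Editorial sketch (not part of the original source): assuming byte_stream_arg
 // is the i8** column-buffer array of the query template, a call with
 // num_columns == 2 emits IR along these lines:
 //
 //   %p0   = getelementptr i8*, i8** %byte_stream, i32 0
 //   %col0 = load i8*, i8** %p0
 //   %p1   = getelementptr i8*, i8** %byte_stream, i32 1
 //   %col1 = load i8*, i8** %p1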
2914 