OmniSciDB  bf83d84833
NativeCodegen.cpp
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CodeGenerator.h"
18 #include "Execute.h"
20 #include "GpuSharedMemoryUtils.h"
23 #include "QueryTemplateGenerator.h"
24 
25 #include "CudaMgr/CudaMgr.h"
28 #include "Shared/MathUtils.h"
29 #include "StreamingTopN.h"
30 
31 #if LLVM_VERSION_MAJOR < 9
32 static_assert(false, "LLVM Version >= 9 is required.");
33 #endif
34 
35 #include <llvm/Bitcode/BitcodeReader.h>
36 #include <llvm/Bitcode/BitcodeWriter.h>
37 #include <llvm/ExecutionEngine/MCJIT.h>
38 #include <llvm/IR/Attributes.h>
39 #include <llvm/IR/GlobalValue.h>
40 #include <llvm/IR/InstIterator.h>
41 #include <llvm/IR/IntrinsicInst.h>
42 #include <llvm/IR/Intrinsics.h>
43 #include <llvm/IR/LegacyPassManager.h>
44 #include <llvm/IR/Verifier.h>
45 #include <llvm/IRReader/IRReader.h>
46 #include <llvm/Linker/Linker.h>
47 #include <llvm/Support/Casting.h>
48 #include <llvm/Support/FileSystem.h>
49 #include <llvm/Support/FormattedStream.h>
50 #include <llvm/Support/MemoryBuffer.h>
51 #include <llvm/Support/SourceMgr.h>
52 #include <llvm/Support/TargetRegistry.h>
53 #include <llvm/Support/TargetSelect.h>
54 #include <llvm/Support/raw_os_ostream.h>
55 #include <llvm/Support/raw_ostream.h>
56 #include <llvm/Transforms/IPO.h>
57 #include <llvm/Transforms/IPO/AlwaysInliner.h>
58 #include <llvm/Transforms/IPO/PassManagerBuilder.h>
59 #include <llvm/Transforms/InstCombine/InstCombine.h>
60 #include <llvm/Transforms/Instrumentation.h>
61 #include <llvm/Transforms/Scalar.h>
62 #include <llvm/Transforms/Scalar/InstSimplifyPass.h>
63 #include <llvm/Transforms/Utils.h>
64 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
65 #include <llvm/Transforms/Utils/Cloning.h>
66 
67 #if LLVM_VERSION_MAJOR >= 11
68 #include <llvm/Support/Host.h>
69 #endif
70 
72 
73 std::unique_ptr<llvm::Module> udf_gpu_module;
74 std::unique_ptr<llvm::Module> udf_cpu_module;
75 std::unique_ptr<llvm::Module> rt_udf_gpu_module;
76 std::unique_ptr<llvm::Module> rt_udf_cpu_module;
77 
78 extern std::unique_ptr<llvm::Module> g_rt_module;
79 
80 #ifdef HAVE_CUDA
81 extern std::unique_ptr<llvm::Module> g_rt_libdevice_module;
82 #endif
83 
84 #ifdef ENABLE_GEOS
85 extern std::unique_ptr<llvm::Module> g_rt_geos_module;
86 
87 #include <llvm/Support/DynamicLibrary.h>
88 
89 #ifndef GEOS_LIBRARY_FILENAME
90 #error Configuration should include GEOS library file name
91 #endif
92 std::unique_ptr<std::string> g_libgeos_so_filename(
93  new std::string(GEOS_LIBRARY_FILENAME));
94 static llvm::sys::DynamicLibrary geos_dynamic_library;
95 static std::mutex geos_init_mutex;
96 
97 namespace {
98 
99 void load_geos_dynamic_library() {
100  std::lock_guard<std::mutex> guard(geos_init_mutex);
101 
102  if (!geos_dynamic_library.isValid()) {
103  if (!g_libgeos_so_filename || g_libgeos_so_filename->empty()) {
104  LOG(WARNING) << "Misconfigured GEOS library file name, trying 'libgeos_c.so'";
105  g_libgeos_so_filename.reset(new std::string("libgeos_c.so"));
106  }
107  auto filename = *g_libgeos_so_filename;
108  std::string error_message;
109  geos_dynamic_library =
110  llvm::sys::DynamicLibrary::getPermanentLibrary(filename.c_str(), &error_message);
111  if (!geos_dynamic_library.isValid()) {
112  LOG(ERROR) << "Failed to load GEOS library '" + filename + "'";
113  std::string exception_message = "Failed to load GEOS library: " + error_message;
114  throw std::runtime_error(exception_message.c_str());
115  } else {
116  LOG(INFO) << "Loaded GEOS library '" + filename + "'";
117  }
118  }
119 }
120 
121 } // namespace
122 #endif
123 
124 namespace {
125 
126 void throw_parseIR_error(const llvm::SMDiagnostic& parse_error,
127  std::string src = "",
128  const bool is_gpu = false) {
129  std::string excname = (is_gpu ? "NVVM IR ParseError: " : "LLVM IR ParseError: ");
130  llvm::raw_string_ostream ss(excname);
131  parse_error.print(src.c_str(), ss, false, false);
132  throw ParseIRError(ss.str());
133 }
134 
135 /* SHOW_DEFINED(<llvm::Module instance>) prints the function names
136  that are defined in the given LLVM Module instance.
137 
138  SHOW_FUNCTIONS(<llvm::Module instance>) prints the function names
139  of all used functions in the given LLVM Module
140  instance. Declarations are marked with `[decl]` as a name suffix.
141 
142  Useful for debugging.
143 */
144 
145 #define SHOW_DEFINED(MODULE) \
146  { \
147  std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
148  ::show_defined(MODULE); \
149  }
150 
151 #define SHOW_FUNCTIONS(MODULE) \
152  { \
153  std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
154  ::show_functions(MODULE); \
155  }
156 
157 template <typename T = void>
158 void show_defined(llvm::Module& module) {
159  std::cout << "defines: ";
160  for (auto& f : module.getFunctionList()) {
161  if (!f.isDeclaration()) {
162  std::cout << f.getName().str() << ", ";
163  }
164  }
165  std::cout << std::endl;
166 }
167 
168 template <typename T = void>
169 void show_defined(llvm::Module* module) {
170  if (module == nullptr) {
171  std::cout << "is null" << std::endl;
172  } else {
173  show_defined(*module);
174  }
175 }
176 
177 template <typename T = void>
178 void show_defined(std::unique_ptr<llvm::Module>& module) {
179  show_defined(module.get());
180 }
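// Illustrative usage (editor's sketch, not part of the original file): these
// helpers are intended for ad-hoc debugging from inside codegen routines, e.g.
//
//   SHOW_DEFINED(cgen_state_->module_);   // prints "<func>#<line>: module defines: ..."
//   show_defined(udf_gpu_module);         // also handles a null module pointer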
181 
182 /*
183  scan_function_calls(module, defined, undefined, ignored) computes
184  defined and undefined sets of function names:
185 
186  - defined functions are those that are defined in the given module
187 
188  - undefined functions are those that are called by defined functions
189  but that are not defined in the given module
190 
191  - ignored functions are functions that may be undefined but will not
192  be listed in the set of undefined functions.
193 
194  Useful for debugging.
195 */
196 template <typename T = void>
197 void scan_function_calls(llvm::Function& F,
198  std::unordered_set<std::string>& defined,
199  std::unordered_set<std::string>& undefined,
200  const std::unordered_set<std::string>& ignored) {
201  for (llvm::inst_iterator I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
202  if (auto* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
203  auto* F2 = CI->getCalledFunction();
204  if (F2 != nullptr) {
205  auto F2name = F2->getName().str();
206  if (F2->isDeclaration()) {
207  if (F2name.rfind("__", 0) !=
208  0 // assume symbols with double underscore are defined
209  && F2name.rfind("llvm.", 0) !=
210  0 // TODO: this may give false positive for NVVM intrinsics
211  && ignored.find(F2name) == ignored.end() // not in ignored list
212  ) {
213  undefined.emplace(F2name);
214  }
215  } else {
216  if (defined.find(F2name) == defined.end()) {
217  defined.emplace(F2name);
218  scan_function_calls(*F2, defined, undefined, ignored);
219  }
220  }
221  }
222  }
223  }
224 }
225 
226 template <typename T = void>
227 void scan_function_calls(llvm::Module& module,
228  std::unordered_set<std::string>& defined,
229  std::unordered_set<std::string>& undefined,
230  const std::unordered_set<std::string>& ignored) {
231  for (auto& F : module) {
232  if (!F.isDeclaration()) {
233  scan_function_calls(F, defined, undefined, ignored);
234  }
235  }
236 }
237 
238 template <typename T = void>
239 std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>>
240 scan_function_calls(llvm::Module& module,
241  const std::unordered_set<std::string>& ignored = {}) {
242  std::unordered_set<std::string> defined, undefined;
243  scan_function_calls(module, defined, undefined, ignored);
244  return std::make_tuple(defined, undefined);
245 }
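// Illustrative usage (editor's sketch, not part of the original file): the
// tuple-returning overload can be used to audit a module before linking, e.g.
//
//   std::unordered_set<std::string> defined, undefined;
//   std::tie(defined, undefined) = scan_function_calls(*module);
//   for (const auto& fname : undefined) {
//     VLOG(1) << "undefined function: " << fname;
//   }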
246 
247 #if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
248 void eliminate_dead_self_recursive_funcs(
249  llvm::Module& M,
250  const std::unordered_set<llvm::Function*>& live_funcs) {
251  std::vector<llvm::Function*> dead_funcs;
252  for (auto& F : M) {
253  bool bAlive = false;
254  if (live_funcs.count(&F)) {
255  continue;
256  }
257  for (auto U : F.users()) {
258  auto* C = llvm::dyn_cast<const llvm::CallInst>(U);
259  if (!C || C->getParent()->getParent() != &F) {
260  bAlive = true;
261  break;
262  }
263  }
264  if (!bAlive) {
265  dead_funcs.push_back(&F);
266  }
267  }
268  for (auto pFn : dead_funcs) {
269  pFn->eraseFromParent();
270  }
271 }
272 
273 #ifdef HAVE_CUDA
274 
275 // check if linking with libdevice is required
276 // libdevice functions have a __nv_* prefix
277 bool check_module_requires_libdevice(llvm::Module* module) {
278  for (llvm::Function& F : *module) {
279  if (F.hasName() && F.getName().startswith("__nv_")) {
280  LOG(INFO) << "Module requires linking with libdevice: " << std::string(F.getName());
281  return true;
282  }
283  }
284  LOG(DEBUG1) << "module does not require linking against libdevice";
285  return false;
286 }
287 
288 // Adds the missing intrinsics declarations to the given module
289 void add_intrinsics_to_module(llvm::Module* module) {
290  for (llvm::Function& F : *module) {
291  for (llvm::Instruction& I : instructions(F)) {
292  if (llvm::IntrinsicInst* ii = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
293  if (llvm::Intrinsic::isOverloaded(ii->getIntrinsicID())) {
294  llvm::Type* Tys[] = {ii->getFunctionType()->getReturnType()};
295  llvm::Function& decl_fn =
296  *llvm::Intrinsic::getDeclaration(module, ii->getIntrinsicID(), Tys);
297  ii->setCalledFunction(&decl_fn);
298  } else {
299  // inserts the declaration into the module if not present
300  llvm::Intrinsic::getDeclaration(module, ii->getIntrinsicID());
301  }
302  }
303  }
304  }
305 }
306 
307 #endif
308 
309 void optimize_ir(llvm::Function* query_func,
310  llvm::Module* module,
311  llvm::legacy::PassManager& pass_manager,
312  const std::unordered_set<llvm::Function*>& live_funcs,
313  const CompilationOptions& co) {
314  pass_manager.add(llvm::createAlwaysInlinerLegacyPass());
315  pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
316  pass_manager.add(llvm::createInstSimplifyLegacyPass());
317  pass_manager.add(llvm::createInstructionCombiningPass());
318  pass_manager.add(llvm::createGlobalOptimizerPass());
319 
320  pass_manager.add(llvm::createLICMPass());
321  if (co.opt_level == ExecutorOptLevel::LoopStrengthReduction) {
322  pass_manager.add(llvm::createLoopStrengthReducePass());
323  }
324  pass_manager.run(*module);
325 
326  eliminate_dead_self_recursive_funcs(*module, live_funcs);
327 }
328 #endif
329 
330 } // namespace
331 
333 
334 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine)
335  : execution_engine_(execution_engine) {}
336 
337 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine,
338  const CompilationOptions& co)
339  : execution_engine_(execution_engine) {
340  if (execution_engine_) {
341  if (co.register_intel_jit_listener) {
342 #ifdef ENABLE_INTEL_JIT_LISTENER
343  intel_jit_listener_.reset(llvm::JITEventListener::createIntelJITEventListener());
344  CHECK(intel_jit_listener_);
345  execution_engine_->RegisterJITEventListener(intel_jit_listener_.get());
346  LOG(INFO) << "Registered IntelJITEventListener";
347 #else
348  LOG(WARNING) << "This build is not Intel JIT Listener enabled. Ignoring Intel JIT "
349  "listener configuration parameter.";
350 #endif // ENABLE_INTEL_JIT_LISTENER
351  }
352  }
353 }
354 
355 ExecutionEngineWrapper& ExecutionEngineWrapper::operator=(
356  llvm::ExecutionEngine* execution_engine) {
357  execution_engine_.reset(execution_engine);
358  intel_jit_listener_ = nullptr;
359  return *this;
360 }
361 
362 void verify_function_ir(const llvm::Function* func) {
363  std::stringstream err_ss;
364  llvm::raw_os_ostream err_os(err_ss);
365  err_os << "\n-----\n";
366  if (llvm::verifyFunction(*func, &err_os)) {
367  err_os << "\n-----\n";
368  func->print(err_os, nullptr);
369  err_os << "\n-----\n";
370  LOG(FATAL) << err_ss.str();
371  }
372 }
373 
374 std::shared_ptr<CompilationContext> Executor::getCodeFromCache(const CodeCacheKey& key,
375  const CodeCache& cache) {
376  auto it = cache.find(key);
377  if (it != cache.cend()) {
378  delete cgen_state_->module_;
379  cgen_state_->module_ = it->second.second;
380  return it->second.first;
381  }
382  return {};
383 }
384 
385 void Executor::addCodeToCache(const CodeCacheKey& key,
386  std::shared_ptr<CompilationContext> compilation_context,
387  llvm::Module* module,
388  CodeCache& cache) {
389  cache.put(key,
390  std::make_pair<std::shared_ptr<CompilationContext>, decltype(module)>(
391  std::move(compilation_context), std::move(module)));
392 }
393 
394 namespace {
395 
396 std::string assemblyForCPU(ExecutionEngineWrapper& execution_engine,
397  llvm::Module* module) {
398  llvm::legacy::PassManager pass_manager;
399  auto cpu_target_machine = execution_engine->getTargetMachine();
400  CHECK(cpu_target_machine);
401  llvm::SmallString<256> code_str;
402  llvm::raw_svector_ostream os(code_str);
403 #if LLVM_VERSION_MAJOR >= 10
404  cpu_target_machine->addPassesToEmitFile(
405  pass_manager, os, nullptr, llvm::CGFT_AssemblyFile);
406 #else
407  cpu_target_machine->addPassesToEmitFile(
408  pass_manager, os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
409 #endif
410  pass_manager.run(*module);
411  return "Assembly for the CPU:\n" + std::string(code_str.str()) + "\nEnd of assembly";
412 }
413 
414 } // namespace
415 
416 ExecutionEngineWrapper CodeGenerator::generateNativeCPUCode(
417  llvm::Function* func,
418  const std::unordered_set<llvm::Function*>& live_funcs,
419  const CompilationOptions& co) {
420  auto module = func->getParent();
421  // run optimizations
422 #ifndef WITH_JIT_DEBUG
423  llvm::legacy::PassManager pass_manager;
424  optimize_ir(func, module, pass_manager, live_funcs, co);
425 #endif // WITH_JIT_DEBUG
426 
427  auto init_err = llvm::InitializeNativeTarget();
428  CHECK(!init_err);
429 
430  llvm::InitializeAllTargetMCs();
431  llvm::InitializeNativeTargetAsmPrinter();
432  llvm::InitializeNativeTargetAsmParser();
433 
434  std::string err_str;
435  std::unique_ptr<llvm::Module> owner(module);
436  llvm::EngineBuilder eb(std::move(owner));
437  eb.setErrorStr(&err_str);
438  eb.setEngineKind(llvm::EngineKind::JIT);
439  llvm::TargetOptions to;
440  to.EnableFastISel = true;
441  eb.setTargetOptions(to);
442  if (co.opt_level == ExecutorOptLevel::ReductionJIT) {
443  eb.setOptLevel(llvm::CodeGenOpt::None);
444  }
445 
446  ExecutionEngineWrapper execution_engine(eb.create(), co);
447  CHECK(execution_engine.get());
448  LOG(ASM) << assemblyForCPU(execution_engine, module);
449 
450  execution_engine->finalizeObject();
451  return execution_engine;
452 }
453 
454 std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenCPU(
455  llvm::Function* query_func,
456  llvm::Function* multifrag_query_func,
457  const std::unordered_set<llvm::Function*>& live_funcs,
458  const CompilationOptions& co) {
459  auto module = multifrag_query_func->getParent();
460  CodeCacheKey key{serialize_llvm_object(query_func),
461  serialize_llvm_object(cgen_state_->row_func_)};
462  if (cgen_state_->filter_func_) {
463  key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
464  }
465  for (const auto helper : cgen_state_->helper_functions_) {
466  key.push_back(serialize_llvm_object(helper));
467  }
468  auto cached_code = getCodeFromCache(key, cpu_code_cache_);
469  if (cached_code) {
470  return cached_code;
471  }
472 
473  if (cgen_state_->needs_geos_) {
474 #ifdef ENABLE_GEOS
475  load_geos_dynamic_library();
476 
477  // Read geos runtime module and bind GEOS API function references to GEOS library
478  auto rt_geos_module_copy = llvm::CloneModule(
479  *g_rt_geos_module.get(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
480  auto func = llvm::dyn_cast<llvm::Function>(gv);
481  if (!func) {
482  return true;
483  }
484  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
485  func->getLinkage() ==
486  llvm::GlobalValue::LinkageTypes::InternalLinkage ||
487  func->getLinkage() == llvm::GlobalValue::LinkageTypes::ExternalLinkage);
488  });
489  CodeGenerator::link_udf_module(rt_geos_module_copy,
490  *module,
491  cgen_state_.get(),
492  llvm::Linker::Flags::LinkOnlyNeeded);
493 #else
494  throw std::runtime_error("GEOS is disabled in this build");
495 #endif
496  }
497 
498  auto execution_engine =
499  CodeGenerator::generateNativeCPUCode(query_func, live_funcs, co);
500  auto cpu_compilation_context =
501  std::make_shared<CpuCompilationContext>(std::move(execution_engine));
502  cpu_compilation_context->setFunctionPointer(multifrag_query_func);
503  addCodeToCache(key, cpu_compilation_context, module, cpu_code_cache_);
504  return cpu_compilation_context;
505 }
506 
507 void CodeGenerator::link_udf_module(const std::unique_ptr<llvm::Module>& udf_module,
508  llvm::Module& module,
509  CgenState* cgen_state,
510  llvm::Linker::Flags flags) {
511  // Throw a runtime error if the target module contains functions
512  // with the same names as functions in the UDF module.
513  for (auto& f : *udf_module.get()) {
514  auto func = module.getFunction(f.getName());
515  if (!(func == nullptr) && !f.isDeclaration() && flags == llvm::Linker::Flags::None) {
516  LOG(ERROR) << " Attempt to overwrite " << f.getName().str() << " in "
517  << module.getModuleIdentifier() << " from `"
518  << udf_module->getModuleIdentifier() << "`" << std::endl;
519  throw std::runtime_error(
520  "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
521  "function ***");
522  } else {
523  VLOG(1) << " Adding " << f.getName().str() << " to "
524  << module.getModuleIdentifier() << " from `"
525  << udf_module->getModuleIdentifier() << "`" << std::endl;
526  }
527  }
528 
529  std::unique_ptr<llvm::Module> udf_module_copy;
530 
531  udf_module_copy = llvm::CloneModule(*udf_module.get(), cgen_state->vmap_);
532 
533  udf_module_copy->setDataLayout(module.getDataLayout());
534  udf_module_copy->setTargetTriple(module.getTargetTriple());
535 
536  // Initialize linker with module for RuntimeFunctions.bc
537  llvm::Linker ld(module);
538  bool link_error = false;
539 
540  link_error = ld.linkInModule(std::move(udf_module_copy), flags);
541 
542  if (link_error) {
543  throw std::runtime_error("link_udf_module: *** error linking module ***");
544  }
545 }
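// Illustrative call site (editor's sketch): this mirrors how the GEOS runtime
// module is pulled into the query module elsewhere in this file, importing only
// the symbols that are actually referenced:
//
//   CodeGenerator::link_udf_module(rt_geos_module_copy,
//                                  *module,
//                                  cgen_state_.get(),
//                                  llvm::Linker::Flags::LinkOnlyNeeded);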
546 
547 namespace {
548 
549 std::string cpp_to_llvm_name(const std::string& s) {
550  if (s == "int8_t") {
551  return "i8";
552  }
553  if (s == "int16_t") {
554  return "i16";
555  }
556  if (s == "int32_t") {
557  return "i32";
558  }
559  if (s == "int64_t") {
560  return "i64";
561  }
562  CHECK(s == "float" || s == "double");
563  return s;
564 }
565 
566 std::string gen_array_any_all_sigs() {
567  std::string result;
568  for (const std::string any_or_all : {"any", "all"}) {
569  for (const std::string elem_type :
570  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
571  for (const std::string needle_type :
572  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
573  for (const std::string op_name : {"eq", "ne", "lt", "le", "gt", "ge"}) {
574  result += ("declare i1 @array_" + any_or_all + "_" + op_name + "_" + elem_type +
575  "_" + needle_type + "(i8*, i64, " + cpp_to_llvm_name(needle_type) +
576  ", " + cpp_to_llvm_name(elem_type) + ");\n");
577  }
578  }
579  }
580  }
581  return result;
582 }
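// For instance, one of the declarations generated by the loops above reads
// (shown here for illustration only):
//   declare i1 @array_any_eq_int32_t_double(i8*, i64, double, i32);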
583 
584 std::string gen_translate_null_key_sigs() {
585  std::string result;
586  for (const std::string key_type : {"int8_t", "int16_t", "int32_t", "int64_t"}) {
587  const auto key_llvm_type = cpp_to_llvm_name(key_type);
588  result += "declare i64 @translate_null_key_" + key_type + "(" + key_llvm_type + ", " +
589  key_llvm_type + ", i64);\n";
590  }
591  return result;
592 }
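// For instance (illustration only):
//   declare i64 @translate_null_key_int32_t(i32, i32, i64);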
593 
594 const std::string cuda_rt_decls =
595  R"( declare void @llvm.dbg.declare(metadata, metadata, metadata) declare void @llvm.dbg.value(metadata, metadata, metadata) declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind declare i64 @get_thread_index(); declare i64 @get_block_index(); declare i32 @pos_start_impl(i32*); declare i32 @group_buff_idx_impl(); declare i32 @pos_step_impl(); declare i8 @thread_warp_idx(i8); declare i64* @init_shared_mem(i64*, i32); declare i64* @init_shared_mem_nop(i64*, i32); declare i64* @declare_dynamic_shared_memory(); declare void @write_back_nop(i64*, i64*, i32); declare void @write_back_non_grouped_agg(i64*, i64*, i32); declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8); declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32); declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32); declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32); declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32); declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32); declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32); declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64); declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64); declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64); declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64); declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64); declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double); declare i64 @get_bucket_key_for_range_double(i8*, i64, double); declare i32 @get_num_buckets_for_bounds(i8*, i32, double, double); declare i64 @get_candidate_rows(i32*, i32, i8*, i32, double, double, i32, i64, i64*, i64, i64, i64); declare i64 @agg_count_shared(i64*, i64); declare i64 @agg_count_skip_val_shared(i64*, i64, i64); declare i32 @agg_count_int32_shared(i32*, i32); declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32); declare i64 @agg_count_double_shared(i64*, double); declare i64 @agg_count_double_skip_val_shared(i64*, double, double); declare i32 @agg_count_float_shared(i32*, float); declare i32 @agg_count_float_skip_val_shared(i32*, float, float); declare i64 @agg_sum_shared(i64*, i64); declare i64 @agg_sum_skip_val_shared(i64*, i64, i64); declare i32 @agg_sum_int32_shared(i32*, i32); declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32); declare void @agg_sum_double_shared(i64*, double); declare void @agg_sum_double_skip_val_shared(i64*, double, double); declare void @agg_sum_float_shared(i32*, float); declare void @agg_sum_float_skip_val_shared(i32*, float, float); declare void @agg_max_shared(i64*, i64); declare void @agg_max_skip_val_shared(i64*, i64, i64); declare void @agg_max_int32_shared(i32*, i32); declare void @agg_max_int32_skip_val_shared(i32*, i32, i32); declare void @agg_max_int16_shared(i16*, i16); declare void @agg_max_int16_skip_val_shared(i16*, i16, i16); declare void @agg_max_int8_shared(i8*, i8); declare void @agg_max_int8_skip_val_shared(i8*, i8, i8); declare void @agg_max_double_shared(i64*, double); declare void @agg_max_double_skip_val_shared(i64*, double, double); declare void @agg_max_float_shared(i32*, float); declare void @agg_max_float_skip_val_shared(i32*, float, float); declare void @agg_min_shared(i64*, i64); declare void 
@agg_min_skip_val_shared(i64*, i64, i64); declare void @agg_min_int32_shared(i32*, i32); declare void @agg_min_int32_skip_val_shared(i32*, i32, i32); declare void @agg_min_int16_shared(i16*, i16); declare void @agg_min_int16_skip_val_shared(i16*, i16, i16); declare void @agg_min_int8_shared(i8*, i8); declare void @agg_min_int8_skip_val_shared(i8*, i8, i8); declare void @agg_min_double_shared(i64*, double); declare void @agg_min_double_skip_val_shared(i64*, double, double); declare void @agg_min_float_shared(i32*, float); declare void @agg_min_float_skip_val_shared(i32*, float, float); declare void @agg_id_shared(i64*, i64); declare void @agg_id_int32_shared(i32*, i32); declare void @agg_id_int16_shared(i16*, i16); declare void @agg_id_int8_shared(i8*, i8); declare void @agg_id_double_shared(i64*, double); declare void @agg_id_double_shared_slow(i64*, double*); declare void @agg_id_float_shared(i32*, float); declare i32 @checked_single_agg_id_shared(i64*, i64, i64); declare i32 @checked_single_agg_id_double_shared(i64*, double, double); declare i32 @checked_single_agg_id_double_shared_slow(i64*, double*, double); declare i32 @checked_single_agg_id_float_shared(i32*, float, float); declare i1 @slotEmptyKeyCAS(i64*, i64, i64); declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32); declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16); declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8); declare i64 @datetrunc_century(i64); declare i64 @datetrunc_day(i64); declare i64 @datetrunc_decade(i64); declare i64 @datetrunc_hour(i64); declare i64 @datetrunc_millennium(i64); declare i64 @datetrunc_minute(i64); declare i64 @datetrunc_month(i64); declare i64 @datetrunc_quarter(i64); declare i64 @datetrunc_quarterday(i64); declare i64 @datetrunc_week_monday(i64); declare i64 @datetrunc_week_sunday(i64); declare i64 @datetrunc_week_saturday(i64); declare i64 @datetrunc_year(i64); declare i64 @extract_epoch(i64); declare i64 @extract_dateepoch(i64); declare i64 @extract_quarterday(i64); declare i64 @extract_hour(i64); declare i64 @extract_minute(i64); declare i64 @extract_second(i64); declare i64 @extract_millisecond(i64); declare i64 @extract_microsecond(i64); declare i64 @extract_nanosecond(i64); declare i64 @extract_dow(i64); declare i64 @extract_isodow(i64); declare i64 @extract_day(i64); declare i64 @extract_week_monday(i64); declare i64 @extract_week_sunday(i64); declare i64 @extract_week_saturday(i64); declare i64 @extract_day_of_year(i64); declare i64 @extract_month(i64); declare i64 @extract_quarter(i64); declare i64 @extract_year(i64); declare i64 @DateTruncateHighPrecisionToDate(i64, i64); declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64); declare i64 @DateDiff(i32, i64, i64); declare i64 @DateDiffNullable(i32, i64, i64, i64); declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i32); declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i32, i64); declare i64 @DateAdd(i32, i64, i64); declare i64 @DateAddNullable(i32, i64, i64, i64); declare i64 @DateAddHighPrecision(i32, i64, i64, i32); declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i32, i64); declare i64 @string_decode(i8*, i64); declare i32 @array_size(i8*, i64, i32); declare i32 @array_size_nullable(i8*, i64, i32, i32); declare i32 @fast_fixlen_array_size(i8*, i32); declare i1 @array_is_null(i8*, i64); declare i1 @point_coord_array_is_null(i8*, i64); declare i8* @array_buff(i8*, i64); declare i8* @fast_fixlen_array_buff(i8*, i64); declare i8 @array_at_int8_t(i8*, i64, i32); declare i16 @array_at_int16_t(i8*, i64, i32); 
declare i32 @array_at_int32_t(i8*, i64, i32); declare i64 @array_at_int64_t(i8*, i64, i32); declare float @array_at_float(i8*, i64, i32); declare double @array_at_double(i8*, i64, i32); declare i8 @varlen_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_array_at_int64_t(i8*, i64, i32); declare float @varlen_array_at_float(i8*, i64, i32); declare double @varlen_array_at_double(i8*, i64, i32); declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32); declare float @varlen_notnull_array_at_float(i8*, i64, i32); declare double @varlen_notnull_array_at_double(i8*, i64, i32); declare i8 @array_at_int8_t_checked(i8*, i64, i64, i8); declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16); declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32); declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64); declare float @array_at_float_checked(i8*, i64, i64, float); declare double @array_at_double_checked(i8*, i64, i64, double); declare i32 @char_length(i8*, i32); declare i32 @char_length_nullable(i8*, i32, i32); declare i32 @char_length_encoded(i8*, i32); declare i32 @char_length_encoded_nullable(i8*, i32, i32); declare i32 @key_for_string_encoded(i32); declare i1 @sample_ratio(double, i64); declare i1 @string_like(i8*, i32, i8*, i32, i8); declare i1 @string_ilike(i8*, i32, i8*, i32, i8); declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8); declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8); declare i1 @string_like_simple(i8*, i32, i8*, i32); declare i1 @string_ilike_simple(i8*, i32, i8*, i32); declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8); declare i1 @string_lt(i8*, i32, i8*, i32); declare i1 @string_le(i8*, i32, i8*, i32); declare i1 @string_gt(i8*, i32, i8*, i32); declare i1 @string_ge(i8*, i32, i8*, i32); declare i1 @string_eq(i8*, i32, i8*, i32); declare i1 @string_ne(i8*, i32, i8*, i32); declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8); declare i1 @regexp_like(i8*, i32, i8*, i32, i8); declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8); declare void @linear_probabilistic_count(i8*, i32, i8*, i32); declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, i64); declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64); declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64); declare void @agg_approx_median(i64*, double); declare void @agg_approx_median_skip_val(i64*, double, i64); declare void @record_error_code(i32, i32*); declare i32 @get_error_code(i32*); declare i1 @dynamic_watchdog(); declare i1 @check_interrupt(); declare void @force_sync(); declare void @sync_warp(); declare void @sync_warp_protected(i64, i64); declare void @sync_threadblock(); declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32); declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64); declare i64* 
@get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, float, float); declare i64* @get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double); )" + gen_array_any_all_sigs() +
596  gen_translate_null_key_sigs();
597 
598 #ifdef HAVE_CUDA
599 std::string extension_function_decls(const std::unordered_set<std::string>& udf_decls) {
600  const auto decls = ExtensionFunctionsWhitelist::getLLVMDeclarations(udf_decls);
601  return boost::algorithm::join(decls, "\n");
602 }
603 
604 void legalize_nvvm_ir(llvm::Function* query_func) {
605  // optimizations might add attributes to the function
606  // and NVPTX doesn't understand all of them; play it
607  // safe and clear all attributes
608  clear_function_attributes(query_func);
609  verify_function_ir(query_func);
610 
611  std::vector<llvm::Instruction*> stackrestore_intrinsics;
612  std::vector<llvm::Instruction*> stacksave_intrinsics;
613  for (auto& BB : *query_func) {
614  for (llvm::Instruction& I : BB) {
615  if (const llvm::IntrinsicInst* II = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
616  if (II->getIntrinsicID() == llvm::Intrinsic::stacksave) {
617  stacksave_intrinsics.push_back(&I);
618  } else if (II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
619  stackrestore_intrinsics.push_back(&I);
620  }
621  }
622  }
623  }
624 
625  // stacksave and stackrestore intrinsics appear together, and
626  // stackrestore uses stacksave's result as its argument,
627  // so it should be removed first.
628  for (auto& II : stackrestore_intrinsics) {
629  II->eraseFromParent();
630  }
631  for (auto& II : stacksave_intrinsics) {
632  II->eraseFromParent();
633  }
634 }
635 #endif // HAVE_CUDA
636 
637 } // namespace
638 
639 llvm::StringRef get_gpu_target_triple_string() {
640  return llvm::StringRef("nvptx64-nvidia-cuda");
641 }
642 
643 llvm::StringRef get_gpu_data_layout() {
644  return llvm::StringRef(
645  "e-p:64:64:64-i1:8:8-i8:8:8-"
646  "i16:16:16-i32:32:32-i64:64:64-"
647  "f32:32:32-f64:64:64-v16:16:16-"
648  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
649 }
650 
651 std::map<std::string, std::string> get_device_parameters(bool cpu_only) {
652  std::map<std::string, std::string> result;
653 
654  result.insert(std::make_pair("cpu_name", llvm::sys::getHostCPUName()));
655  result.insert(std::make_pair("cpu_triple", llvm::sys::getProcessTriple()));
656  result.insert(
657  std::make_pair("cpu_cores", std::to_string(llvm::sys::getHostNumPhysicalCores())));
658  result.insert(std::make_pair("cpu_threads", std::to_string(cpu_threads())));
659 
660  std::string null_values;
661  null_values += "boolean1:" + std::to_string(serialized_null_value<bool>()) + ";";
662  null_values += "boolean8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
663  null_values += "int8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
664  null_values += "int16:" + std::to_string(serialized_null_value<int16_t>()) + ";";
665  null_values += "int32:" + std::to_string(serialized_null_value<int32_t>()) + ";";
666  null_values += "int64:" + std::to_string(serialized_null_value<int64_t>()) + ";";
667  null_values += "uint8:" + std::to_string(serialized_null_value<uint8_t>()) + ";";
668  null_values += "uint16:" + std::to_string(serialized_null_value<uint16_t>()) + ";";
669  null_values += "uint32:" + std::to_string(serialized_null_value<uint32_t>()) + ";";
670  null_values += "uint64:" + std::to_string(serialized_null_value<uint64_t>()) + ";";
671  null_values += "float32:" + std::to_string(serialized_null_value<float>()) + ";";
672  null_values += "float64:" + std::to_string(serialized_null_value<double>()) + ";";
673  null_values +=
674  "Array<boolean8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
675  null_values +=
676  "Array<int8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
677  null_values +=
678  "Array<int16>:" + std::to_string(serialized_null_value<int16_t, true>()) + ";";
679  null_values +=
680  "Array<int32>:" + std::to_string(serialized_null_value<int32_t, true>()) + ";";
681  null_values +=
682  "Array<int64>:" + std::to_string(serialized_null_value<int64_t, true>()) + ";";
683  null_values +=
684  "Array<float32>:" + std::to_string(serialized_null_value<float, true>()) + ";";
685  null_values +=
686  "Array<float64>:" + std::to_string(serialized_null_value<double, true>()) + ";";
687 
688  result.insert(std::make_pair("null_values", null_values));
689 
690  llvm::StringMap<bool> cpu_features;
691  if (llvm::sys::getHostCPUFeatures(cpu_features)) {
692  std::string features_str = "";
693  for (auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
694  features_str += (it->getValue() ? " +" : " -");
695  features_str += it->getKey().str();
696  }
697  result.insert(std::make_pair("cpu_features", features_str));
698  }
699 
700  result.insert(std::make_pair("llvm_version",
701  std::to_string(LLVM_VERSION_MAJOR) + "." +
702  std::to_string(LLVM_VERSION_MINOR) + "." +
703  std::to_string(LLVM_VERSION_PATCH)));
704 
705 #ifdef HAVE_CUDA
706  if (!cpu_only) {
707  int device_count = 0;
708  checkCudaErrors(cuDeviceGetCount(&device_count));
709  if (device_count) {
710  CUdevice device{};
711  char device_name[256];
712  int major = 0, minor = 0;
713  int driver_version;
714  checkCudaErrors(cuDeviceGet(&device, 0)); // assuming homogeneous multi-GPU system
715  checkCudaErrors(cuDeviceGetName(device_name, 256, device));
716  checkCudaErrors(cuDeviceGetAttribute(
717  &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
718  checkCudaErrors(cuDeviceGetAttribute(
719  &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
720  checkCudaErrors(cuDriverGetVersion(&driver_version));
721 
722  result.insert(std::make_pair("gpu_name", device_name));
723  result.insert(std::make_pair("gpu_count", std::to_string(device_count)));
724  result.insert(std::make_pair("gpu_compute_capability",
725  std::to_string(major) + "." + std::to_string(minor)));
726  result.insert(std::make_pair("gpu_triple", get_gpu_target_triple_string()));
727  result.insert(std::make_pair("gpu_datalayout", get_gpu_data_layout()));
728  result.insert(std::make_pair("gpu_driver",
729  "CUDA " + std::to_string(driver_version / 1000) + "." +
730  std::to_string((driver_version % 1000) / 10)));
731  }
732  }
733 #endif
734 
735  return result;
736 }
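// Example entries in the returned map (illustrative values only; the actual
// strings depend on the host and, when CUDA is enabled, on the installed
// devices and driver):
//   "cpu_name"               -> "skylake-avx512"
//   "llvm_version"           -> "9.0.1"
//   "gpu_compute_capability" -> "7.5"
//   "gpu_triple"             -> "nvptx64-nvidia-cuda"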
737 
738 std::shared_ptr<GpuCompilationContext> CodeGenerator::generateNativeGPUCode(
739  llvm::Function* func,
740  llvm::Function* wrapper_func,
741  const std::unordered_set<llvm::Function*>& live_funcs,
742  const CompilationOptions& co,
743  const GPUTarget& gpu_target) {
744 #ifdef HAVE_CUDA
745  auto module = func->getParent();
746  /*
747  `func` is one of the following generated functions:
748  - `call_table_function(i8** %input_col_buffers, i64*
749  %input_row_count, i64** %output_buffers, i64* %output_row_count)`
750  that wraps the user-defined table function.
751  - `multifrag_query`
752  - `multifrag_query_hoisted_literals`
753  - ...
754 
755  `wrapper_func` is table_func_kernel(i32*, i8**, i64*, i64**,
756  i64*) that wraps `call_table_function`.
757 
758  `module` is from `build/QueryEngine/RuntimeFunctions.bc` and it
759  contains `func` and `wrapper_func`. `module` should also contain
760  the definitions of user-defined table functions.
761 
762  `live_funcs` contains table_func_kernel and call_table_function
763 
764  `gpu_target.cgen_state->module_` appears to be the same as `module`
765  */
766  CHECK(gpu_target.cgen_state->module_ == module);
767  module->setDataLayout(
768  "e-p:64:64:64-i1:8:8-i8:8:8-"
769  "i16:16:16-i32:32:32-i64:64:64-"
770  "f32:32:32-f64:64:64-v16:16:16-"
771  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
772  module->setTargetTriple("nvptx64-nvidia-cuda");
773  CHECK(gpu_target.nvptx_target_machine);
774  auto pass_manager_builder = llvm::PassManagerBuilder();
775 
776  pass_manager_builder.OptLevel = 0;
777  llvm::legacy::PassManager module_pass_manager;
778  pass_manager_builder.populateModulePassManager(module_pass_manager);
779 
780  bool requires_libdevice = check_module_requires_libdevice(module);
781 
782  if (requires_libdevice) {
783  // add nvvm reflect pass replacing any NVVM conditionals with constants
784  gpu_target.nvptx_target_machine->adjustPassManager(pass_manager_builder);
785  llvm::legacy::FunctionPassManager FPM(module);
786  pass_manager_builder.populateFunctionPassManager(FPM);
787 
788  // Run the NVVMReflectPass here rather than inside optimize_ir
789  FPM.doInitialization();
790  for (auto& F : *module) {
791  FPM.run(F);
792  }
793  FPM.doFinalization();
794  }
795 
796  // run optimizations
797  optimize_ir(func, module, module_pass_manager, live_funcs, co);
798  legalize_nvvm_ir(func);
799 
800  std::stringstream ss;
801  llvm::raw_os_ostream os(ss);
802 
803  llvm::LLVMContext& ctx = module->getContext();
804  // Get "nvvm.annotations" metadata node
805  llvm::NamedMDNode* md = module->getOrInsertNamedMetadata("nvvm.annotations");
806 
807  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
808  llvm::MDString::get(ctx, "kernel"),
809  llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
810  llvm::Type::getInt32Ty(ctx), 1))};
811 
812  // Append metadata to nvvm.annotations
813  md->addOperand(llvm::MDNode::get(ctx, md_vals));
814 
815  std::unordered_set<llvm::Function*> roots{wrapper_func, func};
816  if (gpu_target.row_func_not_inlined) {
817  clear_function_attributes(gpu_target.cgen_state->row_func_);
818  roots.insert(gpu_target.cgen_state->row_func_);
819  if (gpu_target.cgen_state->filter_func_) {
820  roots.insert(gpu_target.cgen_state->filter_func_);
821  }
822  }
823 
824  // prevent helper functions from being removed
825  for (auto f : gpu_target.cgen_state->helper_functions_) {
826  roots.insert(f);
827  }
828 
829  // Prevent the udf function(s) from being removed the way the runtime functions are
830  std::unordered_set<std::string> udf_declarations;
831  if (is_udf_module_present()) {
832  for (auto& f : udf_gpu_module->getFunctionList()) {
833  llvm::Function* udf_function = module->getFunction(f.getName());
834 
835  if (udf_function) {
836  legalize_nvvm_ir(udf_function);
837  roots.insert(udf_function);
838 
839  // If we have a udf that declares an external function,
840  // note it so we can avoid duplicate declarations.
841  if (f.isDeclaration()) {
842  udf_declarations.insert(f.getName().str());
843  }
844  }
845  }
846  }
847 
848  if (is_rt_udf_module_present()) {
849  for (auto& f : rt_udf_gpu_module->getFunctionList()) {
850  llvm::Function* udf_function = module->getFunction(f.getName());
851  if (udf_function) {
852  legalize_nvvm_ir(udf_function);
853  roots.insert(udf_function);
854 
855  // If we have a udf that declares an external function,
856  // note it so we can avoid duplicate declarations.
857  if (f.isDeclaration()) {
858  udf_declarations.insert(f.getName().str());
859  }
860  }
861  }
862  }
863 
864  std::vector<llvm::Function*> rt_funcs;
865  for (auto& Fn : *module) {
866  if (roots.count(&Fn)) {
867  continue;
868  }
869  rt_funcs.push_back(&Fn);
870  }
871  for (auto& pFn : rt_funcs) {
872  pFn->removeFromParent();
873  }
874 
875  if (requires_libdevice) {
876  add_intrinsics_to_module(module);
877  }
878 
879  module->print(os, nullptr);
880  os.flush();
881 
882  for (auto& pFn : rt_funcs) {
883  module->getFunctionList().push_back(pFn);
884  }
885  module->eraseNamedMetadata(md);
886 
887  auto cuda_llir = ss.str() + cuda_rt_decls + extension_function_decls(udf_declarations);
888  std::string ptx;
889  try {
890  ptx = generatePTX(
891  cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state->context_);
892  } catch (ParseIRError& e) {
893  LOG(WARNING) << "Failed to generate PTX: " << e.what()
894  << ". Switching to CPU execution target.";
895  throw QueryMustRunOnCpu();
896  }
897  LOG(PTX) << "PTX for the GPU:\n" << ptx << "\nEnd of PTX";
898 
899  auto cubin_result = ptx_to_cubin(ptx, gpu_target.block_size, gpu_target.cuda_mgr);
900  auto& option_keys = cubin_result.option_keys;
901  auto& option_values = cubin_result.option_values;
902  auto cubin = cubin_result.cubin;
903  auto link_state = cubin_result.link_state;
904  const auto num_options = option_keys.size();
905 
906  auto func_name = wrapper_func->getName().str();
907  auto gpu_compilation_context = std::make_shared<GpuCompilationContext>();
908  for (int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
909  ++device_id) {
910  gpu_compilation_context->addDeviceCode(
911  std::make_unique<GpuDeviceCompilationContext>(cubin,
912  func_name,
913  device_id,
914  gpu_target.cuda_mgr,
915  num_options,
916  &option_keys[0],
917  &option_values[0]));
918  }
919 
920  checkCudaErrors(cuLinkDestroy(link_state));
921  return gpu_compilation_context;
922 #else
923  return {};
924 #endif
925 }
926 
927 std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenGPU(
928  llvm::Function* query_func,
929  llvm::Function* multifrag_query_func,
930  std::unordered_set<llvm::Function*>& live_funcs,
931  const bool no_inline,
932  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
933  const CompilationOptions& co) {
934 #ifdef HAVE_CUDA
935  auto module = multifrag_query_func->getParent();
936 
937  CHECK(cuda_mgr);
938  CodeCacheKey key{serialize_llvm_object(query_func),
939  serialize_llvm_object(cgen_state_->row_func_)};
940  if (cgen_state_->filter_func_) {
941  key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
942  }
943  for (const auto helper : cgen_state_->helper_functions_) {
944  key.push_back(serialize_llvm_object(helper));
945  }
946  auto cached_code = getCodeFromCache(key, gpu_code_cache_);
947  if (cached_code) {
948  return cached_code;
949  }
950 
951  bool row_func_not_inlined = false;
952  if (no_inline) {
953  for (auto it = llvm::inst_begin(cgen_state_->row_func_),
954  e = llvm::inst_end(cgen_state_->row_func_);
955  it != e;
956  ++it) {
957  if (llvm::isa<llvm::CallInst>(*it)) {
958  auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
959  if (get_gv_call.getCalledFunction()->getName() == "array_size" ||
960  get_gv_call.getCalledFunction()->getName() == "linear_probabilistic_count") {
961  mark_function_never_inline(cgen_state_->row_func_);
962  row_func_not_inlined = true;
963  break;
964  }
965  }
966  }
967  }
968 
969  initializeNVPTXBackend();
970  CodeGenerator::GPUTarget gpu_target{nvptx_target_machine_.get(),
971  cuda_mgr,
972  blockSize(),
973  cgen_state_.get(),
974  row_func_not_inlined};
975  std::shared_ptr<GpuCompilationContext> compilation_context;
976 
977  if (check_module_requires_libdevice(module)) {
978  if (g_rt_libdevice_module == nullptr) {
979  // raise error
980  throw std::runtime_error(
981  "libdevice library is not available but required by the UDF module");
982  }
983 
984  // Bind libdevice to the current module
985  CodeGenerator::link_udf_module(g_rt_libdevice_module,
986  *module,
987  cgen_state_.get(),
988  llvm::Linker::Flags::OverrideFromSrc);
989 
990  // activate nvvm-reflect-ftz flag on the module
991  module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", (int)1);
992  for (llvm::Function& fn : *module) {
993  fn.addFnAttr("nvptx-f32ftz", "true");
994  }
995  }
996 
997  try {
998  compilation_context = CodeGenerator::generateNativeGPUCode(
999  query_func, multifrag_query_func, live_funcs, co, gpu_target);
1000  addCodeToCache(key, compilation_context, module, gpu_code_cache_);
1001  } catch (CudaMgr_Namespace::CudaErrorException& cuda_error) {
1002  if (cuda_error.getStatus() == CUDA_ERROR_OUT_OF_MEMORY) {
1003  // Thrown if memory not able to be allocated on gpu
1004  // Retry once after evicting portion of code cache
1005  LOG(WARNING) << "Failed to allocate GPU memory for generated code. Evicting "
1006  << g_fraction_code_cache_to_evict * 100.
1007  << "% of GPU code cache and re-trying.";
1008  gpu_code_cache_.evictFractionEntries(g_fraction_code_cache_to_evict);
1009  compilation_context = CodeGenerator::generateNativeGPUCode(
1010  query_func, multifrag_query_func, live_funcs, co, gpu_target);
1011  addCodeToCache(key, compilation_context, module, gpu_code_cache_);
1012  } else {
1013  throw;
1014  }
1015  }
1016  CHECK(compilation_context);
1017  return compilation_context;
1018 #else
1019  return nullptr;
1020 #endif
1021 }
1022 
1023 std::string CodeGenerator::generatePTX(const std::string& cuda_llir,
1024  llvm::TargetMachine* nvptx_target_machine,
1025  llvm::LLVMContext& context) {
1026  auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir, "", false);
1027 
1028  llvm::SMDiagnostic parse_error;
1029 
1030  auto module = llvm::parseIR(mem_buff->getMemBufferRef(), parse_error, context);
1031  if (!module) {
1032  LOG(IR) << "CodeGenerator::generatePTX:NVVM IR:\n" << cuda_llir << "\nEnd of NVVM IR";
1033  throw_parseIR_error(parse_error, "generatePTX", /* is_gpu= */ true);
1034  }
1035 
1036  llvm::SmallString<256> code_str;
1037  llvm::raw_svector_ostream formatted_os(code_str);
1038  CHECK(nvptx_target_machine);
1039  {
1040  llvm::legacy::PassManager ptxgen_pm;
1041  module->setDataLayout(nvptx_target_machine->createDataLayout());
1042 
1043 #if LLVM_VERSION_MAJOR >= 10
1044  nvptx_target_machine->addPassesToEmitFile(
1045  ptxgen_pm, formatted_os, nullptr, llvm::CGFT_AssemblyFile);
1046 #else
1047  nvptx_target_machine->addPassesToEmitFile(
1048  ptxgen_pm, formatted_os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
1049 #endif
1050  ptxgen_pm.run(*module);
1051  }
1052 
1053 #if LLVM_VERSION_MAJOR >= 11
1054  return std::string(code_str);
1055 #else
1056  return code_str.str();
1057 #endif
1058 }
1059 
1060 std::unique_ptr<llvm::TargetMachine> CodeGenerator::initializeNVPTXBackend(
1061  const CudaMgr_Namespace::NvidiaDeviceArch arch) {
1062  llvm::InitializeAllTargets();
1063  llvm::InitializeAllTargetMCs();
1064  llvm::InitializeAllAsmPrinters();
1065  std::string err;
1066  auto target = llvm::TargetRegistry::lookupTarget("nvptx64", err);
1067  if (!target) {
1068  LOG(FATAL) << err;
1069  }
1070  return std::unique_ptr<llvm::TargetMachine>(
1071  target->createTargetMachine("nvptx64-nvidia-cuda",
1073  "",
1074  llvm::TargetOptions(),
1075  llvm::Reloc::Static));
1076 }
1077 
1078 std::string Executor::generatePTX(const std::string& cuda_llir) const {
1079  return CodeGenerator::generatePTX(
1080  cuda_llir, nvptx_target_machine_.get(), cgen_state_->context_);
1081 }
1082 
1083 void Executor::initializeNVPTXBackend() const {
1084  if (nvptx_target_machine_) {
1085  return;
1086  }
1087  const auto cuda_mgr = catalog_->getDataMgr().getCudaMgr();
1088  LOG_IF(FATAL, cuda_mgr == nullptr) << "No CudaMgr instantiated, unable to check device "
1089  "architecture or generate code for nvidia GPUs.";
1090  const auto arch = cuda_mgr->getDeviceArch();
1091  nvptx_target_machine_ = CodeGenerator::initializeNVPTXBackend(arch);
1092 }
1093 
1094 // A small number of runtime functions don't get through CgenState::emitCall. List them
1095 // explicitly here and always clone their implementation from the runtime module.
1096 bool CodeGenerator::alwaysCloneRuntimeFunction(const llvm::Function* func) {
1097  return func->getName() == "query_stub_hoisted_literals" ||
1098  func->getName() == "multifrag_query_hoisted_literals" ||
1099  func->getName() == "query_stub" || func->getName() == "multifrag_query" ||
1100  func->getName() == "fixed_width_int_decode" ||
1101  func->getName() == "fixed_width_unsigned_decode" ||
1102  func->getName() == "diff_fixed_width_int_decode" ||
1103  func->getName() == "fixed_width_double_decode" ||
1104  func->getName() == "fixed_width_float_decode" ||
1105  func->getName() == "fixed_width_small_date_decode" ||
1106  func->getName() == "record_error_code" || func->getName() == "get_error_code";
1107 }
1108 
1109 llvm::Module* read_template_module(llvm::LLVMContext& context) {
1110  llvm::SMDiagnostic err;
1111 
1112  auto buffer_or_error = llvm::MemoryBuffer::getFile(omnisci::get_root_abs_path() +
1113  "/QueryEngine/RuntimeFunctions.bc");
1114  CHECK(!buffer_or_error.getError());
1115  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1116 
1117  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1118  CHECK(!owner.takeError());
1119  auto module = owner.get().release();
1120  CHECK(module);
1121 
1122  return module;
1123 }
1124 
1125 #ifdef HAVE_CUDA
1126 llvm::Module* read_libdevice_module(llvm::LLVMContext& context) {
1127  llvm::SMDiagnostic err;
1128  const auto env = get_cuda_home();
1129 
1130  boost::filesystem::path cuda_path{env};
1131  cuda_path /= "nvvm";
1132  cuda_path /= "libdevice";
1133  cuda_path /= "libdevice.10.bc";
1134 
1135  if (!boost::filesystem::exists(cuda_path)) {
1136  LOG(WARNING) << "Could not find CUDA libdevice; support for some UDF "
1137  "functions might not be available.";
1138  return nullptr;
1139  }
1140 
1141  auto buffer_or_error = llvm::MemoryBuffer::getFile(cuda_path.c_str());
1142  CHECK(!buffer_or_error.getError());
1143  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1144 
1145  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1146  CHECK(!owner.takeError());
1147  auto module = owner.get().release();
1148  CHECK(module);
1149 
1150  return module;
1151 }
1152 #endif
1153 
1154 #ifdef ENABLE_GEOS
1155 llvm::Module* read_geos_module(llvm::LLVMContext& context) {
1156  llvm::SMDiagnostic err;
1157 
1158  auto buffer_or_error = llvm::MemoryBuffer::getFile(omnisci::get_root_abs_path() +
1159  "/QueryEngine/GeosRuntime.bc");
1160  CHECK(!buffer_or_error.getError());
1161  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1162 
1163  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1164  CHECK(!owner.takeError());
1165  auto module = owner.get().release();
1166  CHECK(module);
1167 
1168  return module;
1169 }
1170 #endif
1171 
1172 namespace {
1173 
1174 void bind_pos_placeholders(const std::string& pos_fn_name,
1175  const bool use_resume_param,
1176  llvm::Function* query_func,
1177  llvm::Module* module) {
1178  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
1179  ++it) {
1180  if (!llvm::isa<llvm::CallInst>(*it)) {
1181  continue;
1182  }
1183  auto& pos_call = llvm::cast<llvm::CallInst>(*it);
1184  if (std::string(pos_call.getCalledFunction()->getName()) == pos_fn_name) {
1185  if (use_resume_param) {
1186  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1187  llvm::ReplaceInstWithInst(
1188  &pos_call,
1189  llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl"),
1190  error_code_arg));
1191  } else {
1192  llvm::ReplaceInstWithInst(
1193  &pos_call,
1194  llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl")));
1195  }
1196  break;
1197  }
1198  }
1199 }
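// Illustrative effect (editor's sketch): a placeholder call to, e.g., @pos_start
// in the generated query function is replaced in place by a call to
// @pos_start_impl, passing the query function's %error_code argument when
// use_resume_param is set and no arguments otherwise.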
1200 
1201 void set_row_func_argnames(llvm::Function* row_func,
1202  const size_t in_col_count,
1203  const size_t agg_col_count,
1204  const bool hoist_literals) {
1205  auto arg_it = row_func->arg_begin();
1206 
1207  if (agg_col_count) {
1208  for (size_t i = 0; i < agg_col_count; ++i) {
1209  arg_it->setName("out");
1210  ++arg_it;
1211  }
1212  } else {
1213  arg_it->setName("group_by_buff");
1214  ++arg_it;
1215  arg_it->setName("crt_matched");
1216  ++arg_it;
1217  arg_it->setName("total_matched");
1218  ++arg_it;
1219  arg_it->setName("old_total_matched");
1220  ++arg_it;
1221  arg_it->setName("max_matched");
1222  ++arg_it;
1223  }
1224 
1225  arg_it->setName("agg_init_val");
1226  ++arg_it;
1227 
1228  arg_it->setName("pos");
1229  ++arg_it;
1230 
1231  arg_it->setName("frag_row_off");
1232  ++arg_it;
1233 
1234  arg_it->setName("num_rows_per_scan");
1235  ++arg_it;
1236 
1237  if (hoist_literals) {
1238  arg_it->setName("literals");
1239  ++arg_it;
1240  }
1241 
1242  for (size_t i = 0; i < in_col_count; ++i) {
1243  arg_it->setName("col_buf" + std::to_string(i));
1244  ++arg_it;
1245  }
1246 
1247  arg_it->setName("join_hash_tables");
1248 }
1249 
1250 llvm::Function* create_row_function(const size_t in_col_count,
1251  const size_t agg_col_count,
1252  const bool hoist_literals,
1253  llvm::Module* module,
1254  llvm::LLVMContext& context) {
1255  std::vector<llvm::Type*> row_process_arg_types;
1256 
1257  if (agg_col_count) {
1258  // output (aggregate) arguments
1259  for (size_t i = 0; i < agg_col_count; ++i) {
1260  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1261  }
1262  } else {
1263  // group by buffer
1264  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1265  // current match count
1266  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1267  // total match count passed from the caller
1268  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1269  // old total match count returned to the caller
1270  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1271  // max matched (total number of slots in the output buffer)
1272  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1273  }
1274 
1275  // aggregate init values
1276  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1277 
1278  // position argument
1279  row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));
1280 
1281  // fragment row offset argument
1282  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1283 
1284  // number of rows for each scan
1285  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1286 
1287  // literals buffer argument
1288  if (hoist_literals) {
1289  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
1290  }
1291 
1292  // column buffer arguments
1293  for (size_t i = 0; i < in_col_count; ++i) {
1294  row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
1295  }
1296 
1297  // join hash table argument
1298  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1299 
1300  // generate the function
1301  auto ft =
1302  llvm::FunctionType::get(get_int_type(32, context), row_process_arg_types, false);
1303 
1304  auto row_func =
1305  llvm::Function::Create(ft, llvm::Function::ExternalLinkage, "row_func", module);
1306 
1307  // set the row function argument names; for debugging purposes only
1308  set_row_func_argnames(row_func, in_col_count, agg_col_count, hoist_literals);
1309 
1310  return row_func;
1311 }
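// Illustrative note (not part of the original source): with in_col_count = 2,
// agg_col_count = 1 and hoist_literals = true, the function generated above would
// have roughly this LLVM signature, with the argument names assigned by
// set_row_func_argnames:
//   i32 @row_func(i64* %out, i64* %agg_init_val, i64 %pos, i64* %frag_row_off,
//                 i64* %num_rows_per_scan, i8* %literals, i8* %col_buf0,
//                 i8* %col_buf1, i64* %join_hash_tables)
// With agg_col_count = 0 the leading %out pointer is replaced by the five
// group-by buffer / match-count arguments instead.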
1312 
1313 // Iterate through multifrag_query_func, replacing calls to query_fname with query_func.
1314 void bind_query(llvm::Function* query_func,
1315  const std::string& query_fname,
1316  llvm::Function* multifrag_query_func,
1317  llvm::Module* module) {
1318  std::vector<llvm::CallInst*> query_stubs;
1319  for (auto it = llvm::inst_begin(multifrag_query_func),
1320  e = llvm::inst_end(multifrag_query_func);
1321  it != e;
1322  ++it) {
1323  if (!llvm::isa<llvm::CallInst>(*it)) {
1324  continue;
1325  }
1326  auto& query_call = llvm::cast<llvm::CallInst>(*it);
1327  if (std::string(query_call.getCalledFunction()->getName()) == query_fname) {
1328  query_stubs.push_back(&query_call);
1329  }
1330  }
1331  for (auto& S : query_stubs) {
1332  std::vector<llvm::Value*> args;
1333  for (size_t i = 0; i < S->getNumArgOperands(); ++i) {
1334  args.push_back(S->getArgOperand(i));
1335  }
1336  llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args, ""));
1337  }
1338 }
1339 
1340 std::vector<std::string> get_agg_fnames(const std::vector<Analyzer::Expr*>& target_exprs,
1341  const bool is_group_by) {
1342  std::vector<std::string> result;
1343  for (size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
1344  ++target_idx, ++agg_col_idx) {
1345  const auto target_expr = target_exprs[target_idx];
1346  CHECK(target_expr);
1347  const auto target_type_info = target_expr->get_type_info();
1348  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
1349  const bool is_varlen =
1350  (target_type_info.is_string() &&
1351  target_type_info.get_compression() == kENCODING_NONE) ||
1352  target_type_info.is_array(); // TODO: should it use is_varlen_array() ?
1353  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
1354  result.emplace_back(target_type_info.is_fp() ? "agg_id_double" : "agg_id");
1355  if (is_varlen) {
1356  result.emplace_back("agg_id");
1357  }
1358  if (target_type_info.is_geometry()) {
1359  result.emplace_back("agg_id");
1360  for (auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
1361  result.emplace_back("agg_id");
1362  }
1363  }
1364  continue;
1365  }
1366  const auto agg_type = agg_expr->get_aggtype();
1367  const auto& agg_type_info =
1368  agg_type != kCOUNT ? agg_expr->get_arg()->get_type_info() : target_type_info;
1369  switch (agg_type) {
1370  case kAVG: {
1371  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1372  !agg_type_info.is_fp()) {
1373  throw std::runtime_error("AVG is only valid on integer and floating point");
1374  }
1375  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1376  ? "agg_sum"
1377  : "agg_sum_double");
1378  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1379  ? "agg_count"
1380  : "agg_count_double");
1381  break;
1382  }
1383  case kMIN: {
1384  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1385  agg_type_info.is_geometry()) {
1386  throw std::runtime_error(
1387  "MIN on strings, arrays or geospatial types not supported yet");
1388  }
1389  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1390  ? "agg_min"
1391  : "agg_min_double");
1392  break;
1393  }
1394  case kMAX: {
1395  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1396  agg_type_info.is_geometry()) {
1397  throw std::runtime_error(
1398  "MAX on strings, arrays or geospatial types not supported yet");
1399  }
1400  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1401  ? "agg_max"
1402  : "agg_max_double");
1403  break;
1404  }
1405  case kSUM: {
1406  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1407  !agg_type_info.is_fp()) {
1408  throw std::runtime_error("SUM is only valid on integer and floating point");
1409  }
1410  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1411  ? "agg_sum"
1412  : "agg_sum_double");
1413  break;
1414  }
1415  case kCOUNT:
1416  result.emplace_back(agg_expr->get_is_distinct() ? "agg_count_distinct"
1417  : "agg_count");
1418  break;
1419  case kSINGLE_VALUE: {
1420  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1421  break;
1422  }
1423  case kSAMPLE: {
1424  // Note that varlen SAMPLE arguments are handled separately above
1425  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1426  break;
1427  }
 1428  case kAPPROX_COUNT_DISTINCT:
 1429  result.emplace_back("agg_approximate_count_distinct");
1430  break;
1431  case kAPPROX_MEDIAN:
1432  result.emplace_back("agg_approx_median");
1433  break;
1434  default:
1435  CHECK(false);
1436  }
1437  }
1438  return result;
1439 }
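// Illustrative example (not in the original source): for a query such as
//   SELECT COUNT(*), AVG(int_col), MAX(float_col) FROM t
// get_agg_fnames would return
//   {"agg_count", "agg_sum", "agg_count", "agg_max_double"}
// since AVG expands into a SUM/COUNT pair and floating point targets select the
// *_double variants of the runtime aggregate functions.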
1440 
1441 } // namespace
1442 
1443 std::unique_ptr<llvm::Module> g_rt_module(read_template_module(getGlobalLLVMContext()));
1444 
1445 #ifdef ENABLE_GEOS
1446 std::unique_ptr<llvm::Module> g_rt_geos_module(read_geos_module(getGlobalLLVMContext()));
1447 #endif
1448 
1449 #ifdef HAVE_CUDA
1450 std::unique_ptr<llvm::Module> g_rt_libdevice_module(
1451  read_libdevice_module(getGlobalLLVMContext()));
1452 #endif
1453 
1454 bool is_udf_module_present(bool cpu_only) {
1455  return (cpu_only || udf_gpu_module != nullptr) && (udf_cpu_module != nullptr);
1456 }
1457 
1458 bool is_rt_udf_module_present(bool cpu_only) {
1459  return (cpu_only || rt_udf_gpu_module != nullptr) && (rt_udf_cpu_module != nullptr);
1460 }
1461 
1462 void read_udf_gpu_module(const std::string& udf_ir_filename) {
1463  llvm::SMDiagnostic parse_error;
1464 
1465  llvm::StringRef file_name_arg(udf_ir_filename);
1466  udf_gpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
1467 
1468  if (!udf_gpu_module) {
1469  throw_parseIR_error(parse_error, udf_ir_filename, /* is_gpu= */ true);
1470  }
1471 
1472  llvm::Triple gpu_triple(udf_gpu_module->getTargetTriple());
1473  if (!gpu_triple.isNVPTX()) {
1474  LOG(WARNING)
1475  << "Expected triple nvptx64-nvidia-cuda for NVVM IR of loadtime UDFs but got "
1476  << gpu_triple.str() << ". Disabling the NVVM IR module.";
1477  udf_gpu_module = nullptr;
1478  }
1479 }
1480 
1481 void read_udf_cpu_module(const std::string& udf_ir_filename) {
1482  llvm::SMDiagnostic parse_error;
1483 
1484  llvm::StringRef file_name_arg(udf_ir_filename);
1485 
1486  udf_cpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
1487  if (!udf_cpu_module) {
1488  throw_parseIR_error(parse_error, udf_ir_filename);
1489  }
1490 }
1491 
1492 void read_rt_udf_gpu_module(const std::string& udf_ir_string) {
1493  llvm::SMDiagnostic parse_error;
1494 
1495  auto buf =
1496  std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for GPU");
1497 
1498  rt_udf_gpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
1499  if (!rt_udf_gpu_module) {
 1500  LOG(IR) << "read_rt_udf_gpu_module:NVVM IR:\n" << udf_ir_string << "\nEnd of NVVM IR";
1501  throw_parseIR_error(parse_error, "", /* is_gpu= */ true);
1502  }
1503 
1504  llvm::Triple gpu_triple(rt_udf_gpu_module->getTargetTriple());
1505  if (!gpu_triple.isNVPTX()) {
 1506  LOG(IR) << "read_rt_udf_gpu_module:NVVM IR:\n" << udf_ir_string << "\nEnd of NVVM IR";
1507  LOG(WARNING) << "Expected triple nvptx64-nvidia-cuda for NVVM IR but got "
1508  << gpu_triple.str()
1509  << ". Executing runtime UDFs on GPU will be disabled.";
1510  rt_udf_gpu_module = nullptr;
1511  return;
1512  }
1513 }
1514 
1515 void read_rt_udf_cpu_module(const std::string& udf_ir_string) {
1516  llvm::SMDiagnostic parse_error;
1517 
1518  auto buf =
1519  std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for CPU");
1520 
1521  rt_udf_cpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
1522  if (!rt_udf_cpu_module) {
1523  LOG(IR) << "read_rt_udf_cpu_module:LLVM IR:\n" << udf_ir_string << "\nEnd of LLVM IR";
1524  throw_parseIR_error(parse_error);
1525  }
1526 }
1527 
1528 std::unordered_set<llvm::Function*> CodeGenerator::markDeadRuntimeFuncs(
1529  llvm::Module& module,
1530  const std::vector<llvm::Function*>& roots,
1531  const std::vector<llvm::Function*>& leaves) {
1532  std::unordered_set<llvm::Function*> live_funcs;
1533  live_funcs.insert(roots.begin(), roots.end());
1534  live_funcs.insert(leaves.begin(), leaves.end());
1535 
1536  if (auto F = module.getFunction("init_shared_mem_nop")) {
1537  live_funcs.insert(F);
1538  }
1539  if (auto F = module.getFunction("write_back_nop")) {
1540  live_funcs.insert(F);
1541  }
1542 
1543  for (const llvm::Function* F : roots) {
1544  for (const llvm::BasicBlock& BB : *F) {
1545  for (const llvm::Instruction& I : BB) {
1546  if (const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
1547  live_funcs.insert(CI->getCalledFunction());
1548  }
1549  }
1550  }
1551  }
1552 
1553  for (llvm::Function& F : module) {
1554  if (!live_funcs.count(&F) && !F.isDeclaration()) {
1555  F.setLinkage(llvm::GlobalValue::InternalLinkage);
1556  }
1557  }
1558 
1559  return live_funcs;
1560 }
1561 
1562 namespace {
1563 // searches for a particular variable within a specific basic block (or all if bb_name is
1564 // empty)
1565 template <typename InstType>
1566 llvm::Value* find_variable_in_basic_block(llvm::Function* func,
1567  std::string bb_name,
1568  std::string variable_name) {
1569  llvm::Value* result = nullptr;
1570  if (func == nullptr || variable_name.empty()) {
1571  return result;
1572  }
1573  bool is_found = false;
1574  for (auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
1575  if (!bb_name.empty() && bb_it->getName() != bb_name) {
1576  continue;
1577  }
1578  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
1579  if (llvm::isa<InstType>(*inst_it)) {
1580  if (inst_it->getName() == variable_name) {
1581  result = &*inst_it;
1582  is_found = true;
1583  break;
1584  }
1585  }
1586  }
1587  }
1588  return result;
1589 }
1590 }; // namespace
1591 
 1592 void Executor::createErrorCheckControlFlow(
 1593  llvm::Function* query_func,
1594  bool run_with_dynamic_watchdog,
1595  bool run_with_allowing_runtime_interrupt,
1596  ExecutorDeviceType device_type,
1597  const std::vector<InputTableInfo>& input_table_infos) {
1598  AUTOMATIC_IR_METADATA(cgen_state_.get());
1599 
1600  // check whether the row processing was successful; currently, it can
1601  // fail by running out of group by buffer slots
1602 
1603  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
 1604  // when both the dynamic watchdog and the runtime interrupt are enabled,
 1605  // prefer the dynamic watchdog
1606  run_with_allowing_runtime_interrupt = false;
1607  }
1608 
1609  {
1610  // disable injecting query interrupt checker if the session info is invalid
1611  mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
1612  if (current_query_session_.empty()) {
1613  run_with_allowing_runtime_interrupt = false;
1614  }
1615  }
1616 
1617  llvm::Value* row_count = nullptr;
1618  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1619  device_type == ExecutorDeviceType::GPU) {
1620  row_count =
1621  find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
1622  }
1623 
1624  bool done_splitting = false;
1625  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1626  ++bb_it) {
1627  llvm::Value* pos = nullptr;
1628  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1629  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1630  llvm::isa<llvm::PHINode>(*inst_it)) {
1631  if (inst_it->getName() == "pos") {
1632  pos = &*inst_it;
1633  }
1634  continue;
1635  }
1636  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1637  continue;
1638  }
1639  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
1640  if (std::string(row_func_call.getCalledFunction()->getName()) == "row_process") {
1641  auto next_inst_it = inst_it;
1642  ++next_inst_it;
1643  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1644  auto& br_instr = bb_it->back();
1645  llvm::IRBuilder<> ir_builder(&br_instr);
1646  llvm::Value* err_lv = &*inst_it;
1647  llvm::Value* err_lv_returned_from_row_func = nullptr;
1648  if (run_with_dynamic_watchdog) {
1649  CHECK(pos);
1650  llvm::Value* call_watchdog_lv = nullptr;
1651  if (device_type == ExecutorDeviceType::GPU) {
 1652  // In order to make sure all threads within a block see the same barrier,
 1653  // only blocks in which no thread has crossed the critical edge go through
 1654  // the dynamic watchdog computation
1655  CHECK(row_count);
1656  auto crit_edge_rem =
1657  (blockSize() & (blockSize() - 1))
1658  ? ir_builder.CreateSRem(
1659  row_count,
1660  cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1661  : ir_builder.CreateAnd(
1662  row_count,
1663  cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1664  auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1665  crit_edge_threshold->setName("crit_edge_threshold");
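 // Worked example (hypothetical numbers, not from the source): with
 // blockSize() = 256 (a power of two) and row_count = 1000, the cheap AND path
 // is taken: crit_edge_rem = 1000 & 255 = 232 and crit_edge_threshold = 768, so
 // positions in the trailing partial chunk (768..999) skip the watchdog and the
 // barrier inside dynamic_watchdog stays consistent within each block.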
1666 
1667  // only those threads where pos < crit_edge_threshold go through dynamic
1668  // watchdog call
1669  call_watchdog_lv =
1670  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
1671  } else {
1672  // CPU path: run watchdog for every 64th row
1673  auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1674  call_watchdog_lv = ir_builder.CreateICmp(
1675  llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1676  }
1677  CHECK(call_watchdog_lv);
1678  auto error_check_bb = bb_it->splitBasicBlock(
1679  llvm::BasicBlock::iterator(br_instr), ".error_check");
1680  auto& watchdog_br_instr = bb_it->back();
1681 
1682  auto watchdog_check_bb = llvm::BasicBlock::Create(
1683  cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
1684  llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1685  auto detected_timeout = watchdog_ir_builder.CreateCall(
1686  cgen_state_->module_->getFunction("dynamic_watchdog"), {});
1687  auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1688  detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
1689  watchdog_ir_builder.CreateBr(error_check_bb);
1691  llvm::ReplaceInstWithInst(
1692  &watchdog_br_instr,
1693  llvm::BranchInst::Create(
1694  watchdog_check_bb, error_check_bb, call_watchdog_lv));
1695  ir_builder.SetInsertPoint(&br_instr);
1696  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1697 
1698  unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1699  unified_err_lv->addIncoming(err_lv, &*bb_it);
1700  err_lv = unified_err_lv;
1701  } else if (run_with_allowing_runtime_interrupt) {
1702  CHECK(pos);
1703  llvm::Value* call_check_interrupt_lv = nullptr;
1704  if (device_type == ExecutorDeviceType::GPU) {
 1705  // approximate how many times the %pos variable is increased, i.e., the
 1706  // number of iterations each thread performs
 1707  // the bit-shift amount is derived here from the grid/block/fragment sizes,
 1708  // because with a fixed stride (e.g., a check on every 64th increment) some
 1709  // CUDA threads might never reach the interrupt-checking block when the
 1710  // outer table (fragment) is not sufficiently large, and such threads could
 1711  // not be interrupted
1712  int32_t num_shift_by_gridDim = shared::getExpOfTwo(gridSize());
1713  int32_t num_shift_by_blockDim = shared::getExpOfTwo(blockSize());
1714  int total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
1715  uint64_t interrupt_checking_freq = 32;
1716  auto freq_control_knob = g_running_query_interrupt_freq;
1717  CHECK_GT(freq_control_knob, 0);
1718  CHECK_LE(freq_control_knob, 1.0);
1719  if (!input_table_infos.empty()) {
1720  const auto& outer_table_info = *input_table_infos.begin();
1721  auto num_outer_table_tuples = outer_table_info.info.getNumTuples();
1722  if (outer_table_info.table_id < 0) {
1723  auto* rs = (*outer_table_info.info.fragments.begin()).resultSet;
1724  CHECK(rs);
1725  num_outer_table_tuples = rs->entryCount();
1726  } else {
1727  auto num_frags = outer_table_info.info.fragments.size();
1728  if (num_frags > 0) {
1729  num_outer_table_tuples =
1730  outer_table_info.info.fragments.begin()->getNumTuples();
1731  }
1732  }
1733  if (num_outer_table_tuples > 0) {
 1734  // gridSize * blockSize --> pos_step (index of the next row per thread)
 1735  // we additionally multiply pos_step by two since the number of
 1736  // dispatched blocks is double the gridSize
 1737  // # tuples (of the fragment) / pos_step --> maximum # increments (K)
 1738  // we also shrink K according to freq_control_knob to control the frequency:
 1739  // to check the interrupt status more often, make K smaller
1740  auto max_inc = uint64_t(
1741  floor(num_outer_table_tuples / (gridSize() * blockSize() * 2)));
1742  if (max_inc < 2) {
 1743  // `max_inc` is too small, so this correction is necessary to keep
 1744  // `interrupt_checking_freq` valid (i.e., larger than zero)
1745  max_inc = 2;
1746  }
1747  auto calibrated_inc = uint64_t(floor(max_inc * (1 - freq_control_knob)));
1748  interrupt_checking_freq =
1749  uint64_t(pow(2, shared::getExpOfTwo(calibrated_inc)));
 1750  // cover the case where interrupt_checking_freq > K:
 1751  // otherwise some threads could still never branch to the interrupt checker,
 1752  // so manually use a frequency smaller than, but close to, max_inc
1753  if (interrupt_checking_freq > max_inc) {
1754  interrupt_checking_freq = max_inc / 2;
1755  }
1756  if (interrupt_checking_freq < 8) {
 1757  // such a small freq would check the interrupt status too frequently,
 1758  // so clamp it up to a reasonable minimum value
1759  interrupt_checking_freq = 8;
1760  }
1761  }
1762  }
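 // Worked example (illustrative values, not from the source; assumes
 // shared::getExpOfTwo(x) behaves like floor(log2(x))): with gridSize() = 64,
 // blockSize() = 1024, freq_control_knob = 0.1 and ~10M outer table tuples:
 //   max_inc        = floor(10,000,000 / (64 * 1024 * 2)) = 76
 //   calibrated_inc = floor(76 * (1 - 0.1))               = 68
 //   freq           = pow(2, getExpOfTwo(68))             = 64
 // 64 is below max_inc and above the minimum of 8, so interrupt_checking_freq
 // stays at 64 and gates the interrupt check via the AND/ICmp generated below.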
1763  VLOG(1) << "Set the running query interrupt checking frequency: "
1764  << interrupt_checking_freq;
1765  // check the interrupt flag for every interrupt_checking_freq-th iteration
1766  llvm::Value* pos_shifted_per_iteration =
1767  ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
1768  auto interrupt_predicate =
1769  ir_builder.CreateAnd(pos_shifted_per_iteration, interrupt_checking_freq);
1770  call_check_interrupt_lv =
1771  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1772  interrupt_predicate,
1773  cgen_state_->llInt(int64_t(0LL)));
1774  } else {
1775  // CPU path: run interrupt checker for every 64th row
1776  auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1777  call_check_interrupt_lv =
1778  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1779  interrupt_predicate,
1780  cgen_state_->llInt(int64_t(0LL)));
1781  }
1782  CHECK(call_check_interrupt_lv);
1783  auto error_check_bb = bb_it->splitBasicBlock(
1784  llvm::BasicBlock::iterator(br_instr), ".error_check");
1785  auto& check_interrupt_br_instr = bb_it->back();
1786 
1787  auto interrupt_check_bb = llvm::BasicBlock::Create(
1788  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
1789  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
1790  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
1791  cgen_state_->module_->getFunction("check_interrupt"), {});
1792  auto interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
1793  detected_interrupt, cgen_state_->llInt(Executor::ERR_INTERRUPTED), err_lv);
1794  interrupt_checker_ir_builder.CreateBr(error_check_bb);
1795 
1796  llvm::ReplaceInstWithInst(
1797  &check_interrupt_br_instr,
1798  llvm::BranchInst::Create(
1799  interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
1800  ir_builder.SetInsertPoint(&br_instr);
1801  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1802 
1803  unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
1804  unified_err_lv->addIncoming(err_lv, &*bb_it);
1805  err_lv = unified_err_lv;
1806  }
1807  if (!err_lv_returned_from_row_func) {
1808  err_lv_returned_from_row_func = err_lv;
1809  }
1810  if (device_type == ExecutorDeviceType::GPU && g_enable_dynamic_watchdog) {
1811  // let kernel execution finish as expected, regardless of the observed error,
1812  // unless it is from the dynamic watchdog where all threads within that block
1813  // return together.
1814  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1815  err_lv,
1816  cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
1817  } else {
1818  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
1819  err_lv,
1820  cgen_state_->llInt(static_cast<int32_t>(0)));
1821  }
1822  auto error_bb = llvm::BasicBlock::Create(
1823  cgen_state_->context_, ".error_exit", query_func, new_bb);
1824  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1825  llvm::CallInst::Create(
1826  cgen_state_->module_->getFunction("record_error_code"),
1827  std::vector<llvm::Value*>{err_lv_returned_from_row_func, error_code_arg},
1828  "",
1829  error_bb);
1830  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
1831  llvm::ReplaceInstWithInst(&br_instr,
1832  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
1833  done_splitting = true;
1834  break;
1835  }
1836  }
1837  }
1838  CHECK(done_splitting);
1839 }
1840 
1841 std::vector<llvm::Value*> Executor::inlineHoistedLiterals() {
1842  AUTOMATIC_IR_METADATA(cgen_state_.get());
1843 
1844  std::vector<llvm::Value*> hoisted_literals;
1845 
1846  // row_func_ is using literals whose defs have been hoisted up to the query_func_,
1847  // extend row_func_ signature to include extra args to pass these literal values.
1848  std::vector<llvm::Type*> row_process_arg_types;
1849 
1850  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1851  E = cgen_state_->row_func_->arg_end();
1852  I != E;
1853  ++I) {
1854  row_process_arg_types.push_back(I->getType());
1855  }
1856 
1857  for (auto& element : cgen_state_->query_func_literal_loads_) {
1858  for (auto value : element.second) {
1859  row_process_arg_types.push_back(value->getType());
1860  }
1861  }
1862 
1863  auto ft = llvm::FunctionType::get(
1864  get_int_type(32, cgen_state_->context_), row_process_arg_types, false);
1865  auto row_func_with_hoisted_literals =
1866  llvm::Function::Create(ft,
1867  llvm::Function::ExternalLinkage,
1868  "row_func_hoisted_literals",
1869  cgen_state_->row_func_->getParent());
1870 
1871  auto row_func_arg_it = row_func_with_hoisted_literals->arg_begin();
1872  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1873  E = cgen_state_->row_func_->arg_end();
1874  I != E;
1875  ++I) {
1876  if (I->hasName()) {
1877  row_func_arg_it->setName(I->getName());
1878  }
1879  ++row_func_arg_it;
1880  }
1881 
1882  decltype(row_func_with_hoisted_literals) filter_func_with_hoisted_literals{nullptr};
1883  decltype(row_func_arg_it) filter_func_arg_it{nullptr};
1884  if (cgen_state_->filter_func_) {
1885  // filter_func_ is using literals whose defs have been hoisted up to the row_func_,
1886  // extend filter_func_ signature to include extra args to pass these literal values.
1887  std::vector<llvm::Type*> filter_func_arg_types;
1888 
1889  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
1890  E = cgen_state_->filter_func_->arg_end();
1891  I != E;
1892  ++I) {
1893  filter_func_arg_types.push_back(I->getType());
1894  }
1895 
1896  for (auto& element : cgen_state_->query_func_literal_loads_) {
1897  for (auto value : element.second) {
1898  filter_func_arg_types.push_back(value->getType());
1899  }
1900  }
1901 
1902  auto ft2 = llvm::FunctionType::get(
1903  get_int_type(32, cgen_state_->context_), filter_func_arg_types, false);
1904  filter_func_with_hoisted_literals =
1905  llvm::Function::Create(ft2,
1906  llvm::Function::ExternalLinkage,
1907  "filter_func_hoisted_literals",
1908  cgen_state_->filter_func_->getParent());
1909 
1910  filter_func_arg_it = filter_func_with_hoisted_literals->arg_begin();
1911  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
1912  E = cgen_state_->filter_func_->arg_end();
1913  I != E;
1914  ++I) {
1915  if (I->hasName()) {
1916  filter_func_arg_it->setName(I->getName());
1917  }
1918  ++filter_func_arg_it;
1919  }
1920  }
1921 
1922  std::unordered_map<int, std::vector<llvm::Value*>>
1923  query_func_literal_loads_function_arguments,
1924  query_func_literal_loads_function_arguments2;
1925 
1926  for (auto& element : cgen_state_->query_func_literal_loads_) {
1927  std::vector<llvm::Value*> argument_values, argument_values2;
1928 
1929  for (auto value : element.second) {
1930  hoisted_literals.push_back(value);
1931  argument_values.push_back(&*row_func_arg_it);
1932  if (cgen_state_->filter_func_) {
1933  argument_values2.push_back(&*filter_func_arg_it);
1934  cgen_state_->filter_func_args_[&*row_func_arg_it] = &*filter_func_arg_it;
1935  }
1936  if (value->hasName()) {
1937  row_func_arg_it->setName("arg_" + value->getName());
1938  if (cgen_state_->filter_func_) {
1939  filter_func_arg_it->getContext();
1940  filter_func_arg_it->setName("arg_" + value->getName());
1941  }
1942  }
1943  ++row_func_arg_it;
1944  ++filter_func_arg_it;
1945  }
1946 
1947  query_func_literal_loads_function_arguments[element.first] = argument_values;
1948  query_func_literal_loads_function_arguments2[element.first] = argument_values2;
1949  }
1950 
1951  // copy the row_func function body over
1952  // see
1953  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
1954  row_func_with_hoisted_literals->getBasicBlockList().splice(
1955  row_func_with_hoisted_literals->begin(),
1956  cgen_state_->row_func_->getBasicBlockList());
1957 
1958  // also replace row_func arguments with the arguments from row_func_hoisted_literals
1959  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1960  E = cgen_state_->row_func_->arg_end(),
1961  I2 = row_func_with_hoisted_literals->arg_begin();
1962  I != E;
1963  ++I) {
1964  I->replaceAllUsesWith(&*I2);
1965  I2->takeName(&*I);
1966  cgen_state_->filter_func_args_.replace(&*I, &*I2);
1967  ++I2;
1968  }
1969 
1970  cgen_state_->row_func_ = row_func_with_hoisted_literals;
1971 
1972  // and finally replace literal placeholders
1973  std::vector<llvm::Instruction*> placeholders;
1974  std::string prefix("__placeholder__literal_");
1975  for (auto it = llvm::inst_begin(row_func_with_hoisted_literals),
1976  e = llvm::inst_end(row_func_with_hoisted_literals);
1977  it != e;
1978  ++it) {
1979  if (it->hasName() && it->getName().startswith(prefix)) {
1980  auto offset_and_index_entry =
1981  cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
1982  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
1983 
1984  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
1985  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
1986 
1987  it->replaceAllUsesWith(
1988  query_func_literal_loads_function_arguments[lit_off][lit_idx]);
1989  placeholders.push_back(&*it);
1990  }
1991  }
1992  for (auto placeholder : placeholders) {
1993  placeholder->removeFromParent();
1994  }
1995 
1996  if (cgen_state_->filter_func_) {
1997  // copy the filter_func function body over
1998  // see
1999  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
2000  filter_func_with_hoisted_literals->getBasicBlockList().splice(
2001  filter_func_with_hoisted_literals->begin(),
2002  cgen_state_->filter_func_->getBasicBlockList());
2003 
2004  // also replace filter_func arguments with the arguments from
2005  // filter_func_hoisted_literals
2006  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2007  E = cgen_state_->filter_func_->arg_end(),
2008  I2 = filter_func_with_hoisted_literals->arg_begin();
2009  I != E;
2010  ++I) {
2011  I->replaceAllUsesWith(&*I2);
2012  I2->takeName(&*I);
2013  ++I2;
2014  }
2015 
2016  cgen_state_->filter_func_ = filter_func_with_hoisted_literals;
2017 
2018  // and finally replace literal placeholders
2019  std::vector<llvm::Instruction*> placeholders;
2020  std::string prefix("__placeholder__literal_");
2021  for (auto it = llvm::inst_begin(filter_func_with_hoisted_literals),
2022  e = llvm::inst_end(filter_func_with_hoisted_literals);
2023  it != e;
2024  ++it) {
2025  if (it->hasName() && it->getName().startswith(prefix)) {
2026  auto offset_and_index_entry = cgen_state_->row_func_hoisted_literals_.find(
2027  llvm::dyn_cast<llvm::Value>(&*it));
2028  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2029 
2030  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2031  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2032 
2033  it->replaceAllUsesWith(
2034  query_func_literal_loads_function_arguments2[lit_off][lit_idx]);
2035  placeholders.push_back(&*it);
2036  }
2037  }
2038  for (auto placeholder : placeholders) {
2039  placeholder->removeFromParent();
2040  }
2041  }
2042 
2043  return hoisted_literals;
2044 }
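// Sketch of the rewrite performed above (purely illustrative; the exact
// placeholder suffix is whatever the literal codegen assigned): a literal loaded
// in query_func at, say, byte offset 8 of the literal buffer is referenced inside
// row_func through an instruction whose name starts with "__placeholder__literal_".
// The row_func_hoisted_literals_ map yields its (offset, load index) pair, all
// uses of the placeholder are redirected to the matching new "arg_..." parameter,
// and the placeholder instruction is then detached from the function body.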
2045 
2046 namespace {
2047 
2048 size_t get_shared_memory_size(const bool shared_mem_used,
2049  const QueryMemoryDescriptor* query_mem_desc_ptr) {
2050  return shared_mem_used
2051  ? (query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount())
2052  : 0;
2053 }
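// Example (hypothetical sizes, not from the source): a shared-memory-eligible
// query with getRowSize() = 16 bytes and getEntryCount() = 2048 requests
// 16 * 2048 = 32768 bytes (32 KB) for the per-block output buffer; when shared
// memory is not used the size is simply 0.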
2054 
2055 bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor* query_mem_desc_ptr,
2056  const RelAlgExecutionUnit& ra_exe_unit,
2057  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
2058  const ExecutorDeviceType device_type,
2059  const unsigned gpu_blocksize,
2060  const unsigned num_blocks_per_mp) {
2061  if (device_type == ExecutorDeviceType::CPU) {
2062  return false;
2063  }
2064  if (query_mem_desc_ptr->didOutputColumnar()) {
2065  return false;
2066  }
2067  CHECK(query_mem_desc_ptr);
2068  CHECK(cuda_mgr);
2069  /*
2070  * We only use shared memory strategy if GPU hardware provides native shared
2071  * memory atomics support. From CUDA Toolkit documentation:
2072  * https://docs.nvidia.com/cuda/pascal-tuning-guide/index.html#atomic-ops "Like
2073  * Maxwell, Pascal [and Volta] provides native shared memory atomic operations
2074  * for 32-bit integer arithmetic, along with native 32 or 64-bit compare-and-swap
2075  * (CAS)."
2076  *
2077  **/
2078  if (!cuda_mgr->isArchMaxwellOrLaterForAll()) {
2079  return false;
2080  }
2081 
2082  if (query_mem_desc_ptr->getQueryDescriptionType() ==
 2083  QueryDescriptionType::NonGroupedAggregate &&
 2085  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty()) {
2086  // TODO: relax this, if necessary
2087  if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
2088  return false;
2089  }
 2090  // skip shared memory usage when dealing with 1) variable length targets or
 2091  // 2) aggregates other than COUNT
2092  const auto target_infos =
2093  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
2094  std::unordered_set<SQLAgg> supported_aggs{kCOUNT};
2095  if (std::find_if(target_infos.begin(),
2096  target_infos.end(),
2097  [&supported_aggs](const TargetInfo& ti) {
2098  if (ti.sql_type.is_varlen() ||
2099  !supported_aggs.count(ti.agg_kind)) {
2100  return true;
2101  } else {
2102  return false;
2103  }
2104  }) == target_infos.end()) {
2105  return true;
2106  }
2107  }
2108  if (query_mem_desc_ptr->getQueryDescriptionType() ==
2119  if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
2120  return false;
2121  }
2122 
2123  // Fundamentally, we should use shared memory whenever the output buffer
2124  // is small enough so that we can fit it in the shared memory and yet expect
2125  // good occupancy.
2126  // For now, we allow keyless, row-wise layout, and only for perfect hash
2127  // group by operations.
2128  if (query_mem_desc_ptr->hasKeylessHash() &&
2129  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty() &&
2130  !query_mem_desc_ptr->useStreamingTopN()) {
2131  const size_t shared_memory_threshold_bytes = std::min(
2132  g_gpu_smem_threshold == 0 ? SIZE_MAX : g_gpu_smem_threshold,
2133  cuda_mgr->getMinSharedMemoryPerBlockForAllDevices() / num_blocks_per_mp);
2134  const auto output_buffer_size =
2135  query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount();
2136  if (output_buffer_size > shared_memory_threshold_bytes) {
2137  return false;
2138  }
2139 
 2140  // skip shared memory usage when dealing with 1) variable length targets or
 2141  // 2) aggregates other than the basic ones (COUNT, SUM, MIN, MAX, AVG)
 2142  // TODO: relax this if necessary
2143  const auto target_infos =
2144  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
2145  std::unordered_set<SQLAgg> supported_aggs{kCOUNT};
 2146  if (g_enable_smem_grouped_non_count_agg) {
 2147  supported_aggs = {kCOUNT, kMIN, kMAX, kSUM, kAVG};
2148  }
2149  if (std::find_if(target_infos.begin(),
2150  target_infos.end(),
2151  [&supported_aggs](const TargetInfo& ti) {
2152  if (ti.sql_type.is_varlen() ||
2153  !supported_aggs.count(ti.agg_kind)) {
2154  return true;
2155  } else {
2156  return false;
2157  }
2158  }) == target_infos.end()) {
2159  return true;
2160  }
2161  }
2162  }
2163  return false;
2164 }
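// Illustrative threshold arithmetic (assumed values, not from the source): with
// g_gpu_smem_threshold = 0 (treated above as "no explicit cap", i.e. SIZE_MAX),
// 96 KB of shared memory per block reported by the CudaMgr and
// num_blocks_per_mp = 2, the effective limit is min(SIZE_MAX, 98304 / 2) = 49152
// bytes; the 32 KB buffer from the previous example qualifies, while a 64 KB
// buffer would fall back to global-memory aggregation.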
2165 
2166 #ifndef NDEBUG
2167 std::string serialize_llvm_metadata_footnotes(llvm::Function* query_func,
2168  CgenState* cgen_state) {
2169  std::string llvm_ir;
2170  std::unordered_set<llvm::MDNode*> md;
2171 
2172  // Loop over all instructions in the query function.
2173  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
2174  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2175  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2176  instr_it->getAllMetadata(imd);
2177  for (auto [kind, node] : imd) {
2178  md.insert(node);
2179  }
2180  }
2181  }
2182 
2183  // Loop over all instructions in the row function.
2184  for (auto bb_it = cgen_state->row_func_->begin(); bb_it != cgen_state->row_func_->end();
2185  ++bb_it) {
2186  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2187  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2188  instr_it->getAllMetadata(imd);
2189  for (auto [kind, node] : imd) {
2190  md.insert(node);
2191  }
2192  }
2193  }
2194 
2195  // Loop over all instructions in the filter function.
2196  if (cgen_state->filter_func_) {
2197  for (auto bb_it = cgen_state->filter_func_->begin();
2198  bb_it != cgen_state->filter_func_->end();
2199  ++bb_it) {
2200  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2201  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2202  instr_it->getAllMetadata(imd);
2203  for (auto [kind, node] : imd) {
2204  md.insert(node);
2205  }
2206  }
2207  }
2208  }
2209 
2210  // Sort the metadata by canonical number and convert to text.
2211  if (!md.empty()) {
2212  std::map<size_t, std::string> sorted_strings;
2213  for (auto p : md) {
2214  std::string str;
2215  llvm::raw_string_ostream os(str);
2216  p->print(os, cgen_state->module_, true);
2217  os.flush();
2218  auto fields = split(str, {}, 1);
2219  if (fields.empty() || fields[0].empty()) {
2220  continue;
2221  }
2222  sorted_strings.emplace(std::stoul(fields[0].substr(1)), str);
2223  }
2224  llvm_ir += "\n";
2225  for (auto [id, text] : sorted_strings) {
2226  llvm_ir += text;
2227  llvm_ir += "\n";
2228  }
2229  }
2230 
2231  return llvm_ir;
2232 }
2233 #endif // NDEBUG
2234 
2235 } // namespace
2236 
2237 std::tuple<CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
2238 Executor::compileWorkUnit(const std::vector<InputTableInfo>& query_infos,
2239  const PlanState::DeletedColumnsMap& deleted_cols_map,
2240  const RelAlgExecutionUnit& ra_exe_unit,
2241  const CompilationOptions& co,
2242  const ExecutionOptions& eo,
2243  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
2244  const bool allow_lazy_fetch,
2245  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
2246  const size_t max_groups_buffer_entry_guess,
2247  const int8_t crt_min_byte_width,
2248  const bool has_cardinality_estimation,
2249  ColumnCacheMap& column_cache,
2250  RenderInfo* render_info) {
2251  auto timer = DEBUG_TIMER(__func__);
2252 
2253 #ifndef NDEBUG
2254  static std::uint64_t counter = 0;
2255  ++counter;
2256  VLOG(1) << "CODEGEN #" << counter << ":";
2257  LOG(IR) << "CODEGEN #" << counter << ":";
2258  LOG(PTX) << "CODEGEN #" << counter << ":";
2259  LOG(ASM) << "CODEGEN #" << counter << ":";
2260 #endif
2261 
2262  nukeOldState(allow_lazy_fetch, query_infos, deleted_cols_map, &ra_exe_unit);
2263 
2264  GroupByAndAggregate group_by_and_aggregate(
2265  this,
2266  co.device_type,
2267  ra_exe_unit,
2268  query_infos,
2269  row_set_mem_owner,
2270  has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
2271  : std::nullopt);
2272  auto query_mem_desc =
2273  group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
2274  max_groups_buffer_entry_guess,
2275  crt_min_byte_width,
 2276  render_info,
 2277  eo.output_columnar_hint);
2278 
2279  if (query_mem_desc->getQueryDescriptionType() ==
 2280  QueryDescriptionType::GroupByBaselineHash &&
 2281  !has_cardinality_estimation &&
2282  (!render_info || !render_info->isPotentialInSituRender()) && !eo.just_explain) {
2283  const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
2284  throw CardinalityEstimationRequired(col_range_info.max - col_range_info.min);
2285  }
2286 
2287  const bool output_columnar = query_mem_desc->didOutputColumnar();
2288  const bool gpu_shared_mem_optimization =
 2289  is_gpu_shared_mem_supported(query_mem_desc.get(),
 2290  ra_exe_unit,
2291  cuda_mgr,
2292  co.device_type,
2293  cuda_mgr ? this->blockSize() : 1,
2294  cuda_mgr ? this->numBlocksPerMP() : 1);
2295  if (gpu_shared_mem_optimization) {
2296  // disable interleaved bins optimization on the GPU
2297  query_mem_desc->setHasInterleavedBinsOnGpu(false);
2298  LOG(DEBUG1) << "GPU shared memory is used for the " +
2299  query_mem_desc->queryDescTypeToString() + " query(" +
2300  std::to_string(get_shared_memory_size(gpu_shared_mem_optimization,
2301  query_mem_desc.get())) +
2302  " out of " + std::to_string(g_gpu_smem_threshold) + " bytes).";
2303  }
2304 
2305  const GpuSharedMemoryContext gpu_smem_context(
2306  get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));
2307 
 2308  if (co.device_type == ExecutorDeviceType::GPU) {
 2309  const size_t num_count_distinct_descs =
2310  query_mem_desc->getCountDistinctDescriptorsSize();
2311  for (size_t i = 0; i < num_count_distinct_descs; i++) {
2312  const auto& count_distinct_descriptor =
2313  query_mem_desc->getCountDistinctDescriptor(i);
2314  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::StdSet ||
2315  (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
2316  !co.hoist_literals)) {
2317  throw QueryMustRunOnCpu();
2318  }
2319  }
2320  }
2321 
2322  // Read the module template and target either CPU or GPU
2323  // by binding the stream position functions to the right implementation:
2324  // stride access for GPU, contiguous for CPU
2325  auto rt_module_copy = llvm::CloneModule(
2326  *g_rt_module.get(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
2327  auto func = llvm::dyn_cast<llvm::Function>(gv);
2328  if (!func) {
2329  return true;
2330  }
2331  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
2332  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage ||
 2333  CodeGenerator::alwaysCloneRuntimeFunction(func));
 2334  });
 2335  if (co.device_type == ExecutorDeviceType::CPU) {
 2336  if (is_udf_module_present(true)) {
2337  CodeGenerator::link_udf_module(udf_cpu_module, *rt_module_copy, cgen_state_.get());
2338  }
2339  if (is_rt_udf_module_present(true)) {
 2340  CodeGenerator::link_udf_module(
 2341  rt_udf_cpu_module, *rt_module_copy, cgen_state_.get());
2342  }
2343  } else {
2344  rt_module_copy->setDataLayout(get_gpu_data_layout());
2345  rt_module_copy->setTargetTriple(get_gpu_target_triple_string());
2346  if (is_udf_module_present()) {
2347  CodeGenerator::link_udf_module(udf_gpu_module, *rt_module_copy, cgen_state_.get());
2348  }
2349  if (is_rt_udf_module_present()) {
 2350  CodeGenerator::link_udf_module(
 2351  rt_udf_gpu_module, *rt_module_copy, cgen_state_.get());
2352  }
2353  }
2354 
2355  cgen_state_->module_ = rt_module_copy.release();
2356  AUTOMATIC_IR_METADATA(cgen_state_.get());
2357 
2358  auto agg_fnames =
2359  get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
2360 
2361  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();
2362 
2363  const bool is_group_by{query_mem_desc->isGroupBy()};
2364  auto [query_func, row_func_call] = is_group_by
2365  ? query_group_by_template(cgen_state_->module_,
2366  co.hoist_literals,
2367  *query_mem_desc,
2368  co.device_type,
2369  ra_exe_unit.scan_limit,
2370  gpu_smem_context)
2371  : query_template(cgen_state_->module_,
2372  agg_slot_count,
2373  co.hoist_literals,
2374  !!ra_exe_unit.estimator,
2375  gpu_smem_context);
2376  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
2377  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
2378  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);
2379 
2380  cgen_state_->query_func_ = query_func;
2381  cgen_state_->row_func_call_ = row_func_call;
2382  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
2383  &query_func->getEntryBlock().front());
2384 
2385  // Generate the function signature and column head fetches s.t.
2386  // double indirection isn't needed in the inner loop
2387  auto& fetch_bb = query_func->front();
2388  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
2389  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
2390  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
2391  query_func->args().begin(),
2392  fetch_ir_builder,
2393  cgen_state_->context_);
2394  CHECK_EQ(ra_exe_unit.input_col_descs.size(), col_heads.size());
2395 
2396  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
2397  is_group_by ? 0 : agg_slot_count,
2398  co.hoist_literals,
2399  cgen_state_->module_,
2400  cgen_state_->context_);
2401  CHECK(cgen_state_->row_func_);
2402  cgen_state_->row_func_bb_ =
2403  llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
2404 
2406  auto filter_func_ft =
2407  llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
2408  cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
2409  llvm::Function::ExternalLinkage,
2410  "filter_func",
2411  cgen_state_->module_);
2412  CHECK(cgen_state_->filter_func_);
2413  cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
2414  cgen_state_->context_, "entry", cgen_state_->filter_func_);
2415  }
2416 
2417  cgen_state_->current_func_ = cgen_state_->row_func_;
2418  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2419 
2420  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
2421  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
2422  const auto join_loops =
2423  buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
2424 
2425  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
2426  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
2427  if (is_not_deleted_bb) {
2428  cgen_state_->row_func_bb_ = is_not_deleted_bb;
2429  }
2430  if (!join_loops.empty()) {
2431  codegenJoinLoops(join_loops,
2432  body_execution_unit,
2433  group_by_and_aggregate,
2434  query_func,
2435  cgen_state_->row_func_bb_,
2436  *(query_mem_desc.get()),
2437  co,
2438  eo);
2439  } else {
2440  const bool can_return_error = compileBody(
2441  ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
 2442  if (can_return_error || cgen_state_->needs_error_check_ || eo.with_dynamic_watchdog ||
 2443  eo.allow_runtime_query_interrupt) {
 2444  createErrorCheckControlFlow(query_func,
 2445  eo.with_dynamic_watchdog,
 2446  eo.allow_runtime_query_interrupt,
 2447  co.device_type,
 2448  group_by_and_aggregate.query_infos_);
2449  }
2450  }
2451  std::vector<llvm::Value*> hoisted_literals;
2452 
2453  if (co.hoist_literals) {
2454  VLOG(1) << "number of hoisted literals: "
2455  << cgen_state_->query_func_literal_loads_.size()
2456  << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
2457  << " bytes";
2458  }
2459 
2460  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
2461  // we have some hoisted literals...
2462  hoisted_literals = inlineHoistedLiterals();
2463  }
2464 
2465  // replace the row func placeholder call with the call to the actual row func
2466  std::vector<llvm::Value*> row_func_args;
2467  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumArgOperands(); ++i) {
2468  row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
2469  }
2470  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
2471  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
2472  // push hoisted literals arguments, if any
2473  row_func_args.insert(
2474  row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
2475  llvm::ReplaceInstWithInst(
2476  cgen_state_->row_func_call_,
2477  llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));
2478 
2479  // replace the filter func placeholder call with the call to the actual filter func
2480  if (cgen_state_->filter_func_) {
2481  std::vector<llvm::Value*> filter_func_args;
2482  for (auto arg_it = cgen_state_->filter_func_args_.begin();
2483  arg_it != cgen_state_->filter_func_args_.end();
2484  ++arg_it) {
2485  filter_func_args.push_back(arg_it->first);
2486  }
2487  llvm::ReplaceInstWithInst(
2488  cgen_state_->filter_func_call_,
2489  llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
2490  }
2491 
2492  // Aggregate
2493  plan_state_->init_agg_vals_ =
2494  init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);
2495 
2496  /*
2497  * If we have decided to use GPU shared memory (decision is not made here), then
2498  * we generate proper code for extra components that it needs (buffer initialization and
2499  * gpu reduction from shared memory to global memory). We then replace these functions
2500  * into the already compiled query_func (replacing two placeholders, write_back_nop and
2501  * init_smem_nop). The rest of the code should be as before (row_func, etc.).
2502  */
2503  if (gpu_smem_context.isSharedMemoryUsed()) {
2504  if (query_mem_desc->getQueryDescriptionType() ==
 2505  QueryDescriptionType::GroupByPerfectHash) {
 2506  GpuSharedMemCodeBuilder gpu_smem_code(
2507  cgen_state_->module_,
2508  cgen_state_->context_,
2509  *query_mem_desc,
 2510  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc),
 2511  plan_state_->init_agg_vals_);
2512  gpu_smem_code.codegen();
2513  gpu_smem_code.injectFunctionsInto(query_func);
2514 
2515  // helper functions are used for caching purposes later
2516  cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
2517  cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
2518  LOG(IR) << gpu_smem_code.toString();
2519  }
2520  }
2521 
2522  auto multifrag_query_func = cgen_state_->module_->getFunction(
2523  "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
2524  CHECK(multifrag_query_func);
2525 
2527  insertErrorCodeChecker(multifrag_query_func, co.hoist_literals);
2528  }
2529 
2530  bind_query(query_func,
2531  "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
2532  multifrag_query_func,
2533  cgen_state_->module_);
2534 
2535  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
2536  if (cgen_state_->filter_func_) {
2537  root_funcs.push_back(cgen_state_->filter_func_);
2538  }
2539  auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
2540  *cgen_state_->module_, root_funcs, {multifrag_query_func});
2541 
2542  // Always inline the row function and the filter function.
2543  // We don't want register spills in the inner loops.
2544  // LLVM seems to correctly free up alloca instructions
2545  // in these functions even when they are inlined.
2546  mark_function_always_inline(cgen_state_->row_func_);
2547  if (cgen_state_->filter_func_) {
2548  mark_function_always_inline(cgen_state_->filter_func_);
2549  }
2550 
2551 #ifndef NDEBUG
2552  // Add helpful metadata to the LLVM IR for debugging.
 2553  AUTOMATIC_IR_METADATA_DONE();
 2554 #endif
2555 
2556  // Serialize the important LLVM IR functions to text for SQL EXPLAIN.
2557  std::string llvm_ir;
2558  if (eo.just_explain) {
 2559  if (co.explain_type == ExecutorExplainType::Optimized) {
 2560 #ifdef WITH_JIT_DEBUG
2561  throw std::runtime_error(
2562  "Explain optimized not available when JIT runtime debug symbols are enabled");
2563 #else
2564  // Note that we don't run the NVVM reflect pass here. Use LOG(IR) to get the
2565  // optimized IR after NVVM reflect
2566  llvm::legacy::PassManager pass_manager;
2567  optimize_ir(query_func, cgen_state_->module_, pass_manager, live_funcs, co);
2568 #endif // WITH_JIT_DEBUG
2569  }
2570  llvm_ir =
2571  serialize_llvm_object(multifrag_query_func) + serialize_llvm_object(query_func) +
2572  serialize_llvm_object(cgen_state_->row_func_) +
2573  (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2574  : "");
2575 
2576 #ifndef NDEBUG
2577  llvm_ir += serialize_llvm_metadata_footnotes(query_func, cgen_state_.get());
2578 #endif
2579  }
2580 
2581  LOG(IR) << "\n\n" << query_mem_desc->toString() << "\n";
2582  LOG(IR) << "IR for the "
2583  << (co.device_type == ExecutorDeviceType::CPU ? "CPU:\n" : "GPU:\n");
2584 #ifdef NDEBUG
2585  LOG(IR) << serialize_llvm_object(query_func)
2586  << serialize_llvm_object(cgen_state_->row_func_)
2587  << (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2588  : "")
2589  << "\nEnd of IR";
2590 #else
2591  LOG(IR) << serialize_llvm_object(cgen_state_->module_) << "\nEnd of IR";
2592 #endif
2593 
2594  // Run some basic validation checks on the LLVM IR before code is generated below.
2595  verify_function_ir(cgen_state_->row_func_);
2596  if (cgen_state_->filter_func_) {
2597  verify_function_ir(cgen_state_->filter_func_);
2598  }
2599 
2600  // Generate final native code from the LLVM IR.
2601  return std::make_tuple(
 2602  CompilationResult{
 2603  co.device_type == ExecutorDeviceType::CPU
 2604  ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
2605  : optimizeAndCodegenGPU(query_func,
2606  multifrag_query_func,
2607  live_funcs,
2608  is_group_by || ra_exe_unit.estimator,
2609  cuda_mgr,
2610  co),
2611  cgen_state_->getLiterals(),
2612  output_columnar,
2613  llvm_ir,
2614  std::move(gpu_smem_context)},
2615  std::move(query_mem_desc));
2616 }
2617 
2618 void Executor::insertErrorCodeChecker(llvm::Function* query_func, bool hoist_literals) {
2619  auto query_stub_func_name =
2620  "query_stub" + std::string(hoist_literals ? "_hoisted_literals" : "");
2621  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
2622  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
2623  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
2624  continue;
2625  }
2626  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
2627  if (std::string(row_func_call.getCalledFunction()->getName()) ==
2628  query_stub_func_name) {
2629  auto next_inst_it = inst_it;
2630  ++next_inst_it;
2631  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
2632  auto& br_instr = bb_it->back();
2633  llvm::IRBuilder<> ir_builder(&br_instr);
2634  llvm::Value* err_lv = &*inst_it;
2635  auto error_check_bb =
2636  bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr), ".error_check");
2637  llvm::Value* error_code_arg = nullptr;
2638  auto arg_cnt = 0;
2639  for (auto arg_it = query_func->arg_begin(); arg_it != query_func->arg_end();
2640  arg_it++, ++arg_cnt) {
 2641  // since the multi_frag_* func has anonymous arguments, we use the argument
 2642  // offset explicitly to locate the "error_code" argument in its argument list
2643  if (hoist_literals) {
2644  if (arg_cnt == 9) {
2645  error_code_arg = &*arg_it;
2646  break;
2647  }
2648  } else {
2649  if (arg_cnt == 8) {
2650  error_code_arg = &*arg_it;
2651  break;
2652  }
2653  }
2654  }
2655  CHECK(error_code_arg);
2656  llvm::Value* err_code = nullptr;
 2657  if (g_enable_runtime_query_interrupt) {
 2658  // decide the final error code, taking the interrupt status into account
2659  auto& check_interrupt_br_instr = bb_it->back();
2660  auto interrupt_check_bb = llvm::BasicBlock::Create(
2661  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
2662  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
2663  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
2664  cgen_state_->module_->getFunction("check_interrupt"), {});
2665  auto detected_error = interrupt_checker_ir_builder.CreateCall(
2666  cgen_state_->module_->getFunction("get_error_code"),
2667  std::vector<llvm::Value*>{error_code_arg});
2668  err_code = interrupt_checker_ir_builder.CreateSelect(
2669  detected_interrupt,
2670  cgen_state_->llInt(Executor::ERR_INTERRUPTED),
2671  detected_error);
2672  interrupt_checker_ir_builder.CreateBr(error_check_bb);
2673  llvm::ReplaceInstWithInst(&check_interrupt_br_instr,
2674  llvm::BranchInst::Create(interrupt_check_bb));
2675  ir_builder.SetInsertPoint(&br_instr);
2676  } else {
2677  // uses error code returned from row_func and skip to check interrupt status
2678  ir_builder.SetInsertPoint(&br_instr);
2679  err_code =
2680  ir_builder.CreateCall(cgen_state_->module_->getFunction("get_error_code"),
2681  std::vector<llvm::Value*>{error_code_arg});
2682  }
2683  err_lv = ir_builder.CreateICmp(
2684  llvm::ICmpInst::ICMP_NE, err_code, cgen_state_->llInt(0));
2685  auto error_bb = llvm::BasicBlock::Create(
2686  cgen_state_->context_, ".error_exit", query_func, new_bb);
2687  llvm::CallInst::Create(cgen_state_->module_->getFunction("record_error_code"),
2688  std::vector<llvm::Value*>{err_code, error_code_arg},
2689  "",
2690  error_bb);
2691  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
2692  llvm::ReplaceInstWithInst(&br_instr,
2693  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
2694  break;
2695  }
2696  }
2697  }
2698 }
2699 
 2700 llvm::BasicBlock* Executor::codegenSkipDeletedOuterTableRow(
 2701  const RelAlgExecutionUnit& ra_exe_unit,
2702  const CompilationOptions& co) {
2703  AUTOMATIC_IR_METADATA(cgen_state_.get());
2704  if (!co.filter_on_deleted_column) {
2705  return nullptr;
2706  }
2707  CHECK(!ra_exe_unit.input_descs.empty());
2708  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
2709  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
2710  return nullptr;
2711  }
2712  const auto deleted_cd =
2713  plan_state_->getDeletedColForTable(outer_input_desc.getTableId());
2714  if (!deleted_cd) {
2715  return nullptr;
2716  }
2717  CHECK(deleted_cd->columnType.is_boolean());
2718  const auto deleted_expr =
2719  makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
2720  outer_input_desc.getTableId(),
2721  deleted_cd->columnId,
2722  outer_input_desc.getNestLevel());
2723  CodeGenerator code_generator(this);
2724  const auto is_deleted =
2725  code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
2726  const auto is_deleted_bb = llvm::BasicBlock::Create(
2727  cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
2728  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
2729  cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
2730  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
2731  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
2732  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2733  cgen_state_->ir_builder_.SetInsertPoint(bb);
2734  return bb;
2735 }
2736 
2737 bool Executor::compileBody(const RelAlgExecutionUnit& ra_exe_unit,
2738  GroupByAndAggregate& group_by_and_aggregate,
 2739  const QueryMemoryDescriptor& query_mem_desc,
 2740  const CompilationOptions& co,
2741  const GpuSharedMemoryContext& gpu_smem_context) {
2742  AUTOMATIC_IR_METADATA(cgen_state_.get());
2743 
2744  // Switch the code generation into a separate filter function if enabled.
2745  // Note that accesses to function arguments are still codegenned from the
2746  // row function's arguments, then later automatically forwarded and
2747  // remapped into filter function arguments by redeclareFilterFunction().
2748  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
2749  llvm::Value* loop_done{nullptr};
2750  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
2751  if (cgen_state_->filter_func_) {
2752  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2753  auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
2754  cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
2755  row_func_entry_bb->begin());
2756  loop_done = cgen_state_->ir_builder_.CreateAlloca(
2757  get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
2758  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2759  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
2760  }
2761  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
2762  cgen_state_->current_func_ = cgen_state_->filter_func_;
2763  fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
2764  }
2765 
2766  // generate the code for the filter
2767  std::vector<Analyzer::Expr*> primary_quals;
2768  std::vector<Analyzer::Expr*> deferred_quals;
2769  bool short_circuited =
2770  CodeGenerator::prioritizeQuals(ra_exe_unit, primary_quals, deferred_quals);
2771  if (short_circuited) {
2772  VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
2773  << "short-circuited and deferred " << std::to_string(deferred_quals.size())
2774  << " quals";
2775  }
2776  llvm::Value* filter_lv = cgen_state_->llBool(true);
2777  CodeGenerator code_generator(this);
2778  for (auto expr : primary_quals) {
2779  // Generate the filter for primary quals
2780  auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
2781  filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
2782  }
2783  CHECK(filter_lv->getType()->isIntegerTy(1));
2784  llvm::BasicBlock* sc_false{nullptr};
2785  if (!deferred_quals.empty()) {
2786  auto sc_true = llvm::BasicBlock::Create(
2787  cgen_state_->context_, "sc_true", cgen_state_->current_func_);
2788  sc_false = llvm::BasicBlock::Create(
2789  cgen_state_->context_, "sc_false", cgen_state_->current_func_);
2790  cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
2791  cgen_state_->ir_builder_.SetInsertPoint(sc_false);
2792  if (ra_exe_unit.join_quals.empty()) {
2793  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
2794  }
2795  cgen_state_->ir_builder_.SetInsertPoint(sc_true);
2796  filter_lv = cgen_state_->llBool(true);
2797  }
2798  for (auto expr : deferred_quals) {
2799  filter_lv = cgen_state_->ir_builder_.CreateAnd(
2800  filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
2801  }
2802 
2803  CHECK(filter_lv->getType()->isIntegerTy(1));
2804  auto ret = group_by_and_aggregate.codegen(
2805  filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);
2806 
2807  // Switch the code generation back to the row function if a filter
2808  // function was enabled.
2809  if (cgen_state_->filter_func_) {
2810  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2811  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
2812  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2813  }
2814 
2815  redeclareFilterFunction();
2816 
2817  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2818  cgen_state_->current_func_ = cgen_state_->row_func_;
2819  cgen_state_->filter_func_call_ =
2820  cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});
2821 
2822  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2823  auto loop_done_true = llvm::BasicBlock::Create(
2824  cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
2825  auto loop_done_false = llvm::BasicBlock::Create(
2826  cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
2827  auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(loop_done);
2828  cgen_state_->ir_builder_.CreateCondBr(
2829  loop_done_flag, loop_done_true, loop_done_false);
2830  cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
2831  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
2832  cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
2833  } else {
2834  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
2835  }
2836  }
2837  return ret;
2838 }
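// Editorial sketch (pseudo-code, not part of this file): with qual
// prioritization enabled, the filter generated by compileBody behaves like
//
//   bool filter = true;
//   for (auto* q : primary_quals) filter &= eval(q);     // cheap quals first
//   if (!filter && join_quals.empty()) return 0;         // sc_false: short-circuit exit
//   for (auto* q : deferred_quals) filter &= eval(q);    // expensive quals last
//   return group_by_and_aggregate.codegen(filter, ...);  // aggregate surviving rows
//
// `eval` is a stand-in for the per-expression code generation performed by
// CodeGenerator::codegen()/toBool().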
2839 
2840 std::unique_ptr<llvm::Module> runtime_module_shallow_copy(CgenState* cgen_state) {
2841  return llvm::CloneModule(
2842  *g_rt_module.get(), cgen_state->vmap_, [](const llvm::GlobalValue* gv) {
2843  auto func = llvm::dyn_cast<llvm::Function>(gv);
2844  if (!func) {
2845  return true;
2846  }
2847  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
2848  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage);
2849  });
2850 }
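// Editorial note (assumption about intent, not original commentary): the
// CloneModule callback above returns true only for functions with private or
// internal linkage, so only those bodies are copied into the per-query module
// (they cannot be resolved by name later); every other runtime function stays
// a declaration and is materialized on demand.  A hypothetical call site:
//
//   auto per_query_module = runtime_module_shallow_copy(cgen_state);
//   // cgen_state->vmap_ now maps values of g_rt_module to their clones.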
2851 
2852 std::vector<llvm::Value*> generate_column_heads_load(const int num_columns,
2853  llvm::Value* byte_stream_arg,
2854  llvm::IRBuilder<>& ir_builder,
2855  llvm::LLVMContext& ctx) {
2856  CHECK(byte_stream_arg);
2857  const auto max_col_local_id = num_columns - 1;
2858 
2859  std::vector<llvm::Value*> col_heads;
2860  for (int col_id = 0; col_id <= max_col_local_id; ++col_id) {
2861  col_heads.emplace_back(ir_builder.CreateLoad(ir_builder.CreateGEP(
2862  byte_stream_arg, llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), col_id))));
2863  }
2864  return col_heads;
2865 }
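// Editorial sketch (illustrative IR, not emitted verbatim; the i8* element
// type is an assumption, the real type comes from byte_stream_arg): for three
// columns the loop above loads one pointer per column head, roughly
//
//   %p0   = getelementptr i8*, i8** %byte_stream, i32 0
//   %col0 = load i8*, i8** %p0
//   %p1   = getelementptr i8*, i8** %byte_stream, i32 1
//   %col1 = load i8*, i8** %p1
//   %p2   = getelementptr i8*, i8** %byte_stream, i32 2
//   %col2 = load i8*, i8** %p2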
2866 