NativeCodegen.cpp
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CodeGenerator.h"
18 #include "Execute.h"
20 #include "GpuSharedMemoryUtils.h"
23 #include "QueryTemplateGenerator.h"
24 
25 #include "CudaMgr/CudaMgr.h"
28 #include "Shared/MathUtils.h"
29 #include "StreamingTopN.h"
30 
31 #if LLVM_VERSION_MAJOR < 9
32 static_assert(false, "LLVM Version >= 9 is required.");
33 #endif
34 
35 #include <llvm/Bitcode/BitcodeReader.h>
36 #include <llvm/Bitcode/BitcodeWriter.h>
37 #include <llvm/ExecutionEngine/MCJIT.h>
38 #include <llvm/IR/Attributes.h>
39 #include <llvm/IR/GlobalValue.h>
40 #include <llvm/IR/InstIterator.h>
41 #include <llvm/IR/IntrinsicInst.h>
42 #include <llvm/IR/Intrinsics.h>
43 #include <llvm/IR/LegacyPassManager.h>
44 #include <llvm/IR/Verifier.h>
45 #include <llvm/IRReader/IRReader.h>
46 #include <llvm/Linker/Linker.h>
47 #include <llvm/Support/Casting.h>
48 #include <llvm/Support/FileSystem.h>
49 #include <llvm/Support/FormattedStream.h>
50 #include <llvm/Support/MemoryBuffer.h>
51 #include <llvm/Support/SourceMgr.h>
52 #include <llvm/Support/TargetRegistry.h>
53 #include <llvm/Support/TargetSelect.h>
54 #include <llvm/Support/raw_os_ostream.h>
55 #include <llvm/Support/raw_ostream.h>
56 #include <llvm/Transforms/IPO.h>
57 #include <llvm/Transforms/IPO/AlwaysInliner.h>
58 #include <llvm/Transforms/IPO/PassManagerBuilder.h>
59 #include <llvm/Transforms/InstCombine/InstCombine.h>
60 #include <llvm/Transforms/Instrumentation.h>
61 #include <llvm/Transforms/Scalar.h>
62 #include <llvm/Transforms/Scalar/InstSimplifyPass.h>
63 #include <llvm/Transforms/Utils.h>
64 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
65 #include <llvm/Transforms/Utils/Cloning.h>
66 
67 #if LLVM_VERSION_MAJOR >= 11
68 #include <llvm/Support/Host.h>
69 #endif
70 
72 
73 std::unique_ptr<llvm::Module> udf_gpu_module;
74 std::unique_ptr<llvm::Module> udf_cpu_module;
75 std::unique_ptr<llvm::Module> rt_udf_gpu_module;
76 std::unique_ptr<llvm::Module> rt_udf_cpu_module;
77 
78 extern std::unique_ptr<llvm::Module> g_rt_module;
79 
80 #ifdef HAVE_CUDA
81 extern std::unique_ptr<llvm::Module> g_rt_libdevice_module;
82 #endif
83 
84 #ifdef ENABLE_GEOS
85 extern std::unique_ptr<llvm::Module> g_rt_geos_module;
86 
87 #include <llvm/Support/DynamicLibrary.h>
88 
89 #ifndef GEOS_LIBRARY_FILENAME
90 #error Configuration should include GEOS library file name
91 #endif
92 std::unique_ptr<std::string> g_libgeos_so_filename(
93  new std::string(GEOS_LIBRARY_FILENAME));
94 static llvm::sys::DynamicLibrary geos_dynamic_library;
95 static std::mutex geos_init_mutex;
96 
97 namespace {
98 
99 void load_geos_dynamic_library() {
100  std::lock_guard<std::mutex> guard(geos_init_mutex);
101 
102  if (!geos_dynamic_library.isValid()) {
103  if (!g_libgeos_so_filename || g_libgeos_so_filename->empty()) {
104  LOG(WARNING) << "Misconfigured GEOS library file name, trying 'libgeos_c.so'";
105  g_libgeos_so_filename.reset(new std::string("libgeos_c.so"));
106  }
107  auto filename = *g_libgeos_so_filename;
108  std::string error_message;
109  geos_dynamic_library =
110  llvm::sys::DynamicLibrary::getPermanentLibrary(filename.c_str(), &error_message);
111  if (!geos_dynamic_library.isValid()) {
112  LOG(ERROR) << "Failed to load GEOS library '" + filename + "'";
113  std::string exception_message = "Failed to load GEOS library: " + error_message;
114  throw std::runtime_error(exception_message.c_str());
115  } else {
116  LOG(INFO) << "Loaded GEOS library '" + filename + "'";
117  }
118  }
119 }
120 
121 } // namespace
122 #endif
123 
124 namespace {
125 
126 void throw_parseIR_error(const llvm::SMDiagnostic& parse_error,
127  std::string src = "",
128  const bool is_gpu = false) {
129  std::string excname = (is_gpu ? "NVVM IR ParseError: " : "LLVM IR ParseError: ");
130  llvm::raw_string_ostream ss(excname);
131  parse_error.print(src.c_str(), ss, false, false);
132  throw ParseIRError(ss.str());
133 }
134 
135 /* SHOW_DEFINED(<llvm::Module instance>) prints the function names
136  that are defined in the given LLVM Module instance.
137 
138  SHOW_FUNCTIONS(<llvm::Module instance>) prints the function names
139  of all used functions in the given LLVM Module
140  instance. Declarations are marked with `[decl]` as a name suffix.
141 
142  Useful for debugging.
143 */
144 
145 #define SHOW_DEFINED(MODULE) \
146  { \
147  std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
148  ::show_defined(MODULE); \
149  }
150 
151 #define SHOW_FUNCTIONS(MODULE) \
152  { \
153  std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
154  ::show_functions(MODULE); \
155  }
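// Illustrative usage sketch (not in the original source); `some_module` is a
// hypothetical llvm::Module in scope at the call site:
//
//   SHOW_DEFINED(some_module);    // prints "<func>#<line>: some_module defines: foo, bar, ..."
//   SHOW_FUNCTIONS(some_module);  // also lists declarations, suffixed with [decl]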
156 
157 template <typename T = void>
158 void show_defined(llvm::Module& module) {
159  std::cout << "defines: ";
160  for (auto& f : module.getFunctionList()) {
161  if (!f.isDeclaration()) {
162  std::cout << f.getName().str() << ", ";
163  }
164  }
165  std::cout << std::endl;
166 }
167 
168 template <typename T = void>
169 void show_defined(llvm::Module* module) {
170  if (module == nullptr) {
171  std::cout << "is null" << std::endl;
172  } else {
173  show_defined(*module);
174  }
175 }
176 
177 template <typename T = void>
178 void show_defined(std::unique_ptr<llvm::Module>& module) {
179  show_defined(module.get());
180 }
181 
182 /*
183  scan_function_calls(module, defined, undefined, ignored) computes
184  defined and undefined sets of function names:
185 
186  - defined functions are those that are defined in the given module
187 
188  - undefined functions are those that are called by defined functions
189  but that are not defined in the given module
190 
191  - ignored functions are functions that may be undefined but will not
192  be listed in the set of undefined functions.
193 
194  Useful for debugging.
195 */
196 template <typename T = void>
197 void scan_function_calls(llvm::Function& F,
198  std::unordered_set<std::string>& defined,
199  std::unordered_set<std::string>& undefined,
200  const std::unordered_set<std::string>& ignored) {
201  for (llvm::inst_iterator I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
202  if (auto* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
203  auto* F2 = CI->getCalledFunction();
204  if (F2 != nullptr) {
205  auto F2name = F2->getName().str();
206  if (F2->isDeclaration()) {
207  if (F2name.rfind("__", 0) !=
208  0 // assume symbols with double underscore are defined
209  && F2name.rfind("llvm.", 0) !=
210  0 // TODO: this may give false positive for NVVM intrinsics
211  && ignored.find(F2name) == ignored.end() // not in ignored list
212  ) {
213  undefined.emplace(F2name);
214  }
215  } else {
216  if (defined.find(F2name) == defined.end()) {
217  defined.emplace(F2name);
218  scan_function_calls<T>(*F2, defined, undefined, ignored);
219  }
220  }
221  }
222  }
223  }
224 }
225 
226 template <typename T = void>
227 void scan_function_calls(llvm::Module& module,
228  std::unordered_set<std::string>& defined,
229  std::unordered_set<std::string>& undefined,
230  const std::unordered_set<std::string>& ignored) {
231  for (auto& F : module) {
232  if (!F.isDeclaration()) {
233  scan_function_calls(F, defined, undefined, ignored);
234  }
235  }
236 }
237 
238 template <typename T = void>
239 std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string>>
240 scan_function_calls(llvm::Module& module,
241  const std::unordered_set<std::string>& ignored = {}) {
242  std::unordered_set<std::string> defined, undefined;
243  scan_function_calls(module, defined, undefined, ignored);
244  return std::make_tuple(defined, undefined);
245 }
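// Debugging sketch (assumed usage, not part of the original source): the
// tuple-returning overload can be unpacked with structured bindings, e.g.
//
//   auto [defined, undefined] = scan_function_calls(*module);
//   for (const auto& name : undefined) {
//     std::cout << "unresolved call target: " << name << std::endl;
//   }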
246 
247 #if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
248 void eliminate_dead_self_recursive_funcs(
249  llvm::Module& M,
250  const std::unordered_set<llvm::Function*>& live_funcs) {
251  std::vector<llvm::Function*> dead_funcs;
252  for (auto& F : M) {
253  bool bAlive = false;
254  if (live_funcs.count(&F)) {
255  continue;
256  }
257  for (auto U : F.users()) {
258  auto* C = llvm::dyn_cast<const llvm::CallInst>(U);
259  if (!C || C->getParent()->getParent() != &F) {
260  bAlive = true;
261  break;
262  }
263  }
264  if (!bAlive) {
265  dead_funcs.push_back(&F);
266  }
267  }
268  for (auto pFn : dead_funcs) {
269  pFn->eraseFromParent();
270  }
271 }
272 
273 #ifdef HAVE_CUDA
274 
275 // check if linking with libdevice is required
276 // libdevice functions have a __nv_* prefix
277 bool check_module_requires_libdevice(llvm::Module* module) {
278  for (llvm::Function& F : *module) {
279  if (F.hasName() && F.getName().startswith("__nv_")) {
280  LOG(INFO) << "Module requires linking with libdevice: " << std::string(F.getName());
281  return true;
282  }
283  }
284  LOG(DEBUG1) << "module does not require linking against libdevice";
285  return false;
286 }
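// Example (hedged): a GPU UDF that calls a libdevice math routine, e.g. one
// that leaves a declaration such as
//
//   declare double @__nv_pow(double, double)
//
// in the module, makes this check return true so the caller links libdevice in.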
287 
288 // Adds the missing intrinsic declarations to the given module
289 void add_intrinsics_to_module(llvm::Module* module) {
290  for (llvm::Function& F : *module) {
291  for (llvm::Instruction& I : instructions(F)) {
292  if (llvm::IntrinsicInst* ii = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
293  if (llvm::Intrinsic::isOverloaded(ii->getIntrinsicID())) {
294  llvm::Type* Tys[] = {ii->getFunctionType()->getReturnType()};
295  llvm::Function& decl_fn =
296  *llvm::Intrinsic::getDeclaration(module, ii->getIntrinsicID(), Tys);
297  ii->setCalledFunction(&decl_fn);
298  } else {
299  // inserts the declaration into the module if not present
300  llvm::Intrinsic::getDeclaration(module, ii->getIntrinsicID());
301  }
302  }
303  }
304  }
305 }
306 
307 #endif
308 
309 void optimize_ir(llvm::Function* query_func,
310  llvm::Module* module,
311  llvm::legacy::PassManager& pass_manager,
312  const std::unordered_set<llvm::Function*>& live_funcs,
313  const CompilationOptions& co) {
314  pass_manager.add(llvm::createAlwaysInlinerLegacyPass());
315  pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
316  pass_manager.add(llvm::createInstSimplifyLegacyPass());
317  pass_manager.add(llvm::createInstructionCombiningPass());
318  pass_manager.add(llvm::createGlobalOptimizerPass());
319 
320  pass_manager.add(llvm::createLICMPass());
321  if (co.opt_level == ExecutorOptLevel::LoopStrengthReduction) {
322  pass_manager.add(llvm::createLoopStrengthReducePass());
323  }
324  pass_manager.run(*module);
325 
326  eliminate_dead_self_recursive_funcs(*module, live_funcs);
327 }
328 #endif
329 
330 } // namespace
331 
332 ExecutionEngineWrapper::ExecutionEngineWrapper() {}
333 
334 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine)
335  : execution_engine_(execution_engine) {}
336 
337 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine,
338  const CompilationOptions& co)
339  : execution_engine_(execution_engine) {
340  if (execution_engine_) {
341  if (co.register_intel_jit_listener) {
342 #ifdef ENABLE_INTEL_JIT_LISTENER
343  intel_jit_listener_.reset(llvm::JITEventListener::createIntelJITEventListener());
344  CHECK(intel_jit_listener_);
345  execution_engine_->RegisterJITEventListener(intel_jit_listener_.get());
346  LOG(INFO) << "Registered IntelJITEventListener";
347 #else
348  LOG(WARNING) << "This build is not Intel JIT Listener enabled. Ignoring Intel JIT "
349  "listener configuration parameter.";
350 #endif // ENABLE_INTEL_JIT_LISTENER
351  }
352  }
353 }
354 
355 ExecutionEngineWrapper& ExecutionEngineWrapper::operator=(
356  llvm::ExecutionEngine* execution_engine) {
357  execution_engine_.reset(execution_engine);
358  intel_jit_listener_ = nullptr;
359  return *this;
360 }
361 
362 void verify_function_ir(const llvm::Function* func) {
363  std::stringstream err_ss;
364  llvm::raw_os_ostream err_os(err_ss);
365  err_os << "\n-----\n";
366  if (llvm::verifyFunction(*func, &err_os)) {
367  err_os << "\n-----\n";
368  func->print(err_os, nullptr);
369  err_os << "\n-----\n";
370  LOG(FATAL) << err_ss.str();
371  }
372 }
373 
374 std::shared_ptr<CompilationContext> Executor::getCodeFromCache(const CodeCacheKey& key,
375  const CodeCache& cache) {
376  auto it = cache.find(key);
377  if (it != cache.cend()) {
378  delete cgen_state_->module_;
379  cgen_state_->module_ = it->second.second;
380  return it->second.first;
381  }
382  return {};
383 }
384 
385 void Executor::addCodeToCache(const CodeCacheKey& key,
386  std::shared_ptr<CompilationContext> compilation_context,
387  llvm::Module* module,
388  CodeCache& cache) {
389  cache.put(key,
390  std::make_pair<std::shared_ptr<CompilationContext>, decltype(module)>(
391  std::move(compilation_context), std::move(module)));
392 }
393 
394 namespace {
395 
396 std::string assemblyForCPU(ExecutionEngineWrapper& execution_engine,
397  llvm::Module* module) {
398  llvm::legacy::PassManager pass_manager;
399  auto cpu_target_machine = execution_engine->getTargetMachine();
400  CHECK(cpu_target_machine);
401  llvm::SmallString<256> code_str;
402  llvm::raw_svector_ostream os(code_str);
403 #if LLVM_VERSION_MAJOR >= 10
404  cpu_target_machine->addPassesToEmitFile(
405  pass_manager, os, nullptr, llvm::CGFT_AssemblyFile);
406 #else
407  cpu_target_machine->addPassesToEmitFile(
408  pass_manager, os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
409 #endif
410  pass_manager.run(*module);
411  return "Assembly for the CPU:\n" + std::string(code_str.str()) + "\nEnd of assembly";
412 }
413 
414 } // namespace
415 
416 ExecutionEngineWrapper CodeGenerator::generateNativeCPUCode(
417  llvm::Function* func,
418  const std::unordered_set<llvm::Function*>& live_funcs,
419  const CompilationOptions& co) {
420  auto module = func->getParent();
421  // run optimizations
422 #ifndef WITH_JIT_DEBUG
423  llvm::legacy::PassManager pass_manager;
424  optimize_ir(func, module, pass_manager, live_funcs, co);
425 #endif // WITH_JIT_DEBUG
426 
427  auto init_err = llvm::InitializeNativeTarget();
428  CHECK(!init_err);
429 
430  llvm::InitializeAllTargetMCs();
431  llvm::InitializeNativeTargetAsmPrinter();
432  llvm::InitializeNativeTargetAsmParser();
433 
434  std::string err_str;
435  std::unique_ptr<llvm::Module> owner(module);
436  llvm::EngineBuilder eb(std::move(owner));
437  eb.setErrorStr(&err_str);
438  eb.setEngineKind(llvm::EngineKind::JIT);
439  llvm::TargetOptions to;
440  to.EnableFastISel = true;
441  eb.setTargetOptions(to);
442  if (co.opt_level == ExecutorOptLevel::ReductionJIT) {
443  eb.setOptLevel(llvm::CodeGenOpt::None);
444  }
445 
446 #ifdef _WIN32
447  // TODO: workaround for data layout mismatch crash for now
448  auto target_machine = eb.selectTarget();
449  CHECK(target_machine);
450  module->setDataLayout(target_machine->createDataLayout());
451 #endif
452 
453  ExecutionEngineWrapper execution_engine(eb.create(), co);
454  CHECK(execution_engine.get());
455  LOG(ASM) << assemblyForCPU(execution_engine, module);
456 
457  execution_engine->finalizeObject();
458  return execution_engine;
459 }
460 
461 std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenCPU(
462  llvm::Function* query_func,
463  llvm::Function* multifrag_query_func,
464  const std::unordered_set<llvm::Function*>& live_funcs,
465  const CompilationOptions& co) {
466  auto module = multifrag_query_func->getParent();
467  CodeCacheKey key{serialize_llvm_object(query_func),
468  serialize_llvm_object(cgen_state_->row_func_)};
469  if (cgen_state_->filter_func_) {
470  key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
471  }
472  for (const auto helper : cgen_state_->helper_functions_) {
473  key.push_back(serialize_llvm_object(helper));
474  }
475  auto cached_code = getCodeFromCache(key, cpu_code_cache_);
476  if (cached_code) {
477  return cached_code;
478  }
479 
480  if (cgen_state_->needs_geos_) {
481 #ifdef ENABLE_GEOS
482  load_geos_dynamic_library();
483 
484  // Read geos runtime module and bind GEOS API function references to GEOS library
485  auto rt_geos_module_copy = llvm::CloneModule(
486  *g_rt_geos_module.get(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
487  auto func = llvm::dyn_cast<llvm::Function>(gv);
488  if (!func) {
489  return true;
490  }
491  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
492  func->getLinkage() ==
493  llvm::GlobalValue::LinkageTypes::InternalLinkage ||
494  func->getLinkage() == llvm::GlobalValue::LinkageTypes::ExternalLinkage);
495  });
496  CodeGenerator::link_udf_module(rt_geos_module_copy,
497  *module,
498  cgen_state_.get(),
499  llvm::Linker::Flags::LinkOnlyNeeded);
500 #else
501  throw std::runtime_error("GEOS is disabled in this build");
502 #endif
503  }
504 
505  auto execution_engine =
506  CodeGenerator::generateNativeCPUCode(query_func, live_funcs, co);
507  auto cpu_compilation_context =
508  std::make_shared<CpuCompilationContext>(std::move(execution_engine));
509  cpu_compilation_context->setFunctionPointer(multifrag_query_func);
510  addCodeToCache(key, cpu_compilation_context, module, cpu_code_cache_);
511  return cpu_compilation_context;
512 }
513 
514 void CodeGenerator::link_udf_module(const std::unique_ptr<llvm::Module>& udf_module,
515  llvm::Module& module,
516  CgenState* cgen_state,
517  llvm::Linker::Flags flags) {
518  // Throw a runtime error if the target module contains functions
519  // with the same names as functions defined in the UDF module.
520  for (auto& f : *udf_module.get()) {
521  auto func = module.getFunction(f.getName());
522  if (!(func == nullptr) && !f.isDeclaration() && flags == llvm::Linker::Flags::None) {
523  LOG(ERROR) << " Attempt to overwrite " << f.getName().str() << " in "
524  << module.getModuleIdentifier() << " from `"
525  << udf_module->getModuleIdentifier() << "`" << std::endl;
526  throw std::runtime_error(
527  "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
528  "function ***");
529  } else {
530  VLOG(1) << " Adding " << f.getName().str() << " to "
531  << module.getModuleIdentifier() << " from `"
532  << udf_module->getModuleIdentifier() << "`" << std::endl;
533  }
534  }
535 
536  std::unique_ptr<llvm::Module> udf_module_copy;
537 
538  udf_module_copy = llvm::CloneModule(*udf_module.get(), cgen_state->vmap_);
539 
540  udf_module_copy->setDataLayout(module.getDataLayout());
541  udf_module_copy->setTargetTriple(module.getTargetTriple());
542 
543  // Initialize linker with module for RuntimeFunctions.bc
544  llvm::Linker ld(module);
545  bool link_error = false;
546 
547  link_error = ld.linkInModule(std::move(udf_module_copy), flags);
548 
549  if (link_error) {
550  throw std::runtime_error("link_udf_module: *** error linking module ***");
551  }
552 }
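// Usage sketch mirroring the call sites later in this file (`geos_module_copy`
// is a hypothetical std::unique_ptr<llvm::Module>):
//
//   CodeGenerator::link_udf_module(geos_module_copy,
//                                  *module,
//                                  cgen_state,
//                                  llvm::Linker::Flags::LinkOnlyNeeded);
//
// With llvm::Linker::Flags::None, a UDF that redefines an existing runtime
// function triggers the overwrite error thrown above.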
553 
554 namespace {
555 
556 std::string cpp_to_llvm_name(const std::string& s) {
557  if (s == "int8_t") {
558  return "i8";
559  }
560  if (s == "int16_t") {
561  return "i16";
562  }
563  if (s == "int32_t") {
564  return "i32";
565  }
566  if (s == "int64_t") {
567  return "i64";
568  }
569  CHECK(s == "float" || s == "double");
570  return s;
571 }
572 
573 std::string gen_array_any_all_sigs() {
574  std::string result;
575  for (const std::string any_or_all : {"any", "all"}) {
576  for (const std::string elem_type :
577  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
578  for (const std::string needle_type :
579  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
580  for (const std::string op_name : {"eq", "ne", "lt", "le", "gt", "ge"}) {
581  result += ("declare i1 @array_" + any_or_all + "_" + op_name + "_" + elem_type +
582  "_" + needle_type + "(i8*, i64, " + cpp_to_llvm_name(needle_type) +
583  ", " + cpp_to_llvm_name(elem_type) + ");\n");
584  }
585  }
586  }
587  }
588  return result;
589 }
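// One concrete declaration produced by the loops above, for any_or_all="any",
// op_name="eq", elem_type="int32_t", needle_type="double":
//
//   declare i1 @array_any_eq_int32_t_double(i8*, i64, double, i32);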
590 
591 std::string gen_translate_null_key_sigs() {
592  std::string result;
593  for (const std::string key_type : {"int8_t", "int16_t", "int32_t", "int64_t"}) {
594  const auto key_llvm_type = cpp_to_llvm_name(key_type);
595  result += "declare i64 @translate_null_key_" + key_type + "(" + key_llvm_type + ", " +
596  key_llvm_type + ", i64);\n";
597  }
598  return result;
599 }
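// For example, the int32_t iteration yields:
//
//   declare i64 @translate_null_key_int32_t(i32, i32, i64);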
600 
601 const std::string cuda_rt_decls =
602  R"( declare void @llvm.dbg.declare(metadata, metadata, metadata) declare void @llvm.dbg.value(metadata, metadata, metadata) declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind declare i64 @get_thread_index(); declare i64 @get_block_index(); declare i32 @pos_start_impl(i32*); declare i32 @group_buff_idx_impl(); declare i32 @pos_step_impl(); declare i8 @thread_warp_idx(i8); declare i64* @init_shared_mem(i64*, i32); declare i64* @init_shared_mem_nop(i64*, i32); declare i64* @declare_dynamic_shared_memory(); declare void @write_back_nop(i64*, i64*, i32); declare void @write_back_non_grouped_agg(i64*, i64*, i32); declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8); declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32); declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32); declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32); declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32); declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32); declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32); declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64); declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64); declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64); declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64); declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64); declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double); declare i64 @get_bucket_key_for_range_double(i8*, i64, double); declare i32 @get_num_buckets_for_bounds(i8*, i32, double, double); declare i64 @get_candidate_rows(i32*, i32, i8*, i32, double, double, i32, i64, i64*, i64, i64, i64); declare i64 @agg_count_shared(i64*, i64); declare i64 @agg_count_skip_val_shared(i64*, i64, i64); declare i32 @agg_count_int32_shared(i32*, i32); declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32); declare i64 @agg_count_double_shared(i64*, double); declare i64 @agg_count_double_skip_val_shared(i64*, double, double); declare i32 @agg_count_float_shared(i32*, float); declare i32 @agg_count_float_skip_val_shared(i32*, float, float); declare i64 @agg_sum_shared(i64*, i64); declare i64 @agg_sum_skip_val_shared(i64*, i64, i64); declare i32 @agg_sum_int32_shared(i32*, i32); declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32); declare void @agg_sum_double_shared(i64*, double); declare void @agg_sum_double_skip_val_shared(i64*, double, double); declare void @agg_sum_float_shared(i32*, float); declare void @agg_sum_float_skip_val_shared(i32*, float, float); declare void @agg_max_shared(i64*, i64); declare void @agg_max_skip_val_shared(i64*, i64, i64); declare void @agg_max_int32_shared(i32*, i32); declare void @agg_max_int32_skip_val_shared(i32*, i32, i32); declare void @agg_max_int16_shared(i16*, i16); declare void @agg_max_int16_skip_val_shared(i16*, i16, i16); declare void @agg_max_int8_shared(i8*, i8); declare void @agg_max_int8_skip_val_shared(i8*, i8, i8); declare void @agg_max_double_shared(i64*, double); declare void @agg_max_double_skip_val_shared(i64*, double, double); declare void @agg_max_float_shared(i32*, float); declare void @agg_max_float_skip_val_shared(i32*, float, float); declare void @agg_min_shared(i64*, i64); declare void 
@agg_min_skip_val_shared(i64*, i64, i64); declare void @agg_min_int32_shared(i32*, i32); declare void @agg_min_int32_skip_val_shared(i32*, i32, i32); declare void @agg_min_int16_shared(i16*, i16); declare void @agg_min_int16_skip_val_shared(i16*, i16, i16); declare void @agg_min_int8_shared(i8*, i8); declare void @agg_min_int8_skip_val_shared(i8*, i8, i8); declare void @agg_min_double_shared(i64*, double); declare void @agg_min_double_skip_val_shared(i64*, double, double); declare void @agg_min_float_shared(i32*, float); declare void @agg_min_float_skip_val_shared(i32*, float, float); declare void @agg_id_shared(i64*, i64); declare i8* @agg_id_varlen_shared(i8*, i64, i8*, i64); declare void @agg_id_int32_shared(i32*, i32); declare void @agg_id_int16_shared(i16*, i16); declare void @agg_id_int8_shared(i8*, i8); declare void @agg_id_double_shared(i64*, double); declare void @agg_id_double_shared_slow(i64*, double*); declare void @agg_id_float_shared(i32*, float); declare i32 @checked_single_agg_id_shared(i64*, i64, i64); declare i32 @checked_single_agg_id_double_shared(i64*, double, double); declare i32 @checked_single_agg_id_double_shared_slow(i64*, double*, double); declare i32 @checked_single_agg_id_float_shared(i32*, float, float); declare i1 @slotEmptyKeyCAS(i64*, i64, i64); declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32); declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16); declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8); declare i64 @datetrunc_century(i64); declare i64 @datetrunc_day(i64); declare i64 @datetrunc_decade(i64); declare i64 @datetrunc_hour(i64); declare i64 @datetrunc_millennium(i64); declare i64 @datetrunc_minute(i64); declare i64 @datetrunc_month(i64); declare i64 @datetrunc_quarter(i64); declare i64 @datetrunc_quarterday(i64); declare i64 @datetrunc_week_monday(i64); declare i64 @datetrunc_week_sunday(i64); declare i64 @datetrunc_week_saturday(i64); declare i64 @datetrunc_year(i64); declare i64 @extract_epoch(i64); declare i64 @extract_dateepoch(i64); declare i64 @extract_quarterday(i64); declare i64 @extract_hour(i64); declare i64 @extract_minute(i64); declare i64 @extract_second(i64); declare i64 @extract_millisecond(i64); declare i64 @extract_microsecond(i64); declare i64 @extract_nanosecond(i64); declare i64 @extract_dow(i64); declare i64 @extract_isodow(i64); declare i64 @extract_day(i64); declare i64 @extract_week_monday(i64); declare i64 @extract_week_sunday(i64); declare i64 @extract_week_saturday(i64); declare i64 @extract_day_of_year(i64); declare i64 @extract_month(i64); declare i64 @extract_quarter(i64); declare i64 @extract_year(i64); declare i64 @DateTruncateHighPrecisionToDate(i64, i64); declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64); declare i64 @DateDiff(i32, i64, i64); declare i64 @DateDiffNullable(i32, i64, i64, i64); declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i32); declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i32, i64); declare i64 @DateAdd(i32, i64, i64); declare i64 @DateAddNullable(i32, i64, i64, i64); declare i64 @DateAddHighPrecision(i32, i64, i64, i32); declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i32, i64); declare i64 @string_decode(i8*, i64); declare i32 @array_size(i8*, i64, i32); declare i32 @array_size_nullable(i8*, i64, i32, i32); declare i32 @fast_fixlen_array_size(i8*, i32); declare i1 @array_is_null(i8*, i64); declare i1 @point_coord_array_is_null(i8*, i64); declare i8* @array_buff(i8*, i64); declare i8* @fast_fixlen_array_buff(i8*, i64); declare i8 @array_at_int8_t(i8*, 
i64, i32); declare i16 @array_at_int16_t(i8*, i64, i32); declare i32 @array_at_int32_t(i8*, i64, i32); declare i64 @array_at_int64_t(i8*, i64, i32); declare float @array_at_float(i8*, i64, i32); declare double @array_at_double(i8*, i64, i32); declare i8 @varlen_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_array_at_int64_t(i8*, i64, i32); declare float @varlen_array_at_float(i8*, i64, i32); declare double @varlen_array_at_double(i8*, i64, i32); declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32); declare float @varlen_notnull_array_at_float(i8*, i64, i32); declare double @varlen_notnull_array_at_double(i8*, i64, i32); declare i8 @array_at_int8_t_checked(i8*, i64, i64, i8); declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16); declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32); declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64); declare float @array_at_float_checked(i8*, i64, i64, float); declare double @array_at_double_checked(i8*, i64, i64, double); declare i32 @char_length(i8*, i32); declare i32 @char_length_nullable(i8*, i32, i32); declare i32 @char_length_encoded(i8*, i32); declare i32 @char_length_encoded_nullable(i8*, i32, i32); declare i32 @key_for_string_encoded(i32); declare i1 @sample_ratio(double, i64); declare i1 @string_like(i8*, i32, i8*, i32, i8); declare i1 @string_ilike(i8*, i32, i8*, i32, i8); declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8); declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8); declare i1 @string_like_simple(i8*, i32, i8*, i32); declare i1 @string_ilike_simple(i8*, i32, i8*, i32); declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8); declare i1 @string_lt(i8*, i32, i8*, i32); declare i1 @string_le(i8*, i32, i8*, i32); declare i1 @string_gt(i8*, i32, i8*, i32); declare i1 @string_ge(i8*, i32, i8*, i32); declare i1 @string_eq(i8*, i32, i8*, i32); declare i1 @string_ne(i8*, i32, i8*, i32); declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8); declare i1 @regexp_like(i8*, i32, i8*, i32, i8); declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8); declare void @linear_probabilistic_count(i8*, i32, i8*, i32); declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, i64); declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64); declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64); declare void @record_error_code(i32, i32*); declare i32 @get_error_code(i32*); declare i1 @dynamic_watchdog(); declare i1 @check_interrupt(); declare void @force_sync(); declare void @sync_warp(); declare void @sync_warp_protected(i64, i64); declare void @sync_threadblock(); declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32); declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64); declare i64* @get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, 
float, float); declare i64* @get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double); declare double @decompress_x_coord_geoint(i32); declare double @decompress_y_coord_geoint(i32); declare i32 @compress_x_coord_geoint(double); declare i32 @compress_y_coord_geoint(double); )" + gen_array_any_all_sigs() +
603  gen_translate_null_key_sigs();
604 
605 #ifdef HAVE_CUDA
606 std::string extension_function_decls(const std::unordered_set<std::string>& udf_decls) {
607  const auto decls =
608  ExtensionFunctionsWhitelist::getLLVMDeclarations(udf_decls, /*is_gpu=*/true);
609  return boost::algorithm::join(decls, "\n");
610 }
611 
612 void legalize_nvvm_ir(llvm::Function* query_func) {
613  // optimizations might add attributes to the function
614  // and NVPTX doesn't understand all of them; play it
615  // safe and clear all attributes
616  clear_function_attributes(query_func);
617  verify_function_ir(query_func);
618 
619  std::vector<llvm::Instruction*> stackrestore_intrinsics;
620  std::vector<llvm::Instruction*> stacksave_intrinsics;
621  std::vector<llvm::Instruction*> lifetime;
622  for (auto& BB : *query_func) {
623  for (llvm::Instruction& I : BB) {
624  if (const llvm::IntrinsicInst* II = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
625  if (II->getIntrinsicID() == llvm::Intrinsic::stacksave) {
626  stacksave_intrinsics.push_back(&I);
627  } else if (II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
628  stackrestore_intrinsics.push_back(&I);
629  } else if (II->getIntrinsicID() == llvm::Intrinsic::lifetime_start ||
630  II->getIntrinsicID() == llvm::Intrinsic::lifetime_end) {
631  lifetime.push_back(&I);
632  }
633  }
634  }
635  }
636 
637  // stacksave and stackrestore intrinsics appear together, and
638  // stackrestore uses the result of stacksave as its argument,
639  // so the stackrestore calls must be removed first.
640  for (auto& II : stackrestore_intrinsics) {
641  II->eraseFromParent();
642  }
643  for (auto& II : stacksave_intrinsics) {
644  II->eraseFromParent();
645  }
646  // Remove lifetime intrinsics as well; NVPTX does not handle them.
647  for (auto& II : lifetime) {
648  II->eraseFromParent();
649  }
650 }
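// Illustrative effect (sketch): NVVM IR of the form
//
//   %sp = call i8* @llvm.stacksave()
//   ...
//   call void @llvm.stackrestore(i8* %sp)
//   call void @llvm.lifetime.end.p0i8(i64 8, i8* %buf)
//
// has all three kinds of intrinsic calls erased before the module is handed to
// the NVPTX backend.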
651 #endif // HAVE_CUDA
652 
653 } // namespace
654 
655 llvm::StringRef get_gpu_target_triple_string() {
656  return llvm::StringRef("nvptx64-nvidia-cuda");
657 }
658 
659 llvm::StringRef get_gpu_data_layout() {
660  return llvm::StringRef(
661  "e-p:64:64:64-i1:8:8-i8:8:8-"
662  "i16:16:16-i32:32:32-i64:64:64-"
663  "f32:32:32-f64:64:64-v16:16:16-"
664  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
665 }
666 
667 std::map<std::string, std::string> get_device_parameters(bool cpu_only) {
668  std::map<std::string, std::string> result;
669 
670  result.insert(std::make_pair("cpu_name", llvm::sys::getHostCPUName()));
671  result.insert(std::make_pair("cpu_triple", llvm::sys::getProcessTriple()));
672  result.insert(
673  std::make_pair("cpu_cores", std::to_string(llvm::sys::getHostNumPhysicalCores())));
674  result.insert(std::make_pair("cpu_threads", std::to_string(cpu_threads())));
675 
676  // https://en.cppreference.com/w/cpp/language/types
677  std::string sizeof_types;
678  sizeof_types += "bool:" + std::to_string(sizeof(bool)) + ";";
679  sizeof_types += "size_t:" + std::to_string(sizeof(size_t)) + ";";
680  sizeof_types += "ssize_t:" + std::to_string(sizeof(ssize_t)) + ";";
681  sizeof_types += "char:" + std::to_string(sizeof(char)) + ";";
682  sizeof_types += "uchar:" + std::to_string(sizeof(unsigned char)) + ";";
683  sizeof_types += "short:" + std::to_string(sizeof(short)) + ";";
684  sizeof_types += "ushort:" + std::to_string(sizeof(unsigned short int)) + ";";
685  sizeof_types += "int:" + std::to_string(sizeof(int)) + ";";
686  sizeof_types += "uint:" + std::to_string(sizeof(unsigned int)) + ";";
687  sizeof_types += "long:" + std::to_string(sizeof(long int)) + ";";
688  sizeof_types += "ulong:" + std::to_string(sizeof(unsigned long int)) + ";";
689  sizeof_types += "longlong:" + std::to_string(sizeof(long long int)) + ";";
690  sizeof_types += "ulonglong:" + std::to_string(sizeof(unsigned long long int)) + ";";
691  sizeof_types += "float:" + std::to_string(sizeof(float)) + ";";
692  sizeof_types += "double:" + std::to_string(sizeof(double)) + ";";
693  sizeof_types += "longdouble:" + std::to_string(sizeof(long double)) + ";";
694  sizeof_types += "voidptr:" + std::to_string(sizeof(void*)) + ";";
695 
696  result.insert(std::make_pair("type_sizeof", sizeof_types));
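// Example of the entry just inserted, on a typical LP64 x86_64 Linux build
// (values are platform dependent and shown only as a sketch):
//
//   "type_sizeof" -> "bool:1;size_t:8;ssize_t:8;char:1;uchar:1;short:2;...;voidptr:8;"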
697 
698  std::string null_values;
699  null_values += "boolean1:" + std::to_string(serialized_null_value<bool>()) + ";";
700  null_values += "boolean8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
701  null_values += "int8:" + std::to_string(serialized_null_value<int8_t>()) + ";";
702  null_values += "int16:" + std::to_string(serialized_null_value<int16_t>()) + ";";
703  null_values += "int32:" + std::to_string(serialized_null_value<int32_t>()) + ";";
704  null_values += "int64:" + std::to_string(serialized_null_value<int64_t>()) + ";";
705  null_values += "uint8:" + std::to_string(serialized_null_value<uint8_t>()) + ";";
706  null_values += "uint16:" + std::to_string(serialized_null_value<uint16_t>()) + ";";
707  null_values += "uint32:" + std::to_string(serialized_null_value<uint32_t>()) + ";";
708  null_values += "uint64:" + std::to_string(serialized_null_value<uint64_t>()) + ";";
709  null_values += "float32:" + std::to_string(serialized_null_value<float>()) + ";";
710  null_values += "float64:" + std::to_string(serialized_null_value<double>()) + ";";
711  null_values +=
712  "Array<boolean8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
713  null_values +=
714  "Array<int8>:" + std::to_string(serialized_null_value<int8_t, true>()) + ";";
715  null_values +=
716  "Array<int16>:" + std::to_string(serialized_null_value<int16_t, true>()) + ";";
717  null_values +=
718  "Array<int32>:" + std::to_string(serialized_null_value<int32_t, true>()) + ";";
719  null_values +=
720  "Array<int64>:" + std::to_string(serialized_null_value<int64_t, true>()) + ";";
721  null_values +=
722  "Array<float32>:" + std::to_string(serialized_null_value<float, true>()) + ";";
723  null_values +=
724  "Array<float64>:" + std::to_string(serialized_null_value<double, true>()) + ";";
725 
726  result.insert(std::make_pair("null_values", null_values));
727 
728  llvm::StringMap<bool> cpu_features;
729  if (llvm::sys::getHostCPUFeatures(cpu_features)) {
730  std::string features_str = "";
731  for (auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
732  features_str += (it->getValue() ? " +" : " -");
733  features_str += it->getKey().str();
734  }
735  result.insert(std::make_pair("cpu_features", features_str));
736  }
737 
738  result.insert(std::make_pair("llvm_version",
739  std::to_string(LLVM_VERSION_MAJOR) + "." +
740  std::to_string(LLVM_VERSION_MINOR) + "." +
741  std::to_string(LLVM_VERSION_PATCH)));
742 
743 #ifdef HAVE_CUDA
744  if (!cpu_only) {
745  int device_count = 0;
746  checkCudaErrors(cuDeviceGetCount(&device_count));
747  if (device_count) {
748  CUdevice device{};
749  char device_name[256];
750  int major = 0, minor = 0;
751  int driver_version;
752  checkCudaErrors(cuDeviceGet(&device, 0)); // assuming homogeneous multi-GPU system
753  checkCudaErrors(cuDeviceGetName(device_name, 256, device));
754  checkCudaErrors(cuDeviceGetAttribute(
755  &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
756  checkCudaErrors(cuDeviceGetAttribute(
757  &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
758  checkCudaErrors(cuDriverGetVersion(&driver_version));
759 
760  result.insert(std::make_pair("gpu_name", device_name));
761  result.insert(std::make_pair("gpu_count", std::to_string(device_count)));
762  result.insert(std::make_pair("gpu_compute_capability",
763  std::to_string(major) + "." + std::to_string(minor)));
764  result.insert(std::make_pair("gpu_triple", get_gpu_target_triple_string()));
765  result.insert(std::make_pair("gpu_datalayout", get_gpu_data_layout()));
766  result.insert(std::make_pair("gpu_driver",
767  "CUDA " + std::to_string(driver_version / 1000) + "." +
768  std::to_string((driver_version % 1000) / 10)));
769  }
770  }
771 #endif
772 
773  return result;
774 }
775 
776 namespace {
777 
778 bool is_udf_module_present(bool cpu_only = false) {
779  return (cpu_only || udf_gpu_module != nullptr) && (udf_cpu_module != nullptr);
780 }
781 
782 } // namespace
783 
784 std::shared_ptr<GpuCompilationContext> CodeGenerator::generateNativeGPUCode(
785  llvm::Function* func,
786  llvm::Function* wrapper_func,
787  const std::unordered_set<llvm::Function*>& live_funcs,
788  const CompilationOptions& co,
789  const GPUTarget& gpu_target) {
790 #ifdef HAVE_CUDA
791  auto module = func->getParent();
792  /*
793  `func` is one of the following generated functions:
794  - `call_table_function(i8** %input_col_buffers, i64*
795  %input_row_count, i64** %output_buffers, i64* %output_row_count)`
796  that wraps the user-defined table function.
797  - `multifrag_query`
798  - `multifrag_query_hoisted_literals`
799  - ...
800 
801  `wrapper_func` is table_func_kernel(i32*, i8**, i64*, i64**,
802  i64*) that wraps `call_table_function`.
803 
804  `module` is from `build/QueryEngine/RuntimeFunctions.bc` and it
805  contains `func` and `wrapper_func`. `module` should also contain
806  the definitions of user-defined table functions.
807 
808  `live_funcs` contains table_func_kernel and call_table_function
809 
810  `gpu_target.cgen_state->module_` appears to be the same as `module`
811  */
812  CHECK(gpu_target.cgen_state->module_ == module);
813  module->setDataLayout(
814  "e-p:64:64:64-i1:8:8-i8:8:8-"
815  "i16:16:16-i32:32:32-i64:64:64-"
816  "f32:32:32-f64:64:64-v16:16:16-"
817  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
818  module->setTargetTriple("nvptx64-nvidia-cuda");
819  CHECK(gpu_target.nvptx_target_machine);
820  auto pass_manager_builder = llvm::PassManagerBuilder();
821 
822  pass_manager_builder.OptLevel = 0;
823  llvm::legacy::PassManager module_pass_manager;
824  pass_manager_builder.populateModulePassManager(module_pass_manager);
825 
826  bool requires_libdevice = check_module_requires_libdevice(module);
827 
828  if (requires_libdevice) {
829  // add nvvm reflect pass replacing any NVVM conditionals with constants
830  gpu_target.nvptx_target_machine->adjustPassManager(pass_manager_builder);
831  llvm::legacy::FunctionPassManager FPM(module);
832  pass_manager_builder.populateFunctionPassManager(FPM);
833 
834  // Run the NVVMReflectPass here rather than inside optimize_ir
835  FPM.doInitialization();
836  for (auto& F : *module) {
837  FPM.run(F);
838  }
839  FPM.doFinalization();
840  }
841 
842  // run optimizations
843  optimize_ir(func, module, module_pass_manager, live_funcs, co);
844  legalize_nvvm_ir(func);
845 
846  std::stringstream ss;
847  llvm::raw_os_ostream os(ss);
848 
849  llvm::LLVMContext& ctx = module->getContext();
850  // Get "nvvm.annotations" metadata node
851  llvm::NamedMDNode* md = module->getOrInsertNamedMetadata("nvvm.annotations");
852 
853  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
854  llvm::MDString::get(ctx, "kernel"),
855  llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
856  llvm::Type::getInt32Ty(ctx), 1))};
857 
858  // Append metadata to nvvm.annotations
859  md->addOperand(llvm::MDNode::get(ctx, md_vals));
860 
861  std::unordered_set<llvm::Function*> roots{wrapper_func, func};
862  if (gpu_target.row_func_not_inlined) {
863  clear_function_attributes(gpu_target.cgen_state->row_func_);
864  roots.insert(gpu_target.cgen_state->row_func_);
865  if (gpu_target.cgen_state->filter_func_) {
866  roots.insert(gpu_target.cgen_state->filter_func_);
867  }
868  }
869 
870  // prevent helper functions from being removed
871  for (auto f : gpu_target.cgen_state->helper_functions_) {
872  roots.insert(f);
873  }
874 
875  if (requires_libdevice) {
876  for (llvm::Function& F : *module) {
877  // Some libdevice functions call other functions that start with the
878  // "__internal_" prefix, for example:
879  // __internal_trig_reduction_slowpathd
880  // __internal_accurate_pow
881  // __internal_lgamma_pos
882  // Those functions have a "noinline" attribute which prevents the optimizer from
883  // inlining them into the body of @query_func
884  if (F.hasName() && F.getName().startswith("__internal") && !F.isDeclaration()) {
885  roots.insert(&F);
886  }
887  legalize_nvvm_ir(&F);
888  }
889  }
890 
891  // Prevent the UDF functions from being removed the way the runtime functions are.
892  std::unordered_set<std::string> udf_declarations;
893  if (is_udf_module_present()) {
894  for (auto& f : udf_gpu_module->getFunctionList()) {
895  llvm::Function* udf_function = module->getFunction(f.getName());
896 
897  if (udf_function) {
898  legalize_nvvm_ir(udf_function);
899  roots.insert(udf_function);
900 
901  // If a UDF declares an external function,
902  // note it so we can avoid duplicate declarations.
903  if (f.isDeclaration()) {
904  udf_declarations.insert(f.getName().str());
905  }
906  }
907  }
908  }
909 
910  if (is_rt_udf_module_present()) {
911  for (auto& f : rt_udf_gpu_module->getFunctionList()) {
912  llvm::Function* udf_function = module->getFunction(f.getName());
913  if (udf_function) {
914  legalize_nvvm_ir(udf_function);
915  roots.insert(udf_function);
916 
917  // If a UDF declares an external function,
918  // note it so we can avoid duplicate declarations.
919  if (f.isDeclaration()) {
920  udf_declarations.insert(f.getName().str());
921  }
922  }
923  }
924  }
925 
926  std::vector<llvm::Function*> rt_funcs;
927  for (auto& Fn : *module) {
928  if (roots.count(&Fn)) {
929  continue;
930  }
931  rt_funcs.push_back(&Fn);
932  }
933  for (auto& pFn : rt_funcs) {
934  pFn->removeFromParent();
935  }
936 
937  if (requires_libdevice) {
938  add_intrinsics_to_module(module);
939  }
940 
941  module->print(os, nullptr);
942  os.flush();
943 
944  for (auto& pFn : rt_funcs) {
945  module->getFunctionList().push_back(pFn);
946  }
947  module->eraseNamedMetadata(md);
948 
949  auto cuda_llir = ss.str() + cuda_rt_decls + extension_function_decls(udf_declarations);
950  std::string ptx;
951  try {
952  ptx = generatePTX(
953  cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state->context_);
954  } catch (ParseIRError& e) {
955  LOG(WARNING) << "Failed to generate PTX: " << e.what()
956  << ". Switching to CPU execution target.";
957  throw QueryMustRunOnCpu();
958  }
959  LOG(PTX) << "PTX for the GPU:\n" << ptx << "\nEnd of PTX";
960 
961  auto cubin_result = ptx_to_cubin(ptx, gpu_target.block_size, gpu_target.cuda_mgr);
962  auto& option_keys = cubin_result.option_keys;
963  auto& option_values = cubin_result.option_values;
964  auto cubin = cubin_result.cubin;
965  auto link_state = cubin_result.link_state;
966  const auto num_options = option_keys.size();
967 
968  auto func_name = wrapper_func->getName().str();
969  auto gpu_compilation_context = std::make_shared<GpuCompilationContext>();
970  for (int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
971  ++device_id) {
972  gpu_compilation_context->addDeviceCode(
973  std::make_unique<GpuDeviceCompilationContext>(cubin,
974  func_name,
975  device_id,
976  gpu_target.cuda_mgr,
977  num_options,
978  &option_keys[0],
979  &option_values[0]));
980  }
981 
982  checkCudaErrors(cuLinkDestroy(link_state));
983  return gpu_compilation_context;
984 #else
985  return {};
986 #endif
987 }
988 
989 std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenGPU(
990  llvm::Function* query_func,
991  llvm::Function* multifrag_query_func,
992  std::unordered_set<llvm::Function*>& live_funcs,
993  const bool no_inline,
994  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
995  const CompilationOptions& co) {
996 #ifdef HAVE_CUDA
997  auto module = multifrag_query_func->getParent();
998 
999  CHECK(cuda_mgr);
1000  CodeCacheKey key{serialize_llvm_object(query_func),
1001  serialize_llvm_object(cgen_state_->row_func_)};
1002  if (cgen_state_->filter_func_) {
1003  key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
1004  }
1005  for (const auto helper : cgen_state_->helper_functions_) {
1006  key.push_back(serialize_llvm_object(helper));
1007  }
1008  auto cached_code = getCodeFromCache(key, gpu_code_cache_);
1009  if (cached_code) {
1010  return cached_code;
1011  }
1012 
1013  bool row_func_not_inlined = false;
1014  if (no_inline) {
1015  for (auto it = llvm::inst_begin(cgen_state_->row_func_),
1016  e = llvm::inst_end(cgen_state_->row_func_);
1017  it != e;
1018  ++it) {
1019  if (llvm::isa<llvm::CallInst>(*it)) {
1020  auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
1021  if (get_gv_call.getCalledFunction()->getName() == "array_size" ||
1022  get_gv_call.getCalledFunction()->getName() == "linear_probabilistic_count") {
1023  mark_function_never_inline(cgen_state_->row_func_);
1024  row_func_not_inlined = true;
1025  break;
1026  }
1027  }
1028  }
1029  }
1030 
1031  initializeNVPTXBackend();
1032  CodeGenerator::GPUTarget gpu_target{nvptx_target_machine_.get(),
1033  cuda_mgr,
1034  blockSize(),
1035  cgen_state_.get(),
1036  row_func_not_inlined};
1037  std::shared_ptr<GpuCompilationContext> compilation_context;
1038 
1039  if (check_module_requires_libdevice(module)) {
1040  if (g_rt_libdevice_module == nullptr) {
1041  // raise error
1042  throw std::runtime_error(
1043  "libdevice library is not available but required by the UDF module");
1044  }
1045 
1046  // Bind libdevice to the current module
1047  CodeGenerator::link_udf_module(g_rt_libdevice_module,
1048  *module,
1049  cgen_state_.get(),
1050  llvm::Linker::Flags::OverrideFromSrc);
1051 
1052  // activate nvvm-reflect-ftz flag on the module
1053  module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", (int)1);
1054  for (llvm::Function& fn : *module) {
1055  fn.addFnAttr("nvptx-f32ftz", "true");
1056  }
1057  }
1058 
1059  try {
1060  compilation_context = CodeGenerator::generateNativeGPUCode(
1061  query_func, multifrag_query_func, live_funcs, co, gpu_target);
1062  addCodeToCache(key, compilation_context, module, gpu_code_cache_);
1063  } catch (CudaMgr_Namespace::CudaErrorException& cuda_error) {
1064  if (cuda_error.getStatus() == CUDA_ERROR_OUT_OF_MEMORY) {
1065  // Thrown if memory could not be allocated on the GPU.
1066  // Retry once after evicting a portion of the code cache.
1067  LOG(WARNING) << "Failed to allocate GPU memory for generated code. Evicting "
1068  << g_fraction_code_cache_to_evict * 100.
1069  << "% of GPU code cache and re-trying.";
1070  gpu_code_cache_.evictFractionEntries(g_fraction_code_cache_to_evict);
1071  compilation_context = CodeGenerator::generateNativeGPUCode(
1072  query_func, multifrag_query_func, live_funcs, co, gpu_target);
1073  addCodeToCache(key, compilation_context, module, gpu_code_cache_);
1074  } else {
1075  throw;
1076  }
1077  }
1078  CHECK(compilation_context);
1079  return compilation_context;
1080 #else
1081  return nullptr;
1082 #endif
1083 }
1084 
1085 std::string CodeGenerator::generatePTX(const std::string& cuda_llir,
1086  llvm::TargetMachine* nvptx_target_machine,
1087  llvm::LLVMContext& context) {
1088  auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir, "", false);
1089 
1090  llvm::SMDiagnostic parse_error;
1091 
1092  auto module = llvm::parseIR(mem_buff->getMemBufferRef(), parse_error, context);
1093  if (!module) {
1094  LOG(IR) << "CodeGenerator::generatePTX:NVVM IR:\n" << cuda_llir << "\nEnd of NVVM IR";
1095  throw_parseIR_error(parse_error, "generatePTX", /* is_gpu= */ true);
1096  }
1097 
1098  llvm::SmallString<256> code_str;
1099  llvm::raw_svector_ostream formatted_os(code_str);
1100  CHECK(nvptx_target_machine);
1101  {
1102  llvm::legacy::PassManager ptxgen_pm;
1103  module->setDataLayout(nvptx_target_machine->createDataLayout());
1104 
1105 #if LLVM_VERSION_MAJOR >= 10
1106  nvptx_target_machine->addPassesToEmitFile(
1107  ptxgen_pm, formatted_os, nullptr, llvm::CGFT_AssemblyFile);
1108 #else
1109  nvptx_target_machine->addPassesToEmitFile(
1110  ptxgen_pm, formatted_os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
1111 #endif
1112  ptxgen_pm.run(*module);
1113  }
1114 
1115 #if LLVM_VERSION_MAJOR >= 11
1116  return std::string(code_str);
1117 #else
1118  return code_str.str();
1119 #endif
1120 }
1121 
1122 std::unique_ptr<llvm::TargetMachine> CodeGenerator::initializeNVPTXBackend(
1123  const CudaMgr_Namespace::NvidiaDeviceArch arch) {
1124  llvm::InitializeAllTargets();
1125  llvm::InitializeAllTargetMCs();
1126  llvm::InitializeAllAsmPrinters();
1127  std::string err;
1128  auto target = llvm::TargetRegistry::lookupTarget("nvptx64", err);
1129  if (!target) {
1130  LOG(FATAL) << err;
1131  }
1132  return std::unique_ptr<llvm::TargetMachine>(
1133  target->createTargetMachine("nvptx64-nvidia-cuda",
1135  "",
1136  llvm::TargetOptions(),
1137  llvm::Reloc::Static));
1138 }
1139 
1140 std::string Executor::generatePTX(const std::string& cuda_llir) const {
1141  return CodeGenerator::generatePTX(
1142  cuda_llir, nvptx_target_machine_.get(), cgen_state_->context_);
1143 }
1144 
1145 void Executor::initializeNVPTXBackend() const {
1146  if (nvptx_target_machine_) {
1147  return;
1148  }
1149  const auto arch = cudaMgr()->getDeviceArch();
1150  nvptx_target_machine_ = CodeGenerator::initializeNVPTXBackend(arch);
1151 }
1152 
1153 // A small number of runtime functions don't get through CgenState::emitCall. List them
1154 // explicitly here and always clone their implementation from the runtime module.
1155 bool CodeGenerator::alwaysCloneRuntimeFunction(const llvm::Function* func) {
1156  return func->getName() == "query_stub_hoisted_literals" ||
1157  func->getName() == "multifrag_query_hoisted_literals" ||
1158  func->getName() == "query_stub" || func->getName() == "multifrag_query" ||
1159  func->getName() == "fixed_width_int_decode" ||
1160  func->getName() == "fixed_width_unsigned_decode" ||
1161  func->getName() == "diff_fixed_width_int_decode" ||
1162  func->getName() == "fixed_width_double_decode" ||
1163  func->getName() == "fixed_width_float_decode" ||
1164  func->getName() == "fixed_width_small_date_decode" ||
1165  func->getName() == "record_error_code" || func->getName() == "get_error_code" ||
1166  func->getName() == "pos_start_impl" || func->getName() == "pos_step_impl" ||
1167  func->getName() == "group_buff_idx_impl" ||
1168  func->getName() == "init_shared_mem" ||
1169  func->getName() == "init_shared_mem_nop" || func->getName() == "write_back_nop";
1170 }
1171 
1172 llvm::Module* read_template_module(llvm::LLVMContext& context) {
1173  llvm::SMDiagnostic err;
1174 
1175  auto buffer_or_error = llvm::MemoryBuffer::getFile(omnisci::get_root_abs_path() +
1176  "/QueryEngine/RuntimeFunctions.bc");
1177  CHECK(!buffer_or_error.getError()) << "root path=" << omnisci::get_root_abs_path();
1178  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1179 
1180  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1181  CHECK(!owner.takeError());
1182  auto module = owner.get().release();
1183  CHECK(module);
1184 
1185  return module;
1186 }
1187 
1188 #ifdef HAVE_CUDA
1189 llvm::Module* read_libdevice_module(llvm::LLVMContext& context) {
1190  llvm::SMDiagnostic err;
1191  const auto env = get_cuda_home();
1192 
1193  boost::filesystem::path cuda_path{env};
1194  cuda_path /= "nvvm";
1195  cuda_path /= "libdevice";
1196  cuda_path /= "libdevice.10.bc";
1197 
1198  if (!boost::filesystem::exists(cuda_path)) {
1199  LOG(WARNING) << "Could not find CUDA libdevice; support for some UDF "
1200  "functions might not be available.";
1201  return nullptr;
1202  }
1203 
1204  auto buffer_or_error = llvm::MemoryBuffer::getFile(cuda_path.c_str());
1205  CHECK(!buffer_or_error.getError()) << "cuda_path=" << cuda_path.c_str();
1206  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1207 
1208  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1209  CHECK(!owner.takeError());
1210  auto module = owner.get().release();
1211  CHECK(module);
1212 
1213  return module;
1214 }
1215 #endif
1216 
1217 #ifdef ENABLE_GEOS
1218 llvm::Module* read_geos_module(llvm::LLVMContext& context) {
1219  llvm::SMDiagnostic err;
1220 
1221  auto buffer_or_error = llvm::MemoryBuffer::getFile(omnisci::get_root_abs_path() +
1222  "/QueryEngine/GeosRuntime.bc");
1223  CHECK(!buffer_or_error.getError()) << "root path=" << omnisci::get_root_abs_path();
1224  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1225 
1226  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1227  CHECK(!owner.takeError());
1228  auto module = owner.get().release();
1229  CHECK(module);
1230 
1231  return module;
1232 }
1233 #endif
1234 
1235 namespace {
1236 
1237 void bind_pos_placeholders(const std::string& pos_fn_name,
1238  const bool use_resume_param,
1239  llvm::Function* query_func,
1240  llvm::Module* module) {
1241  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
1242  ++it) {
1243  if (!llvm::isa<llvm::CallInst>(*it)) {
1244  continue;
1245  }
1246  auto& pos_call = llvm::cast<llvm::CallInst>(*it);
1247  if (std::string(pos_call.getCalledFunction()->getName()) == pos_fn_name) {
1248  if (use_resume_param) {
1249  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1250  llvm::ReplaceInstWithInst(
1251  &pos_call,
1252  llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl"),
1253  error_code_arg));
1254  } else {
1255  llvm::ReplaceInstWithInst(
1256  &pos_call,
1257  llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl")));
1258  }
1259  break;
1260  }
1261  }
1262 }
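// Worked example (sketch): bind_pos_placeholders("pos_start", true, query_func, module)
// rewrites the first call to @pos_start inside query_func into
//
//   call i32 @pos_start_impl(i32* %error_code)
//
// whereas with use_resume_param=false the replacement call takes no arguments.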
1263 
1264 void set_row_func_argnames(llvm::Function* row_func,
1265  const size_t in_col_count,
1266  const size_t agg_col_count,
1267  const bool hoist_literals) {
1268  auto arg_it = row_func->arg_begin();
1269 
1270  if (agg_col_count) {
1271  for (size_t i = 0; i < agg_col_count; ++i) {
1272  arg_it->setName("out");
1273  ++arg_it;
1274  }
1275  } else {
1276  arg_it->setName("group_by_buff");
1277  ++arg_it;
1278  arg_it->setName("varlen_output_buff");
1279  ++arg_it;
1280  arg_it->setName("crt_matched");
1281  ++arg_it;
1282  arg_it->setName("total_matched");
1283  ++arg_it;
1284  arg_it->setName("old_total_matched");
1285  ++arg_it;
1286  arg_it->setName("max_matched");
1287  ++arg_it;
1288  }
1289 
1290  arg_it->setName("agg_init_val");
1291  ++arg_it;
1292 
1293  arg_it->setName("pos");
1294  ++arg_it;
1295 
1296  arg_it->setName("frag_row_off");
1297  ++arg_it;
1298 
1299  arg_it->setName("num_rows_per_scan");
1300  ++arg_it;
1301 
1302  if (hoist_literals) {
1303  arg_it->setName("literals");
1304  ++arg_it;
1305  }
1306 
1307  for (size_t i = 0; i < in_col_count; ++i) {
1308  arg_it->setName("col_buf" + std::to_string(i));
1309  ++arg_it;
1310  }
1311 
1312  arg_it->setName("join_hash_tables");
1313 }
1314 
1315 llvm::Function* create_row_function(const size_t in_col_count,
1316  const size_t agg_col_count,
1317  const bool hoist_literals,
1318  llvm::Module* module,
1319  llvm::LLVMContext& context) {
1320  std::vector<llvm::Type*> row_process_arg_types;
1321 
1322  if (agg_col_count) {
1323  // output (aggregate) arguments
1324  for (size_t i = 0; i < agg_col_count; ++i) {
1325  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1326  }
1327  } else {
1328  // group by buffer
1329  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1330  // varlen output buffer
1331  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1332  // current match count
1333  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1334  // total match count passed from the caller
1335  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1336  // old total match count returned to the caller
1337  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1338  // max matched (total number of slots in the output buffer)
1339  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1340  }
1341 
1342  // aggregate init values
1343  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1344 
1345  // position argument
1346  row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));
1347 
1348  // fragment row offset argument
1349  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1350 
1351  // number of rows for each scan
1352  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1353 
1354  // literals buffer argument
1355  if (hoist_literals) {
1356  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
1357  }
1358 
1359  // column buffer arguments
1360  for (size_t i = 0; i < in_col_count; ++i) {
1361  row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
1362  }
1363 
1364  // join hash table argument
1365  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1366 
1367  // generate the function
1368  auto ft =
1369  llvm::FunctionType::get(get_int_type(32, context), row_process_arg_types, false);
1370 
1371  auto row_func =
1372  llvm::Function::Create(ft, llvm::Function::ExternalLinkage, "row_func", module);
1373 
1374  // set the row function argument names; for debugging purposes only
1375  set_row_func_argnames(row_func, in_col_count, agg_col_count, hoist_literals);
1376 
1377  return row_func;
1378 }
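// A rough sketch of the signature produced above for a non-grouped query with two
// aggregate slots, two input columns, and hoisted literals (argument names as assigned
// by set_row_func_argnames; LLVM will uniquify the duplicated "out" name):
//   i32 @row_func(i64* %out, i64* %out,
//                 i64* %agg_init_val, i64 %pos, i64* %frag_row_off,
//                 i64* %num_rows_per_scan, i8* %literals,
//                 i8* %col_buf0, i8* %col_buf1, i64* %join_hash_tables)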
1379 
1380 // Iterate through multifrag_query_func, replacing calls to query_fname with query_func.
1381 void bind_query(llvm::Function* query_func,
1382  const std::string& query_fname,
1383  llvm::Function* multifrag_query_func,
1384  llvm::Module* module) {
1385  std::vector<llvm::CallInst*> query_stubs;
1386  for (auto it = llvm::inst_begin(multifrag_query_func),
1387  e = llvm::inst_end(multifrag_query_func);
1388  it != e;
1389  ++it) {
1390  if (!llvm::isa<llvm::CallInst>(*it)) {
1391  continue;
1392  }
1393  auto& query_call = llvm::cast<llvm::CallInst>(*it);
1394  if (std::string(query_call.getCalledFunction()->getName()) == query_fname) {
1395  query_stubs.push_back(&query_call);
1396  }
1397  }
1398  for (auto& S : query_stubs) {
1399  std::vector<llvm::Value*> args;
1400  for (size_t i = 0; i < S->getNumArgOperands(); ++i) {
1401  args.push_back(S->getArgOperand(i));
1402  }
1403  llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args, ""));
1404  }
1405 }
1406 
1407 std::vector<std::string> get_agg_fnames(const std::vector<Analyzer::Expr*>& target_exprs,
1408  const bool is_group_by) {
1409  std::vector<std::string> result;
1410  for (size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
1411  ++target_idx, ++agg_col_idx) {
1412  const auto target_expr = target_exprs[target_idx];
1413  CHECK(target_expr);
1414  const auto target_type_info = target_expr->get_type_info();
1415  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
1416  const bool is_varlen =
1417  (target_type_info.is_string() &&
1418  target_type_info.get_compression() == kENCODING_NONE) ||
1419  target_type_info.is_array(); // TODO: should it use is_varlen_array() ?
1420  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
1421  result.emplace_back(target_type_info.is_fp() ? "agg_id_double" : "agg_id");
1422  if (is_varlen) {
1423  result.emplace_back("agg_id");
1424  }
1425  if (target_type_info.is_geometry()) {
1426  result.emplace_back("agg_id");
1427  for (auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
1428  result.emplace_back("agg_id");
1429  }
1430  }
1431  continue;
1432  }
1433  const auto agg_type = agg_expr->get_aggtype();
1434  const auto& agg_type_info =
1435  agg_type != kCOUNT ? agg_expr->get_arg()->get_type_info() : target_type_info;
1436  switch (agg_type) {
1437  case kAVG: {
1438  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1439  !agg_type_info.is_fp()) {
1440  throw std::runtime_error("AVG is only valid on integer and floating point");
1441  }
1442  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1443  ? "agg_sum"
1444  : "agg_sum_double");
1445  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1446  ? "agg_count"
1447  : "agg_count_double");
1448  break;
1449  }
1450  case kMIN: {
1451  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1452  agg_type_info.is_geometry()) {
1453  throw std::runtime_error(
1454  "MIN on strings, arrays or geospatial types not supported yet");
1455  }
1456  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1457  ? "agg_min"
1458  : "agg_min_double");
1459  break;
1460  }
1461  case kMAX: {
1462  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1463  agg_type_info.is_geometry()) {
1464  throw std::runtime_error(
1465  "MAX on strings, arrays or geospatial types not supported yet");
1466  }
1467  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1468  ? "agg_max"
1469  : "agg_max_double");
1470  break;
1471  }
1472  case kSUM: {
1473  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1474  !agg_type_info.is_fp()) {
1475  throw std::runtime_error("SUM is only valid on integer and floating point");
1476  }
1477  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1478  ? "agg_sum"
1479  : "agg_sum_double");
1480  break;
1481  }
1482  case kCOUNT:
1483  result.emplace_back(agg_expr->get_is_distinct() ? "agg_count_distinct"
1484  : "agg_count");
1485  break;
1486  case kSINGLE_VALUE: {
1487  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1488  break;
1489  }
1490  case kSAMPLE: {
1491  // Note that varlen SAMPLE arguments are handled separately above
1492  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1493  break;
1494  }
 1495  case kAPPROX_COUNT_DISTINCT:
 1496  result.emplace_back("agg_approximate_count_distinct");
1497  break;
1498  case kAPPROX_QUANTILE:
1499  result.emplace_back("agg_approx_quantile");
1500  break;
1501  default:
1502  CHECK(false);
1503  }
1504  }
1505  return result;
1506 }
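// Example of the mapping above (illustrative): for the target expressions
//   COUNT(*), AVG(f) with f a FLOAT column, MAX(i) with i an INTEGER column
// the returned names are
//   {"agg_count", "agg_sum_double", "agg_count_double", "agg_max"}
// i.e. AVG expands to a sum/count pair, and the "_double" suffix is chosen when the
// aggregate argument is floating point rather than integer/time.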
1507 
1508 } // namespace
1509 
1510 std::unique_ptr<llvm::Module> g_rt_module(read_template_module(getGlobalLLVMContext()));
1511 
1512 #ifdef ENABLE_GEOS
1513 std::unique_ptr<llvm::Module> g_rt_geos_module(read_geos_module(getGlobalLLVMContext()));
1514 #endif
1515 
1516 #ifdef HAVE_CUDA
1517 std::unique_ptr<llvm::Module> g_rt_libdevice_module(
1518  read_libdevice_module(getGlobalLLVMContext()));
1519 #endif
1520 
1521 bool is_rt_udf_module_present(bool cpu_only) {
1522  return (cpu_only || rt_udf_gpu_module != nullptr) && (rt_udf_cpu_module != nullptr);
1523 }
1524 
1525 namespace {
1526 
1527 void read_udf_gpu_module(const std::string& udf_ir_filename) {
1528  llvm::SMDiagnostic parse_error;
1529 
1530  llvm::StringRef file_name_arg(udf_ir_filename);
1531  udf_gpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
1532 
1533  if (!udf_gpu_module) {
1534  throw_parseIR_error(parse_error, udf_ir_filename, /* is_gpu= */ true);
1535  }
1536 
1537  llvm::Triple gpu_triple(udf_gpu_module->getTargetTriple());
1538  if (!gpu_triple.isNVPTX()) {
1539  LOG(WARNING)
1540  << "Expected triple nvptx64-nvidia-cuda for NVVM IR of loadtime UDFs but got "
1541  << gpu_triple.str() << ". Disabling the NVVM IR module.";
1542  udf_gpu_module = nullptr;
1543  }
1544 }
1545 
1546 void read_udf_cpu_module(const std::string& udf_ir_filename) {
1547  llvm::SMDiagnostic parse_error;
1548 
1549  llvm::StringRef file_name_arg(udf_ir_filename);
1550 
1551  udf_cpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
1552  if (!udf_cpu_module) {
1553  throw_parseIR_error(parse_error, udf_ir_filename);
1554  }
1555 }
1556 
1557 } // namespace
1558 
1559 void Executor::addUdfIrToModule(const std::string& udf_ir_filename,
1560  const bool is_cuda_ir) {
1561  if (is_cuda_ir) {
1562  read_udf_gpu_module(udf_ir_filename);
1563  } else {
1564  read_udf_cpu_module(udf_ir_filename);
1565  }
1566 }
1567 
1568 void read_rt_udf_gpu_module(const std::string& udf_ir_string) {
1569  llvm::SMDiagnostic parse_error;
1570 
1571  auto buf =
1572  std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for GPU");
1573 
1574  rt_udf_gpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
1575  if (!rt_udf_gpu_module) {
 1576  LOG(IR) << "read_rt_udf_gpu_module:NVVM IR:\n" << udf_ir_string << "\nEnd of NVVM IR";
1577  throw_parseIR_error(parse_error, "", /* is_gpu= */ true);
1578  }
1579 
1580  llvm::Triple gpu_triple(rt_udf_gpu_module->getTargetTriple());
1581  if (!gpu_triple.isNVPTX()) {
 1582  LOG(IR) << "read_rt_udf_gpu_module:NVVM IR:\n" << udf_ir_string << "\nEnd of NVVM IR";
1583  LOG(WARNING) << "Expected triple nvptx64-nvidia-cuda for NVVM IR but got "
1584  << gpu_triple.str()
1585  << ". Executing runtime UDFs on GPU will be disabled.";
1586  rt_udf_gpu_module = nullptr;
1587  return;
1588  }
1589 }
1590 
1591 void read_rt_udf_cpu_module(const std::string& udf_ir_string) {
1592  llvm::SMDiagnostic parse_error;
1594  auto buf =
1595  std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for CPU");
1596 
1597  rt_udf_cpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
1598  if (!rt_udf_cpu_module) {
1599  LOG(IR) << "read_rt_udf_cpu_module:LLVM IR:\n" << udf_ir_string << "\nEnd of LLVM IR";
1600  throw_parseIR_error(parse_error);
1601  }
1602 }
1603 
1604 std::unordered_set<llvm::Function*> CodeGenerator::markDeadRuntimeFuncs(
1605  llvm::Module& module,
1606  const std::vector<llvm::Function*>& roots,
1607  const std::vector<llvm::Function*>& leaves) {
1608  std::unordered_set<llvm::Function*> live_funcs;
1609  live_funcs.insert(roots.begin(), roots.end());
1610  live_funcs.insert(leaves.begin(), leaves.end());
1611 
1612  if (auto F = module.getFunction("init_shared_mem_nop")) {
1613  live_funcs.insert(F);
1614  }
1615  if (auto F = module.getFunction("write_back_nop")) {
1616  live_funcs.insert(F);
1617  }
1618 
1619  for (const llvm::Function* F : roots) {
1620  for (const llvm::BasicBlock& BB : *F) {
1621  for (const llvm::Instruction& I : BB) {
1622  if (const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
1623  live_funcs.insert(CI->getCalledFunction());
1624  }
1625  }
1626  }
1627  }
1628 
1629  for (llvm::Function& F : module) {
1630  if (!live_funcs.count(&F) && !F.isDeclaration()) {
1631  F.setLinkage(llvm::GlobalValue::InternalLinkage);
1632  }
1633  }
1634 
1635  return live_funcs;
1636 }
1637 
1638 namespace {
1639 // searches for a particular variable within a specific basic block (or all if bb_name is
1640 // empty)
1641 template <typename InstType>
1642 llvm::Value* find_variable_in_basic_block(llvm::Function* func,
1643  std::string bb_name,
1644  std::string variable_name) {
1645  llvm::Value* result = nullptr;
1646  if (func == nullptr || variable_name.empty()) {
1647  return result;
1648  }
1649  bool is_found = false;
1650  for (auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
1651  if (!bb_name.empty() && bb_it->getName() != bb_name) {
1652  continue;
1653  }
1654  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
1655  if (llvm::isa<InstType>(*inst_it)) {
1656  if (inst_it->getName() == variable_name) {
1657  result = &*inst_it;
1658  is_found = true;
1659  break;
1660  }
1661  }
1662  }
1663  }
1664  return result;
1665 }
1666 }; // namespace
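// Illustrative use in this file: createErrorCheckControlFlow below calls
//   find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count")
// to recover the row-count load from the query function's entry block so the GPU
// watchdog / interrupt predicates can be computed against it.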
1667 
 1668 void Executor::createErrorCheckControlFlow(
 1669  llvm::Function* query_func,
1670  bool run_with_dynamic_watchdog,
1671  bool run_with_allowing_runtime_interrupt,
1672  ExecutorDeviceType device_type,
1673  const std::vector<InputTableInfo>& input_table_infos) {
1674  AUTOMATIC_IR_METADATA(cgen_state_.get());
1675 
1676  // check whether the row processing was successful; currently, it can
1677  // fail by running out of group by buffer slots
1678 
1679  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
 1680  // when both the dynamic watchdog and the runtime interrupt are enabled,
 1681  // we use the dynamic watchdog
1682  run_with_allowing_runtime_interrupt = false;
1683  }
1684 
1685  {
1686  // disable injecting query interrupt checker if the session info is invalid
1687  mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
1688  if (current_query_session_.empty()) {
1689  run_with_allowing_runtime_interrupt = false;
1690  }
1691  }
1692 
1693  llvm::Value* row_count = nullptr;
1694  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1695  device_type == ExecutorDeviceType::GPU) {
1696  row_count =
1697  find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
1698  }
1699 
1700  bool done_splitting = false;
1701  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1702  ++bb_it) {
1703  llvm::Value* pos = nullptr;
1704  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1705  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1706  llvm::isa<llvm::PHINode>(*inst_it)) {
1707  if (inst_it->getName() == "pos") {
1708  pos = &*inst_it;
1709  }
1710  continue;
1711  }
1712  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1713  continue;
1714  }
1715  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
1716  if (std::string(row_func_call.getCalledFunction()->getName()) == "row_process") {
1717  auto next_inst_it = inst_it;
1718  ++next_inst_it;
1719  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1720  auto& br_instr = bb_it->back();
1721  llvm::IRBuilder<> ir_builder(&br_instr);
1722  llvm::Value* err_lv = &*inst_it;
1723  llvm::Value* err_lv_returned_from_row_func = nullptr;
1724  if (run_with_dynamic_watchdog) {
1725  CHECK(pos);
1726  llvm::Value* call_watchdog_lv = nullptr;
1727  if (device_type == ExecutorDeviceType::GPU) {
 1728  // To make sure all threads within a block see the same barrier, only blocks
 1729  // in which no thread has crossed the critical edge go through the dynamic
 1730  // watchdog computation
1731  CHECK(row_count);
1732  auto crit_edge_rem =
1733  (blockSize() & (blockSize() - 1))
1734  ? ir_builder.CreateSRem(
1735  row_count,
1736  cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1737  : ir_builder.CreateAnd(
1738  row_count,
1739  cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1740  auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1741  crit_edge_threshold->setName("crit_edge_threshold");
1742 
1743  // only those threads where pos < crit_edge_threshold go through dynamic
1744  // watchdog call
1745  call_watchdog_lv =
1746  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
1747  } else {
1748  // CPU path: run watchdog for every 64th row
1749  auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1750  call_watchdog_lv = ir_builder.CreateICmp(
1751  llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1752  }
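// Worked example of the predicate above (illustrative numbers): with blockSize() = 1024
// (a power of two) the AND form is used, so for row_count = 2500:
//   crit_edge_rem       = 2500 & 1023 = 452
//   crit_edge_threshold = 2500 - 452  = 2048
// and only threads with pos < 2048 (i.e. fully populated blocks) evaluate
// dynamic_watchdog(); on the CPU path the check instead fires on every 64th row
// (pos & 0x3f == 0).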
1753  CHECK(call_watchdog_lv);
1754  auto error_check_bb = bb_it->splitBasicBlock(
1755  llvm::BasicBlock::iterator(br_instr), ".error_check");
1756  auto& watchdog_br_instr = bb_it->back();
1757 
1758  auto watchdog_check_bb = llvm::BasicBlock::Create(
1759  cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
1760  llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1761  auto detected_timeout = watchdog_ir_builder.CreateCall(
1762  cgen_state_->module_->getFunction("dynamic_watchdog"), {});
1763  auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1764  detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
1765  watchdog_ir_builder.CreateBr(error_check_bb);
1766 
1767  llvm::ReplaceInstWithInst(
1768  &watchdog_br_instr,
1769  llvm::BranchInst::Create(
1770  watchdog_check_bb, error_check_bb, call_watchdog_lv));
1771  ir_builder.SetInsertPoint(&br_instr);
1772  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1773 
1774  unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1775  unified_err_lv->addIncoming(err_lv, &*bb_it);
1776  err_lv = unified_err_lv;
1777  } else if (run_with_allowing_runtime_interrupt) {
1778  CHECK(pos);
1779  llvm::Value* call_check_interrupt_lv = nullptr;
1780  if (device_type == ExecutorDeviceType::GPU) {
 1781  // approximate how many times the %pos variable is incremented,
 1782  // i.e., the number of loop iterations
 1783  // we derive the bit-shift amount from the grid/block/fragment sizes:
 1784  // with a fixed stride (e.g., every 64th increment), some CUDA threads
 1785  // would never reach the interrupt-checking block when the outer table
 1786  // (fragment) is too small, and such a query could not be interrupted
1788  int32_t num_shift_by_gridDim = shared::getExpOfTwo(gridSize());
1789  int32_t num_shift_by_blockDim = shared::getExpOfTwo(blockSize());
1790  int total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
1791  uint64_t interrupt_checking_freq = 32;
1792  auto freq_control_knob = g_running_query_interrupt_freq;
1793  CHECK_GT(freq_control_knob, 0);
1794  CHECK_LE(freq_control_knob, 1.0);
1795  if (!input_table_infos.empty()) {
1796  const auto& outer_table_info = *input_table_infos.begin();
1797  auto num_outer_table_tuples = outer_table_info.info.getNumTuples();
1798  if (outer_table_info.table_id < 0) {
1799  auto* rs = (*outer_table_info.info.fragments.begin()).resultSet;
1800  CHECK(rs);
1801  num_outer_table_tuples = rs->entryCount();
1802  } else {
1803  auto num_frags = outer_table_info.info.fragments.size();
1804  if (num_frags > 0) {
1805  num_outer_table_tuples =
1806  outer_table_info.info.fragments.begin()->getNumTuples();
1807  }
1808  }
1809  if (num_outer_table_tuples > 0) {
 1810  // gridSize * blockSize --> pos_step (index of the next row per thread)
 1811  // we additionally multiply pos_step by two since the number of
 1812  // dispatched blocks is double the gridSize
 1813  // # tuples (of fragment) / pos_step --> maximum # increments (K)
 1814  // we also scale K by the freq_control_knob to control the frequency:
 1815  // to check the interrupt status more often, make K smaller
1816  auto max_inc = uint64_t(
1817  floor(num_outer_table_tuples / (gridSize() * blockSize() * 2)));
1818  if (max_inc < 2) {
1819  // too small `max_inc`, so this correction is necessary to make
1820  // `interrupt_checking_freq` be valid (i.e., larger than zero)
1821  max_inc = 2;
1822  }
1823  auto calibrated_inc = uint64_t(floor(max_inc * (1 - freq_control_knob)));
1824  interrupt_checking_freq =
1825  uint64_t(pow(2, shared::getExpOfTwo(calibrated_inc)));
 1826  // cover the case where interrupt_checking_freq > K: some threads could then
 1827  // never branch to the interrupt checker, so we fall back to a smaller value
 1828  // close to max_inc as the frequency
1829  if (interrupt_checking_freq > max_inc) {
1830  interrupt_checking_freq = max_inc / 2;
1831  }
1832  if (interrupt_checking_freq < 8) {
 1833  // such a small frequency incurs overly frequent interrupt status checks,
 1834  // so we clamp it to a reasonable minimum
1835  interrupt_checking_freq = 8;
1836  }
1837  }
1838  }
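// Worked example of the calculation above (illustrative numbers, assuming
// getExpOfTwo(n) returns floor(log2(n))): gridSize() = 64, blockSize() = 1024,
// num_outer_table_tuples = 10,000,000 and freq_control_knob = 0.1 give
//   max_inc        = floor(1e7 / (64 * 1024 * 2)) = 76
//   calibrated_inc = floor(76 * 0.9)              = 68
//   freq           = 2^floor(log2(68))            = 64
// which passes both the "> max_inc" and "< 8" corrections, so the interrupt flag is
// consulted roughly every 64th iteration of the per-thread loop.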
1839  VLOG(1) << "Set the running query interrupt checking frequency: "
1840  << interrupt_checking_freq;
1841  // check the interrupt flag for every interrupt_checking_freq-th iteration
1842  llvm::Value* pos_shifted_per_iteration =
1843  ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
1844  auto interrupt_predicate =
1845  ir_builder.CreateAnd(pos_shifted_per_iteration, interrupt_checking_freq);
1846  call_check_interrupt_lv =
1847  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1848  interrupt_predicate,
1849  cgen_state_->llInt(int64_t(0LL)));
1850  } else {
1851  // CPU path: run interrupt checker for every 64th row
1852  auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1853  call_check_interrupt_lv =
1854  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1855  interrupt_predicate,
1856  cgen_state_->llInt(int64_t(0LL)));
1857  }
1858  CHECK(call_check_interrupt_lv);
1859  auto error_check_bb = bb_it->splitBasicBlock(
1860  llvm::BasicBlock::iterator(br_instr), ".error_check");
1861  auto& check_interrupt_br_instr = bb_it->back();
1862 
1863  auto interrupt_check_bb = llvm::BasicBlock::Create(
1864  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
1865  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
1866  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
1867  cgen_state_->module_->getFunction("check_interrupt"), {});
1868  auto interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
1869  detected_interrupt, cgen_state_->llInt(Executor::ERR_INTERRUPTED), err_lv);
1870  interrupt_checker_ir_builder.CreateBr(error_check_bb);
1871 
1872  llvm::ReplaceInstWithInst(
1873  &check_interrupt_br_instr,
1874  llvm::BranchInst::Create(
1875  interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
1876  ir_builder.SetInsertPoint(&br_instr);
1877  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1878 
1879  unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
1880  unified_err_lv->addIncoming(err_lv, &*bb_it);
1881  err_lv = unified_err_lv;
1882  }
1883  if (!err_lv_returned_from_row_func) {
1884  err_lv_returned_from_row_func = err_lv;
1885  }
1886  if (device_type == ExecutorDeviceType::GPU && g_enable_dynamic_watchdog) {
1887  // let kernel execution finish as expected, regardless of the observed error,
1888  // unless it is from the dynamic watchdog where all threads within that block
1889  // return together.
1890  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1891  err_lv,
1892  cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
1893  } else {
1894  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
1895  err_lv,
1896  cgen_state_->llInt(static_cast<int32_t>(0)));
1897  }
1898  auto error_bb = llvm::BasicBlock::Create(
1899  cgen_state_->context_, ".error_exit", query_func, new_bb);
1900  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1901  llvm::CallInst::Create(
1902  cgen_state_->module_->getFunction("record_error_code"),
1903  std::vector<llvm::Value*>{err_lv_returned_from_row_func, error_code_arg},
1904  "",
1905  error_bb);
1906  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
1907  llvm::ReplaceInstWithInst(&br_instr,
1908  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
1909  done_splitting = true;
1910  break;
1911  }
1912  }
1913  }
1914  CHECK(done_splitting);
1915 }
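// Net effect of the splitting above, sketched as a CFG (illustrative, block names as
// created in this function):
//   <row_process block>   %err = call @row_process(...)
//                         cond-br --> .watchdog_check / .interrupt_check  or  .error_check
//   .watchdog_check /
//   .interrupt_check      select ERR_OUT_OF_TIME / ERR_INTERRUPTED if triggered,
//                         then br .error_check
//   .error_check          %unified_err = phi [...]
//                         cond-br --> .error_exit  or  <continue in new_bb>
//   .error_exit           call @record_error_code(...); ret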
1916 
1917 std::vector<llvm::Value*> Executor::inlineHoistedLiterals() {
1918  AUTOMATIC_IR_METADATA(cgen_state_.get());
1919 
1920  std::vector<llvm::Value*> hoisted_literals;
1921 
1922  // row_func_ is using literals whose defs have been hoisted up to the query_func_,
1923  // extend row_func_ signature to include extra args to pass these literal values.
1924  std::vector<llvm::Type*> row_process_arg_types;
1925 
1926  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1927  E = cgen_state_->row_func_->arg_end();
1928  I != E;
1929  ++I) {
1930  row_process_arg_types.push_back(I->getType());
1931  }
1932 
1933  for (auto& element : cgen_state_->query_func_literal_loads_) {
1934  for (auto value : element.second) {
1935  row_process_arg_types.push_back(value->getType());
1936  }
1937  }
1938 
1939  auto ft = llvm::FunctionType::get(
1940  get_int_type(32, cgen_state_->context_), row_process_arg_types, false);
1941  auto row_func_with_hoisted_literals =
1942  llvm::Function::Create(ft,
1943  llvm::Function::ExternalLinkage,
1944  "row_func_hoisted_literals",
1945  cgen_state_->row_func_->getParent());
1946 
1947  auto row_func_arg_it = row_func_with_hoisted_literals->arg_begin();
1948  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1949  E = cgen_state_->row_func_->arg_end();
1950  I != E;
1951  ++I) {
1952  if (I->hasName()) {
1953  row_func_arg_it->setName(I->getName());
1954  }
1955  ++row_func_arg_it;
1956  }
1957 
1958  decltype(row_func_with_hoisted_literals) filter_func_with_hoisted_literals{nullptr};
1959  decltype(row_func_arg_it) filter_func_arg_it{nullptr};
1960  if (cgen_state_->filter_func_) {
1961  // filter_func_ is using literals whose defs have been hoisted up to the row_func_,
1962  // extend filter_func_ signature to include extra args to pass these literal values.
1963  std::vector<llvm::Type*> filter_func_arg_types;
1964 
1965  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
1966  E = cgen_state_->filter_func_->arg_end();
1967  I != E;
1968  ++I) {
1969  filter_func_arg_types.push_back(I->getType());
1970  }
1971 
1972  for (auto& element : cgen_state_->query_func_literal_loads_) {
1973  for (auto value : element.second) {
1974  filter_func_arg_types.push_back(value->getType());
1975  }
1976  }
1977 
1978  auto ft2 = llvm::FunctionType::get(
1979  get_int_type(32, cgen_state_->context_), filter_func_arg_types, false);
1980  filter_func_with_hoisted_literals =
1981  llvm::Function::Create(ft2,
1982  llvm::Function::ExternalLinkage,
1983  "filter_func_hoisted_literals",
1984  cgen_state_->filter_func_->getParent());
1985 
1986  filter_func_arg_it = filter_func_with_hoisted_literals->arg_begin();
1987  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
1988  E = cgen_state_->filter_func_->arg_end();
1989  I != E;
1990  ++I) {
1991  if (I->hasName()) {
1992  filter_func_arg_it->setName(I->getName());
1993  }
1994  ++filter_func_arg_it;
1995  }
1996  }
1997 
1998  std::unordered_map<int, std::vector<llvm::Value*>>
1999  query_func_literal_loads_function_arguments,
2000  query_func_literal_loads_function_arguments2;
2001 
2002  for (auto& element : cgen_state_->query_func_literal_loads_) {
2003  std::vector<llvm::Value*> argument_values, argument_values2;
2004 
2005  for (auto value : element.second) {
2006  hoisted_literals.push_back(value);
2007  argument_values.push_back(&*row_func_arg_it);
2008  if (cgen_state_->filter_func_) {
2009  argument_values2.push_back(&*filter_func_arg_it);
2010  cgen_state_->filter_func_args_[&*row_func_arg_it] = &*filter_func_arg_it;
2011  }
2012  if (value->hasName()) {
2013  row_func_arg_it->setName("arg_" + value->getName());
2014  if (cgen_state_->filter_func_) {
2015  filter_func_arg_it->getContext();
2016  filter_func_arg_it->setName("arg_" + value->getName());
2017  }
2018  }
2019  ++row_func_arg_it;
2020  ++filter_func_arg_it;
2021  }
2022 
2023  query_func_literal_loads_function_arguments[element.first] = argument_values;
2024  query_func_literal_loads_function_arguments2[element.first] = argument_values2;
2025  }
2026 
2027  // copy the row_func function body over
2028  // see
2029  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
2030  row_func_with_hoisted_literals->getBasicBlockList().splice(
2031  row_func_with_hoisted_literals->begin(),
2032  cgen_state_->row_func_->getBasicBlockList());
2033 
2034  // also replace row_func arguments with the arguments from row_func_hoisted_literals
2035  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
2036  E = cgen_state_->row_func_->arg_end(),
2037  I2 = row_func_with_hoisted_literals->arg_begin();
2038  I != E;
2039  ++I) {
2040  I->replaceAllUsesWith(&*I2);
2041  I2->takeName(&*I);
2042  cgen_state_->filter_func_args_.replace(&*I, &*I2);
2043  ++I2;
2044  }
2045 
2046  cgen_state_->row_func_ = row_func_with_hoisted_literals;
2047 
2048  // and finally replace literal placeholders
2049  std::vector<llvm::Instruction*> placeholders;
2050  std::string prefix("__placeholder__literal_");
2051  for (auto it = llvm::inst_begin(row_func_with_hoisted_literals),
2052  e = llvm::inst_end(row_func_with_hoisted_literals);
2053  it != e;
2054  ++it) {
2055  if (it->hasName() && it->getName().startswith(prefix)) {
2056  auto offset_and_index_entry =
2057  cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
2058  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2059 
2060  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2061  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2062 
2063  it->replaceAllUsesWith(
2064  query_func_literal_loads_function_arguments[lit_off][lit_idx]);
2065  placeholders.push_back(&*it);
2066  }
2067  }
2068  for (auto placeholder : placeholders) {
2069  placeholder->removeFromParent();
2070  }
2071 
2072  if (cgen_state_->filter_func_) {
2073  // copy the filter_func function body over
2074  // see
2075  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
2076  filter_func_with_hoisted_literals->getBasicBlockList().splice(
2077  filter_func_with_hoisted_literals->begin(),
2078  cgen_state_->filter_func_->getBasicBlockList());
2079 
2080  // also replace filter_func arguments with the arguments from
2081  // filter_func_hoisted_literals
2082  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2083  E = cgen_state_->filter_func_->arg_end(),
2084  I2 = filter_func_with_hoisted_literals->arg_begin();
2085  I != E;
2086  ++I) {
2087  I->replaceAllUsesWith(&*I2);
2088  I2->takeName(&*I);
2089  ++I2;
2090  }
2091 
2092  cgen_state_->filter_func_ = filter_func_with_hoisted_literals;
2093 
2094  // and finally replace literal placeholders
2095  std::vector<llvm::Instruction*> placeholders;
2096  std::string prefix("__placeholder__literal_");
2097  for (auto it = llvm::inst_begin(filter_func_with_hoisted_literals),
2098  e = llvm::inst_end(filter_func_with_hoisted_literals);
2099  it != e;
2100  ++it) {
2101  if (it->hasName() && it->getName().startswith(prefix)) {
2102  auto offset_and_index_entry = cgen_state_->row_func_hoisted_literals_.find(
2103  llvm::dyn_cast<llvm::Value>(&*it));
2104  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2105 
2106  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2107  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2108 
2109  it->replaceAllUsesWith(
2110  query_func_literal_loads_function_arguments2[lit_off][lit_idx]);
2111  placeholders.push_back(&*it);
2112  }
2113  }
2114  for (auto placeholder : placeholders) {
2115  placeholder->removeFromParent();
2116  }
2117  }
2118 
2119  return hoisted_literals;
2120 }
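// In short (illustrative; the literal argument names and types are hypothetical):
// if row_func was created as
//   i32 @row_func(..., i8* %literals, i8* %col_buf0, i64* %join_hash_tables)
// and the query function hoisted two literal loads, it is re-created as
//   i32 @row_func_hoisted_literals(..., i8* %literals, i8* %col_buf0,
//                                  i64* %join_hash_tables, i64 %arg_lit0, i32 %arg_lit1)
// the original body is spliced over, and every __placeholder__literal_* instruction is
// replaced by the matching trailing argument (likewise for filter_func, if present).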
2121 
2122 namespace {
2123 
2124 size_t get_shared_memory_size(const bool shared_mem_used,
2125  const QueryMemoryDescriptor* query_mem_desc_ptr) {
2126  return shared_mem_used
2127  ? (query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount())
2128  : 0;
 2129 }
 2130 
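// e.g. a row-wise output buffer with an 8-byte row size and 1024 entries requests
// 8 KiB of GPU shared memory when the shared-memory optimization is enabled.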
2131 bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor* query_mem_desc_ptr,
2132  const RelAlgExecutionUnit& ra_exe_unit,
2133  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
2134  const ExecutorDeviceType device_type,
2135  const unsigned gpu_blocksize,
2136  const unsigned num_blocks_per_mp) {
2137  if (device_type == ExecutorDeviceType::CPU) {
2138  return false;
2139  }
2140  if (query_mem_desc_ptr->didOutputColumnar()) {
2141  return false;
2142  }
2143  CHECK(query_mem_desc_ptr);
2144  CHECK(cuda_mgr);
2145  /*
2146  * We only use shared memory strategy if GPU hardware provides native shared
2147  * memory atomics support. From CUDA Toolkit documentation:
2148  * https://docs.nvidia.com/cuda/pascal-tuning-guide/index.html#atomic-ops "Like
2149  * Maxwell, Pascal [and Volta] provides native shared memory atomic operations
2150  * for 32-bit integer arithmetic, along with native 32 or 64-bit compare-and-swap
2151  * (CAS)."
2152  *
2153  **/
2154  if (!cuda_mgr->isArchMaxwellOrLaterForAll()) {
2155  return false;
2156  }
2157 
 2158  if (query_mem_desc_ptr->getQueryDescriptionType() ==
 2159  QueryDescriptionType::NonGroupedAggregate &&
 2161  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty()) {
2162  // TODO: relax this, if necessary
2163  if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
2164  return false;
2165  }
2166  // skip shared memory usage when dealing with 1) variable length targets, 2)
2167  // not a COUNT aggregate
2168  const auto target_infos =
2169  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
2170  std::unordered_set<SQLAgg> supported_aggs{kCOUNT};
2171  if (std::find_if(target_infos.begin(),
2172  target_infos.end(),
2173  [&supported_aggs](const TargetInfo& ti) {
2174  if (ti.sql_type.is_varlen() ||
2175  !supported_aggs.count(ti.agg_kind)) {
2176  return true;
2177  } else {
2178  return false;
2179  }
2180  }) == target_infos.end()) {
2181  return true;
2182  }
2183  }
 2184  if (query_mem_desc_ptr->getQueryDescriptionType() ==
 2185  QueryDescriptionType::GroupByPerfectHash) {
2195  if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
2196  return false;
2197  }
2198 
2199  // Fundamentally, we should use shared memory whenever the output buffer
2200  // is small enough so that we can fit it in the shared memory and yet expect
2201  // good occupancy.
2202  // For now, we allow keyless, row-wise layout, and only for perfect hash
2203  // group by operations.
2204  if (query_mem_desc_ptr->hasKeylessHash() &&
2205  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty() &&
2206  !query_mem_desc_ptr->useStreamingTopN()) {
2207  const size_t shared_memory_threshold_bytes = std::min(
 2208  g_gpu_smem_threshold,
 2209  cuda_mgr->getMinSharedMemoryPerBlockForAllDevices() / num_blocks_per_mp);
2210  const auto output_buffer_size =
2211  query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount();
2212  if (output_buffer_size > shared_memory_threshold_bytes) {
2213  return false;
2214  }
2215 
2216  // skip shared memory usage when dealing with 1) variable length targets, 2)
2217  // non-basic aggregates (COUNT, SUM, MIN, MAX, AVG)
2218  // TODO: relax this if necessary
2219  const auto target_infos =
2220  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
2221  std::unordered_set<SQLAgg> supported_aggs{kCOUNT};
 2222  if (g_is_test_env) {
 2223  supported_aggs = {kCOUNT, kMIN, kMAX, kSUM, kAVG};
2224  }
2225  if (std::find_if(target_infos.begin(),
2226  target_infos.end(),
2227  [&supported_aggs](const TargetInfo& ti) {
2228  if (ti.sql_type.is_varlen() ||
2229  !supported_aggs.count(ti.agg_kind)) {
2230  return true;
2231  } else {
2232  return false;
2233  }
2234  }) == target_infos.end()) {
2235  return true;
2236  }
2237  }
2238  }
2239  return false;
2240 }
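// Recap of the conditions coded above: the shared-memory path is only considered for
// GPU execution with non-columnar output on Maxwell-or-later devices; the block size
// must cover the entry count, every target must be fixed-length, and every aggregate
// must be in the supported set (COUNT by default); the group-by branch additionally
// requires keyless, row-wise perfect-hash layout and an output buffer
// (row size * entry count) no larger than the derived shared-memory threshold.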
2241 
2242 #ifndef NDEBUG
2243 std::string serialize_llvm_metadata_footnotes(llvm::Function* query_func,
2244  CgenState* cgen_state) {
2245  std::string llvm_ir;
2246  std::unordered_set<llvm::MDNode*> md;
2247 
2248  // Loop over all instructions in the query function.
2249  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
2250  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2251  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2252  instr_it->getAllMetadata(imd);
2253  for (auto [kind, node] : imd) {
2254  md.insert(node);
2255  }
2256  }
2257  }
2258 
2259  // Loop over all instructions in the row function.
2260  for (auto bb_it = cgen_state->row_func_->begin(); bb_it != cgen_state->row_func_->end();
2261  ++bb_it) {
2262  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2263  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2264  instr_it->getAllMetadata(imd);
2265  for (auto [kind, node] : imd) {
2266  md.insert(node);
2267  }
2268  }
2269  }
2270 
2271  // Loop over all instructions in the filter function.
2272  if (cgen_state->filter_func_) {
2273  for (auto bb_it = cgen_state->filter_func_->begin();
2274  bb_it != cgen_state->filter_func_->end();
2275  ++bb_it) {
2276  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2277  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2278  instr_it->getAllMetadata(imd);
2279  for (auto [kind, node] : imd) {
2280  md.insert(node);
2281  }
2282  }
2283  }
2284  }
2285 
2286  // Sort the metadata by canonical number and convert to text.
2287  if (!md.empty()) {
2288  std::map<size_t, std::string> sorted_strings;
2289  for (auto p : md) {
2290  std::string str;
2291  llvm::raw_string_ostream os(str);
2292  p->print(os, cgen_state->module_, true);
2293  os.flush();
2294  auto fields = split(str, {}, 1);
2295  if (fields.empty() || fields[0].empty()) {
2296  continue;
2297  }
2298  sorted_strings.emplace(std::stoul(fields[0].substr(1)), str);
2299  }
2300  llvm_ir += "\n";
2301  for (auto [id, text] : sorted_strings) {
2302  llvm_ir += text;
2303  llvm_ir += "\n";
2304  }
2305  }
2306 
2307  return llvm_ir;
2308 }
2309 #endif // NDEBUG
2310 
2311 } // namespace
2312 
2313 std::tuple<CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
2314 Executor::compileWorkUnit(const std::vector<InputTableInfo>& query_infos,
2315  const PlanState::DeletedColumnsMap& deleted_cols_map,
2316  const RelAlgExecutionUnit& ra_exe_unit,
2317  const CompilationOptions& co,
2318  const ExecutionOptions& eo,
2319  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
2320  const bool allow_lazy_fetch,
2321  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
2322  const size_t max_groups_buffer_entry_guess,
2323  const int8_t crt_min_byte_width,
2324  const bool has_cardinality_estimation,
2325  ColumnCacheMap& column_cache,
2326  RenderInfo* render_info) {
2327  auto timer = DEBUG_TIMER(__func__);
2328 
 2329  if (co.device_type == ExecutorDeviceType::GPU) {
 2330  const auto cuda_mgr = data_mgr_->getCudaMgr();
2331  if (!cuda_mgr) {
2332  throw QueryMustRunOnCpu();
2333  }
2334  }
2335 
2336 #ifndef NDEBUG
2337  static std::uint64_t counter = 0;
2338  ++counter;
2339  VLOG(1) << "CODEGEN #" << counter << ":";
2340  LOG(IR) << "CODEGEN #" << counter << ":";
2341  LOG(PTX) << "CODEGEN #" << counter << ":";
2342  LOG(ASM) << "CODEGEN #" << counter << ":";
2343 #endif
2344 
2345  nukeOldState(allow_lazy_fetch, query_infos, deleted_cols_map, &ra_exe_unit);
2346 
2347  addTransientStringLiterals(ra_exe_unit, row_set_mem_owner);
2348 
2349  GroupByAndAggregate group_by_and_aggregate(
2350  this,
2351  co.device_type,
2352  ra_exe_unit,
2353  query_infos,
2354  row_set_mem_owner,
2355  has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
2356  : std::nullopt);
2357  auto query_mem_desc =
2358  group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
2359  max_groups_buffer_entry_guess,
2360  crt_min_byte_width,
2361  render_info,
 2362  eo.output_columnar_hint);
 2363 
2364  if (query_mem_desc->getQueryDescriptionType() ==
 2365  QueryDescriptionType::GroupByBaselineHash &&
 2366  !has_cardinality_estimation &&
2367  (!render_info || !render_info->isPotentialInSituRender()) && !eo.just_explain) {
2368  const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
2369  throw CardinalityEstimationRequired(col_range_info.max - col_range_info.min);
2370  }
2371 
2372  const bool output_columnar = query_mem_desc->didOutputColumnar();
2373  const bool gpu_shared_mem_optimization =
 2374  is_gpu_shared_mem_supported(query_mem_desc.get(),
 2375  ra_exe_unit,
2376  cuda_mgr,
2377  co.device_type,
2378  cuda_mgr ? this->blockSize() : 1,
2379  cuda_mgr ? this->numBlocksPerMP() : 1);
2380  if (gpu_shared_mem_optimization) {
2381  // disable interleaved bins optimization on the GPU
2382  query_mem_desc->setHasInterleavedBinsOnGpu(false);
2383  LOG(DEBUG1) << "GPU shared memory is used for the " +
2384  query_mem_desc->queryDescTypeToString() + " query(" +
2385  std::to_string(get_shared_memory_size(gpu_shared_mem_optimization,
2386  query_mem_desc.get())) +
2387  " out of " + std::to_string(g_gpu_smem_threshold) + " bytes).";
2388  }
2389 
2390  const GpuSharedMemoryContext gpu_smem_context(
2391  get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));
2392 
 2393  if (co.device_type == ExecutorDeviceType::GPU) {
 2394  const size_t num_count_distinct_descs =
2395  query_mem_desc->getCountDistinctDescriptorsSize();
2396  for (size_t i = 0; i < num_count_distinct_descs; i++) {
2397  const auto& count_distinct_descriptor =
2398  query_mem_desc->getCountDistinctDescriptor(i);
2399  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::StdSet ||
2400  (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
2401  !co.hoist_literals)) {
2402  throw QueryMustRunOnCpu();
2403  }
2404  }
2405  }
2406 
2407  // Read the module template and target either CPU or GPU
2408  // by binding the stream position functions to the right implementation:
2409  // stride access for GPU, contiguous for CPU
2410  auto rt_module_copy = llvm::CloneModule(
2411  *g_rt_module.get(), cgen_state_->vmap_, [](const llvm::GlobalValue* gv) {
2412  auto func = llvm::dyn_cast<llvm::Function>(gv);
2413  if (!func) {
2414  return true;
2415  }
2416  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
2417  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage ||
 2418  CodeGenerator::alwaysCloneRuntimeFunction(func));
 2419  });
 2420  if (co.device_type == ExecutorDeviceType::CPU) {
 2421  if (is_udf_module_present(true)) {
2422  CodeGenerator::link_udf_module(udf_cpu_module, *rt_module_copy, cgen_state_.get());
2423  }
2424  if (is_rt_udf_module_present(true)) {
 2426  CodeGenerator::link_udf_module(rt_udf_cpu_module, *rt_module_copy, cgen_state_.get());
2427  }
2428  } else {
2429  rt_module_copy->setDataLayout(get_gpu_data_layout());
2430  rt_module_copy->setTargetTriple(get_gpu_target_triple_string());
2431  if (is_udf_module_present()) {
2432  CodeGenerator::link_udf_module(udf_gpu_module, *rt_module_copy, cgen_state_.get());
2433  }
2434  if (is_rt_udf_module_present()) {
 2436  CodeGenerator::link_udf_module(rt_udf_gpu_module, *rt_module_copy, cgen_state_.get());
2437  }
2438  }
2439 
2440  cgen_state_->module_ = rt_module_copy.release();
2441  AUTOMATIC_IR_METADATA(cgen_state_.get());
2442 
2443  auto agg_fnames =
2444  get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
2445 
2446  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();
2447 
2448  const bool is_group_by{query_mem_desc->isGroupBy()};
2449  auto [query_func, row_func_call] = is_group_by
2450  ? query_group_by_template(cgen_state_->module_,
2451  co.hoist_literals,
2452  *query_mem_desc,
2453  co.device_type,
2454  ra_exe_unit.scan_limit,
2455  gpu_smem_context)
2456  : query_template(cgen_state_->module_,
2457  agg_slot_count,
2458  co.hoist_literals,
2459  !!ra_exe_unit.estimator,
2460  gpu_smem_context);
2461  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
2462  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
2463  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);
2464 
2465  cgen_state_->query_func_ = query_func;
2466  cgen_state_->row_func_call_ = row_func_call;
2467  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
2468  &query_func->getEntryBlock().front());
2469 
2470  // Generate the function signature and column head fetches s.t.
2471  // double indirection isn't needed in the inner loop
2472  auto& fetch_bb = query_func->front();
2473  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
2474  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
2475  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
2476  query_func->args().begin(),
2477  fetch_ir_builder,
2478  cgen_state_->context_);
2479  CHECK_EQ(ra_exe_unit.input_col_descs.size(), col_heads.size());
2480 
2481  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
2482  is_group_by ? 0 : agg_slot_count,
2483  co.hoist_literals,
2484  cgen_state_->module_,
2485  cgen_state_->context_);
2486  CHECK(cgen_state_->row_func_);
2487  cgen_state_->row_func_bb_ =
2488  llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
2489 
2491  auto filter_func_ft =
2492  llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
2493  cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
2494  llvm::Function::ExternalLinkage,
2495  "filter_func",
2496  cgen_state_->module_);
2497  CHECK(cgen_state_->filter_func_);
2498  cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
2499  cgen_state_->context_, "entry", cgen_state_->filter_func_);
2500  }
2501 
2502  cgen_state_->current_func_ = cgen_state_->row_func_;
2503  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2504 
2505  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
2506  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
2507  const auto join_loops =
2508  buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
2509 
2510  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
2511  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
2512  if (is_not_deleted_bb) {
2513  cgen_state_->row_func_bb_ = is_not_deleted_bb;
2514  }
2515  if (!join_loops.empty()) {
2516  codegenJoinLoops(join_loops,
2517  body_execution_unit,
2518  group_by_and_aggregate,
2519  query_func,
2520  cgen_state_->row_func_bb_,
2521  *(query_mem_desc.get()),
2522  co,
2523  eo);
2524  } else {
2525  const bool can_return_error = compileBody(
2526  ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
 2527  if (can_return_error || cgen_state_->needs_error_check_ || eo.with_dynamic_watchdog ||
 2528  eo.allow_runtime_query_interrupt) {
 2529  createErrorCheckControlFlow(query_func,
 2530  eo.with_dynamic_watchdog,
 2531  eo.allow_runtime_query_interrupt,
2532  co.device_type,
2533  group_by_and_aggregate.query_infos_);
2534  }
2535  }
2536  std::vector<llvm::Value*> hoisted_literals;
2537 
2538  if (co.hoist_literals) {
2539  VLOG(1) << "number of hoisted literals: "
2540  << cgen_state_->query_func_literal_loads_.size()
2541  << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
2542  << " bytes";
2543  }
2544 
2545  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
2546  // we have some hoisted literals...
2547  hoisted_literals = inlineHoistedLiterals();
2548  }
2549 
2550  // replace the row func placeholder call with the call to the actual row func
2551  std::vector<llvm::Value*> row_func_args;
2552  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumArgOperands(); ++i) {
2553  row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
2554  }
2555  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
2556  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
2557  // push hoisted literals arguments, if any
2558  row_func_args.insert(
2559  row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
2560  llvm::ReplaceInstWithInst(
2561  cgen_state_->row_func_call_,
2562  llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));
2563 
2564  // replace the filter func placeholder call with the call to the actual filter func
2565  if (cgen_state_->filter_func_) {
2566  std::vector<llvm::Value*> filter_func_args;
2567  for (auto arg_it = cgen_state_->filter_func_args_.begin();
2568  arg_it != cgen_state_->filter_func_args_.end();
2569  ++arg_it) {
2570  filter_func_args.push_back(arg_it->first);
2571  }
2572  llvm::ReplaceInstWithInst(
2573  cgen_state_->filter_func_call_,
2574  llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
2575  }
2576 
2577  // Aggregate
2578  plan_state_->init_agg_vals_ =
2579  init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);
2580 
2581  /*
2582  * If we have decided to use GPU shared memory (decision is not made here), then
2583  * we generate proper code for extra components that it needs (buffer initialization and
2584  * gpu reduction from shared memory to global memory). We then replace these functions
2585  * into the already compiled query_func (replacing two placeholders, write_back_nop and
2586  * init_smem_nop). The rest of the code should be as before (row_func, etc.).
2587  */
2588  if (gpu_smem_context.isSharedMemoryUsed()) {
2589  if (query_mem_desc->getQueryDescriptionType() ==
2591  GpuSharedMemCodeBuilder gpu_smem_code(
2592  cgen_state_->module_,
2593  cgen_state_->context_,
2594  *query_mem_desc,
2596  plan_state_->init_agg_vals_);
2597  gpu_smem_code.codegen();
2598  gpu_smem_code.injectFunctionsInto(query_func);
2599 
2600  // helper functions are used for caching purposes later
2601  cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
2602  cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
2603  LOG(IR) << gpu_smem_code.toString();
2604  }
2605  }
2606 
2607  auto multifrag_query_func = cgen_state_->module_->getFunction(
2608  "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
2609  CHECK(multifrag_query_func);
2610 
 2611  if (co.device_type == ExecutorDeviceType::CPU) {
 2612  insertErrorCodeChecker(
2613  multifrag_query_func, co.hoist_literals, eo.allow_runtime_query_interrupt);
2614  }
2615 
2616  bind_query(query_func,
2617  "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
2618  multifrag_query_func,
2619  cgen_state_->module_);
2620 
2621  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
2622  if (cgen_state_->filter_func_) {
2623  root_funcs.push_back(cgen_state_->filter_func_);
2624  }
2625  auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
2626  *cgen_state_->module_, root_funcs, {multifrag_query_func});
2627 
2628  // Always inline the row function and the filter function.
2629  // We don't want register spills in the inner loops.
2630  // LLVM seems to correctly free up alloca instructions
2631  // in these functions even when they are inlined.
2632  mark_function_always_inline(cgen_state_->row_func_);
2633  if (cgen_state_->filter_func_) {
2634  mark_function_always_inline(cgen_state_->filter_func_);
2635  }
2636 
2637 #ifndef NDEBUG
2638  // Add helpful metadata to the LLVM IR for debugging.
 2639  AUTOMATIC_IR_METADATA_DONE();
 2640 #endif
2641 
2642  // Serialize the important LLVM IR functions to text for SQL EXPLAIN.
2643  std::string llvm_ir;
2644  if (eo.just_explain) {
 2645  if (co.explain_type == ExecutorExplainType::Optimized) {
 2646 #ifdef WITH_JIT_DEBUG
2647  throw std::runtime_error(
2648  "Explain optimized not available when JIT runtime debug symbols are enabled");
2649 #else
2650  // Note that we don't run the NVVM reflect pass here. Use LOG(IR) to get the
2651  // optimized IR after NVVM reflect
2652  llvm::legacy::PassManager pass_manager;
2653  optimize_ir(query_func, cgen_state_->module_, pass_manager, live_funcs, co);
2654 #endif // WITH_JIT_DEBUG
2655  }
2656  llvm_ir =
2657  serialize_llvm_object(multifrag_query_func) + serialize_llvm_object(query_func) +
2658  serialize_llvm_object(cgen_state_->row_func_) +
2659  (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2660  : "");
2661 
2662 #ifndef NDEBUG
2663  llvm_ir += serialize_llvm_metadata_footnotes(query_func, cgen_state_.get());
2664 #endif
2665  }
2666 
2667  LOG(IR) << "\n\n" << query_mem_desc->toString() << "\n";
2668  LOG(IR) << "IR for the "
2669  << (co.device_type == ExecutorDeviceType::CPU ? "CPU:\n" : "GPU:\n");
2670 #ifdef NDEBUG
2671  LOG(IR) << serialize_llvm_object(query_func)
2672  << serialize_llvm_object(cgen_state_->row_func_)
2673  << (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2674  : "")
2675  << "\nEnd of IR";
2676 #else
2677  LOG(IR) << serialize_llvm_object(cgen_state_->module_) << "\nEnd of IR";
2678 #endif
2679 
2680  // Run some basic validation checks on the LLVM IR before code is generated below.
2681  verify_function_ir(cgen_state_->row_func_);
2682  if (cgen_state_->filter_func_) {
2683  verify_function_ir(cgen_state_->filter_func_);
2684  }
2685 
2686  // Generate final native code from the LLVM IR.
2687  return std::make_tuple(
 2688  CompilationResult{
 2689  co.device_type == ExecutorDeviceType::CPU
 2690  ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
2691  : optimizeAndCodegenGPU(query_func,
2692  multifrag_query_func,
2693  live_funcs,
2694  is_group_by || ra_exe_unit.estimator,
2695  cuda_mgr,
2696  co),
2697  cgen_state_->getLiterals(),
2698  output_columnar,
2699  llvm_ir,
2700  std::move(gpu_smem_context)},
2701  std::move(query_mem_desc));
2702 }
2703 
2704 void Executor::insertErrorCodeChecker(llvm::Function* query_func,
2705  bool hoist_literals,
2706  bool allow_runtime_query_interrupt) {
2707  auto query_stub_func_name =
2708  "query_stub" + std::string(hoist_literals ? "_hoisted_literals" : "");
2709  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
2710  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
2711  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
2712  continue;
2713  }
2714  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
2715  if (std::string(row_func_call.getCalledFunction()->getName()) ==
2716  query_stub_func_name) {
2717  auto next_inst_it = inst_it;
2718  ++next_inst_it;
2719  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
2720  auto& br_instr = bb_it->back();
2721  llvm::IRBuilder<> ir_builder(&br_instr);
2722  llvm::Value* err_lv = &*inst_it;
2723  auto error_check_bb =
2724  bb_it->splitBasicBlock(llvm::BasicBlock::iterator(br_instr), ".error_check");
2725  llvm::Value* error_code_arg = nullptr;
2726  auto arg_cnt = 0;
2727  for (auto arg_it = query_func->arg_begin(); arg_it != query_func->arg_end();
2728  arg_it++, ++arg_cnt) {
 2729  // since the multifrag_* function has anonymous arguments, we use the argument
 2730  // offset explicitly to capture the "error_code" argument in its argument list
2731  if (hoist_literals) {
2732  if (arg_cnt == 9) {
2733  error_code_arg = &*arg_it;
2734  break;
2735  }
2736  } else {
2737  if (arg_cnt == 8) {
2738  error_code_arg = &*arg_it;
2739  break;
2740  }
2741  }
2742  }
2743  CHECK(error_code_arg);
2744  llvm::Value* err_code = nullptr;
2745  if (allow_runtime_query_interrupt) {
2746  // decide the final error code with a consideration of interrupt status
2747  auto& check_interrupt_br_instr = bb_it->back();
2748  auto interrupt_check_bb = llvm::BasicBlock::Create(
2749  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
2750  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
2751  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
2752  cgen_state_->module_->getFunction("check_interrupt"), {});
2753  auto detected_error = interrupt_checker_ir_builder.CreateCall(
2754  cgen_state_->module_->getFunction("get_error_code"),
2755  std::vector<llvm::Value*>{error_code_arg});
2756  err_code = interrupt_checker_ir_builder.CreateSelect(
2757  detected_interrupt,
2758  cgen_state_->llInt(Executor::ERR_INTERRUPTED),
2759  detected_error);
2760  interrupt_checker_ir_builder.CreateBr(error_check_bb);
2761  llvm::ReplaceInstWithInst(&check_interrupt_br_instr,
2762  llvm::BranchInst::Create(interrupt_check_bb));
2763  ir_builder.SetInsertPoint(&br_instr);
2764  } else {
2765  // use the error code returned from row_func and skip the interrupt status check
2766  ir_builder.SetInsertPoint(&br_instr);
2767  err_code =
2768  ir_builder.CreateCall(cgen_state_->module_->getFunction("get_error_code"),
2769  std::vector<llvm::Value*>{error_code_arg});
2770  }
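  // If the final error code is non-zero, branch to an .error_exit block that
  // records it via record_error_code and returns; otherwise continue in new_bb.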
2771  err_lv = ir_builder.CreateICmp(
2772  llvm::ICmpInst::ICMP_NE, err_code, cgen_state_->llInt(0));
2773  auto error_bb = llvm::BasicBlock::Create(
2774  cgen_state_->context_, ".error_exit", query_func, new_bb);
2775  llvm::CallInst::Create(cgen_state_->module_->getFunction("record_error_code"),
2776  std::vector<llvm::Value*>{err_code, error_code_arg},
2777  "",
2778  error_bb);
2779  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
2780  llvm::ReplaceInstWithInst(&br_instr,
2781  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
2782  break;
2783  }
2784  }
2785  }
2786 }
2787 
2788 llvm::BasicBlock* Executor::codegenSkipDeletedOuterTableRow(
2789  const RelAlgExecutionUnit& ra_exe_unit,
2790  const CompilationOptions& co) {
2791  AUTOMATIC_IR_METADATA(cgen_state_.get());
2792  if (!co.filter_on_deleted_column) {
2793  return nullptr;
2794  }
2795  CHECK(!ra_exe_unit.input_descs.empty());
2796  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
2797  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
2798  return nullptr;
2799  }
2800  const auto deleted_cd =
2801  plan_state_->getDeletedColForTable(outer_input_desc.getTableId());
2802  if (!deleted_cd) {
2803  return nullptr;
2804  }
2805  CHECK(deleted_cd->columnType.is_boolean());
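  // Build a ColumnVar for the table's deleted-row flag column and branch on it:
  // deleted rows return 0 from the row function immediately, while surviving
  // rows continue in the "is_not_deleted" block returned to the caller.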
2806  const auto deleted_expr =
2807  makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
2808  outer_input_desc.getTableId(),
2809  deleted_cd->columnId,
2810  outer_input_desc.getNestLevel());
2811  CodeGenerator code_generator(this);
2812  const auto is_deleted =
2813  code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
2814  const auto is_deleted_bb = llvm::BasicBlock::Create(
2815  cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
2816  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
2817  cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
2818  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
2819  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
2820  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2821  cgen_state_->ir_builder_.SetInsertPoint(bb);
2822  return bb;
2823 }
2824 
2825 bool Executor::compileBody(const RelAlgExecutionUnit& ra_exe_unit,
2826  GroupByAndAggregate& group_by_and_aggregate,
2827  const QueryMemoryDescriptor& query_mem_desc,
2828  const CompilationOptions& co,
2829  const GpuSharedMemoryContext& gpu_smem_context) {
2830  AUTOMATIC_IR_METADATA(cgen_state_.get());
2831 
2832  // Switch the code generation into a separate filter function if enabled.
2833  // Note that accesses to function arguments are still codegenned from the
2834  // row function's arguments, then later automatically forwarded and
2835  // remapped into filter function arguments by redeclareFilterFunction().
2836  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
2837  llvm::Value* loop_done{nullptr};
2838  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
2839  if (cgen_state_->filter_func_) {
2840  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2841  auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
2842  cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
2843  row_func_entry_bb->begin());
2844  loop_done = cgen_state_->ir_builder_.CreateAlloca(
2845  get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
2846  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2847  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
2848  }
2849  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
2850  cgen_state_->current_func_ = cgen_state_->filter_func_;
2851  fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
2852  }
2853 
2854  // generate the code for the filter
2855  std::vector<Analyzer::Expr*> primary_quals;
2856  std::vector<Analyzer::Expr*> deferred_quals;
2857  bool short_circuited = CodeGenerator::prioritizeQuals(
2858  ra_exe_unit, primary_quals, deferred_quals, plan_state_->hoisted_filters_);
2859  if (short_circuited) {
2860  VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
2861  << "short-circuited and deferred " << std::to_string(deferred_quals.size())
2862  << " quals";
2863  }
2864  llvm::Value* filter_lv = cgen_state_->llBool(true);
2865  CodeGenerator code_generator(this);
2866  for (auto expr : primary_quals) {
2867  // Generate the filter for primary quals
2868  auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
2869  filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
2870  }
2871  CHECK(filter_lv->getType()->isIntegerTy(1));
2872  llvm::BasicBlock* sc_false{nullptr};
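  // Short-circuit evaluation: branch on the primary filter first. The sc_false
  // path returns 0 when there are no join quals; the deferred quals are only
  // evaluated on the sc_true path.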
2873  if (!deferred_quals.empty()) {
2874  auto sc_true = llvm::BasicBlock::Create(
2875  cgen_state_->context_, "sc_true", cgen_state_->current_func_);
2876  sc_false = llvm::BasicBlock::Create(
2877  cgen_state_->context_, "sc_false", cgen_state_->current_func_);
2878  cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
2879  cgen_state_->ir_builder_.SetInsertPoint(sc_false);
2880  if (ra_exe_unit.join_quals.empty()) {
2881  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
2882  }
2883  cgen_state_->ir_builder_.SetInsertPoint(sc_true);
2884  filter_lv = cgen_state_->llBool(true);
2885  }
2886  for (auto expr : deferred_quals) {
2887  filter_lv = cgen_state_->ir_builder_.CreateAnd(
2888  filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
2889  }
2890 
2891  CHECK(filter_lv->getType()->isIntegerTy(1));
2892  auto ret = group_by_and_aggregate.codegen(
2893  filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);
2894 
2895  // Switch the code generation back to the row function if a filter
2896  // function was enabled.
2897  if (cgen_state_->filter_func_) {
2898  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2899  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
2900  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2901  }
2902 
2903  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2904  cgen_state_->current_func_ = cgen_state_->row_func_;
2905  cgen_state_->filter_func_call_ =
2906  cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});
2907 
2908  // Create the real filter function declaration after the placeholder call
2909  // has been emitted.
2910  redeclareFilterFunction();
2911 
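  // In the join-loop case, return the filter function's result only when the
  // loop body actually completed (loop_done is true); otherwise continue in
  // loop_done_false.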
2912  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2913  auto loop_done_true = llvm::BasicBlock::Create(
2914  cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
2915  auto loop_done_false = llvm::BasicBlock::Create(
2916  cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
2917  auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(loop_done);
2918  cgen_state_->ir_builder_.CreateCondBr(
2919  loop_done_flag, loop_done_true, loop_done_false);
2920  cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
2921  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
2922  cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
2923  } else {
2924  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
2925  }
2926  }
2927  return ret;
2928 }
2929 
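// Make a shallow copy of the runtime module: functions with external linkage
// are copied as declarations only (their definitions remain in g_rt_module),
// while private/internal-linkage functions and other globals are cloned in full.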
2930 std::unique_ptr<llvm::Module> runtime_module_shallow_copy(CgenState* cgen_state) {
2931  return llvm::CloneModule(
2932  *g_rt_module.get(), cgen_state->vmap_, [](const llvm::GlobalValue* gv) {
2933  auto func = llvm::dyn_cast<llvm::Function>(gv);
2934  if (!func) {
2935  return true;
2936  }
2937  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
2938  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage);
2939  });
2940 }
2941 
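// Load each column's base pointer from the byte_stream argument (an array of
// per-column buffer pointers), producing one "column head" value per input column.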
2942 std::vector<llvm::Value*> generate_column_heads_load(const int num_columns,
2943  llvm::Value* byte_stream_arg,
2944  llvm::IRBuilder<>& ir_builder,
2945  llvm::LLVMContext& ctx) {
2946  CHECK(byte_stream_arg);
2947  const auto max_col_local_id = num_columns - 1;
2948 
2949  std::vector<llvm::Value*> col_heads;
2950  for (int col_id = 0; col_id <= max_col_local_id; ++col_id) {
2951  col_heads.emplace_back(ir_builder.CreateLoad(ir_builder.CreateGEP(
2952  byte_stream_arg, llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), col_id))));
2953  }
2954  return col_heads;
2955 }
2956 