OmniSciDB  dfae7c3b14
NativeCodegen.cpp
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CodeGenerator.h"
18 #include "Execute.h"
20 #include "GpuSharedMemoryUtils.h"
23 #include "QueryTemplateGenerator.h"
24 
26 #include "Shared/MathUtils.h"
27 #include "StreamingTopN.h"
28 
29 #if LLVM_VERSION_MAJOR < 4
30 static_assert(false, "LLVM Version >= 4 is required.");
31 #endif
32 
33 #include <llvm/Bitcode/BitcodeReader.h>
34 #include <llvm/Bitcode/BitcodeWriter.h>
35 #include <llvm/ExecutionEngine/MCJIT.h>
36 #include <llvm/IR/Attributes.h>
37 #include <llvm/IR/GlobalValue.h>
38 #include <llvm/IR/InstIterator.h>
39 #include <llvm/IR/LegacyPassManager.h>
40 #include <llvm/IR/Verifier.h>
41 #include <llvm/IRReader/IRReader.h>
42 #include <llvm/Support/Casting.h>
43 #include <llvm/Support/FileSystem.h>
44 #include <llvm/Support/FormattedStream.h>
45 #include <llvm/Support/MemoryBuffer.h>
46 #include <llvm/Support/SourceMgr.h>
47 #include <llvm/Support/TargetRegistry.h>
48 #include <llvm/Support/TargetSelect.h>
49 #include <llvm/Support/raw_os_ostream.h>
50 #include <llvm/Transforms/IPO.h>
51 #include <llvm/Transforms/IPO/AlwaysInliner.h>
52 #include <llvm/Transforms/IPO/PassManagerBuilder.h>
53 #include <llvm/Transforms/InstCombine/InstCombine.h>
54 #include <llvm/Transforms/Instrumentation.h>
55 #include <llvm/Transforms/Scalar.h>
56 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
57 #include <llvm/Transforms/Utils/Cloning.h>
58 #include "llvm/IR/IntrinsicInst.h"
59 #include "llvm/IR/Intrinsics.h"
60 
61 #if LLVM_VERSION_MAJOR >= 7
62 #include <llvm/Transforms/Scalar/InstSimplifyPass.h>
63 #include <llvm/Transforms/Utils.h>
64 #endif
65 #include <llvm/IRReader/IRReader.h>
66 #include <llvm/Linker/Linker.h>
67 #include <llvm/Support/SourceMgr.h>
68 #include <llvm/Support/raw_ostream.h>
69 
71 
72 std::unique_ptr<llvm::Module> udf_gpu_module;
73 std::unique_ptr<llvm::Module> udf_cpu_module;
74 std::unique_ptr<llvm::Module> rt_udf_gpu_module;
75 std::unique_ptr<llvm::Module> rt_udf_cpu_module;
76 
77 extern std::unique_ptr<llvm::Module> g_rt_module;
78 
79 #ifdef HAVE_CUDA
80 extern std::unique_ptr<llvm::Module> g_rt_libdevice_module;
81 #endif
82 
83 #ifdef ENABLE_GEOS
84 extern std::unique_ptr<llvm::Module> g_rt_geos_module;
85 
86 #include <llvm/Support/DynamicLibrary.h>
87 
88 #ifndef GEOS_LIBRARY_FILENAME
89 #error Configuration should include GEOS library file name
90 #endif
91 std::unique_ptr<std::string> g_libgeos_so_filename(
92  new std::string(GEOS_LIBRARY_FILENAME));
93 static llvm::sys::DynamicLibrary geos_dynamic_library;
94 static std::mutex geos_init_mutex;
95 
96 namespace {
97 
98 void load_geos_dynamic_library() {
99  std::lock_guard<std::mutex> guard(geos_init_mutex);
100 
101  if (!geos_dynamic_library.isValid()) {
102  if (!g_libgeos_so_filename || g_libgeos_so_filename->empty()) {
103  LOG(WARNING) << "Misconfigured GEOS library file name, trying 'libgeos_c.so'";
104  g_libgeos_so_filename.reset(new std::string("libgeos_c.so"));
105  }
106  auto filename = *g_libgeos_so_filename;
107  std::string error_message;
108  geos_dynamic_library =
109  llvm::sys::DynamicLibrary::getPermanentLibrary(filename.c_str(), &error_message);
110  if (!geos_dynamic_library.isValid()) {
111  LOG(ERROR) << "Failed to load GEOS library '" + filename + "'";
112  std::string exception_message = "Failed to load GEOS library: " + error_message;
113  throw std::runtime_error(exception_message.c_str());
114  } else {
115  LOG(INFO) << "Loaded GEOS library '" + filename + "'";
116  }
117  }
118 }
119 
120 } // namespace
121 #endif
122 
123 namespace {
124 
125 /* SHOW_DEFINED(<llvm::Module instance>) prints the function names
126  that are defined in the given LLVM Module instance. Useful for
127  debugging.
128 */
129 
130 #define SHOW_DEFINED(MODULE) \
131  { \
132  std::cout << __func__ << "#" << __LINE__ << ": " #MODULE << " "; \
133  ::show_defined(MODULE); \
134  }
135 
136 static void show_defined(llvm::Module& module) {
137  std::cout << "defines: ";
138  for (auto& f : module.getFunctionList()) {
139  if (!f.isDeclaration()) {
140  std::cout << f.getName().str() << ", ";
141  }
142  }
143  std::cout << std::endl;
144 }
145 
146 static void show_defined(llvm::Module* module) {
147  show_defined(*module);
148 }
149 
150 static void show_defined(std::unique_ptr<llvm::Module>& module) {
151  show_defined(module.get());
152 }
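
// For illustration only (hypothetical module contents): on a module that defines
// row_func and filter_func, SHOW_DEFINED(module) would print something like
//   <caller>#<line>: module defines: row_func, filter_func,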
153 
154 #if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
155 void eliminate_dead_self_recursive_funcs(
156  llvm::Module& M,
157  const std::unordered_set<llvm::Function*>& live_funcs) {
158  std::vector<llvm::Function*> dead_funcs;
159  for (auto& F : M) {
160  bool bAlive = false;
161  if (live_funcs.count(&F)) {
162  continue;
163  }
164  for (auto U : F.users()) {
165  auto* C = llvm::dyn_cast<const llvm::CallInst>(U);
166  if (!C || C->getParent()->getParent() != &F) {
167  bAlive = true;
168  break;
169  }
170  }
171  if (!bAlive) {
172  dead_funcs.push_back(&F);
173  }
174  }
175  for (auto pFn : dead_funcs) {
176  pFn->eraseFromParent();
177  }
178 }
179 
180 // check if linking with libdevice is required
181 // libdevice functions have a __nv_* prefix
182 bool check_module_requires_libdevice(llvm::Module* module) {
183  for (llvm::Function& F : *module) {
184  if (F.hasName() && F.getName().startswith("__nv_")) {
185  LOG(INFO) << "Module requires linking with libdevice: " << std::string(F.getName());
186  return true;
187  }
188  }
189  LOG(INFO) << "module does not require linking against libdevice";
190  return false;
191 }
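
// For example, a GPU UDF that calls a libdevice math routine such as __nv_powf
// would make this check return true and trigger the libdevice linking step below.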
192 
193 // Adds any missing intrinsic declarations to the given module
194 void add_intrinsics_to_module(llvm::Module* module) {
195  for (llvm::Function& F : *module) {
196  for (llvm::Instruction& I : instructions(F)) {
197  if (llvm::IntrinsicInst* ii = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
198  if (llvm::Intrinsic::isOverloaded(ii->getIntrinsicID())) {
199  llvm::Type* Tys[] = {ii->getFunctionType()->getReturnType()};
200  llvm::Function& decl_fn =
201  *llvm::Intrinsic::getDeclaration(module, ii->getIntrinsicID(), Tys);
202  ii->setCalledFunction(&decl_fn);
203  } else {
204  // inserts the declaration into the module if not present
205  llvm::Intrinsic::getDeclaration(module, ii->getIntrinsicID());
206  }
207  }
208  }
209  }
210 }
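
// For illustration: a call to an overloaded intrinsic such as llvm.floor.f64 is
// re-bound to a declaration instantiated for its return type, while a
// non-overloaded intrinsic only needs its plain declaration added to the module.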
211 
212 void optimize_ir(llvm::Function* query_func,
213  llvm::Module* module,
214  llvm::legacy::PassManager& pass_manager,
215  const std::unordered_set<llvm::Function*>& live_funcs,
216  const CompilationOptions& co) {
217  pass_manager.add(llvm::createAlwaysInlinerLegacyPass());
218  pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
219 #if LLVM_VERSION_MAJOR >= 7
220  pass_manager.add(llvm::createInstSimplifyLegacyPass());
221 #else
222  pass_manager.add(llvm::createInstructionSimplifierPass());
223 #endif
224  pass_manager.add(llvm::createInstructionCombiningPass());
225  pass_manager.add(llvm::createGlobalOptimizerPass());
226 
227  pass_manager.add(llvm::createLICMPass());
228  if (co.opt_level == ExecutorOptLevel::LoopStrengthReduction) {
229  pass_manager.add(llvm::createLoopStrengthReducePass());
230  }
231  pass_manager.run(*module);
232 
233  eliminate_dead_self_recursive_funcs(*module, live_funcs);
234 }
235 #endif
236 
237 } // namespace
238 
240 
241 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine)
242  : execution_engine_(execution_engine) {}
243 
244 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine,
245  const CompilationOptions& co)
246  : execution_engine_(execution_engine) {
247  if (execution_engine_) {
248  if (co.register_intel_jit_listener) {
249 #ifdef ENABLE_INTEL_JIT_LISTENER
250  intel_jit_listener_.reset(llvm::JITEventListener::createIntelJITEventListener());
252  execution_engine_->RegisterJITEventListener(intel_jit_listener_.get());
253  LOG(INFO) << "Registered IntelJITEventListener";
254 #else
255  LOG(WARNING) << "This build is not Intel JIT Listener enabled. Ignoring Intel JIT "
256  "listener configuration parameter.";
257 #endif // ENABLE_INTEL_JIT_LISTENER
258  }
259  }
260 }
261 
262 ExecutionEngineWrapper& ExecutionEngineWrapper::operator=(
263  llvm::ExecutionEngine* execution_engine) {
264  execution_engine_.reset(execution_engine);
265  intel_jit_listener_ = nullptr;
266  return *this;
267 }
268 
269 void verify_function_ir(const llvm::Function* func) {
270  std::stringstream err_ss;
271  llvm::raw_os_ostream err_os(err_ss);
272  if (llvm::verifyFunction(*func, &err_os)) {
273  func->print(llvm::outs());
274  LOG(FATAL) << err_ss.str();
275  }
276 }
277 
278 std::shared_ptr<CompilationContext> Executor::getCodeFromCache(const CodeCacheKey& key,
279  const CodeCache& cache) {
280  auto it = cache.find(key);
281  if (it != cache.cend()) {
282  delete cgen_state_->module_;
283  cgen_state_->module_ = it->second.second;
284  return it->second.first;
285  }
286  return {};
287 }
288 
289 void Executor::addCodeToCache(const CodeCacheKey& key,
290  std::shared_ptr<CompilationContext> compilation_context,
291  llvm::Module* module,
292  CodeCache& cache) {
293  cache.put(key,
294  std::make_pair<std::shared_ptr<CompilationContext>, decltype(module)>(
295  std::move(compilation_context), std::move(module)));
296 }
297 
298 namespace {
299 
300 std::string assemblyForCPU(ExecutionEngineWrapper& execution_engine,
301  llvm::Module* module) {
302  llvm::legacy::PassManager pass_manager;
303  auto cpu_target_machine = execution_engine->getTargetMachine();
304  CHECK(cpu_target_machine);
305  llvm::SmallString<256> code_str;
306  llvm::raw_svector_ostream os(code_str);
307 #if LLVM_VERSION_MAJOR >= 10
308  cpu_target_machine->addPassesToEmitFile(
309  pass_manager, os, nullptr, llvm::CGFT_AssemblyFile);
310 #elif LLVM_VERSION_MAJOR >= 7
311  cpu_target_machine->addPassesToEmitFile(
312  pass_manager, os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
313 #else
314  cpu_target_machine->addPassesToEmitFile(
315  pass_manager, os, llvm::TargetMachine::CGFT_AssemblyFile);
316 #endif
317  pass_manager.run(*module);
318  return "Assembly for the CPU:\n" + std::string(code_str.str()) + "\nEnd of assembly";
319 }
320 
321 } // namespace
322 
323 ExecutionEngineWrapper CodeGenerator::generateNativeCPUCode(
324  llvm::Function* func,
325  const std::unordered_set<llvm::Function*>& live_funcs,
326  const CompilationOptions& co) {
327  auto module = func->getParent();
328  // run optimizations
329 #ifndef WITH_JIT_DEBUG
330  llvm::legacy::PassManager pass_manager;
331  optimize_ir(func, module, pass_manager, live_funcs, co);
332 #endif // WITH_JIT_DEBUG
333 
334  auto init_err = llvm::InitializeNativeTarget();
335  CHECK(!init_err);
336 
337  llvm::InitializeAllTargetMCs();
338  llvm::InitializeNativeTargetAsmPrinter();
339  llvm::InitializeNativeTargetAsmParser();
340 
341  std::string err_str;
342  std::unique_ptr<llvm::Module> owner(module);
343  llvm::EngineBuilder eb(std::move(owner));
344  eb.setErrorStr(&err_str);
345  eb.setEngineKind(llvm::EngineKind::JIT);
346  llvm::TargetOptions to;
347  to.EnableFastISel = true;
348  eb.setTargetOptions(to);
349  if (co.opt_level == ExecutorOptLevel::ReductionJIT) {
350  eb.setOptLevel(llvm::CodeGenOpt::None);
351  }
352 
353  ExecutionEngineWrapper execution_engine(eb.create(), co);
354  CHECK(execution_engine.get());
355  LOG(ASM) << assemblyForCPU(execution_engine, module);
356 
357  execution_engine->finalizeObject();
358 
359  return execution_engine;
360 }
361 
362 std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenCPU(
363  llvm::Function* query_func,
364  llvm::Function* multifrag_query_func,
365  const std::unordered_set<llvm::Function*>& live_funcs,
366  const CompilationOptions& co) {
367  auto module = multifrag_query_func->getParent();
368  CodeCacheKey key{serialize_llvm_object(query_func),
369  serialize_llvm_object(cgen_state_->row_func_)};
370  if (cgen_state_->filter_func_) {
371  key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
372  }
373  for (const auto helper : cgen_state_->helper_functions_) {
374  key.push_back(serialize_llvm_object(helper));
375  }
376  auto cached_code = getCodeFromCache(key, cpu_code_cache_);
377  if (cached_code) {
378  return cached_code;
379  }
380 
381  if (cgen_state_->needs_geos_) {
382 #ifdef ENABLE_GEOS
383  load_geos_dynamic_library();
384 
385  // Read geos runtime module and bind GEOS API function references to GEOS library
386  auto rt_geos_module_copy = llvm::CloneModule(
387 #if LLVM_VERSION_MAJOR >= 7
388  *g_rt_geos_module.get(),
389 #else
390  g_rt_geos_module.get(),
391 #endif
392  cgen_state_->vmap_,
393  [](const llvm::GlobalValue* gv) {
394  auto func = llvm::dyn_cast<llvm::Function>(gv);
395  if (!func) {
396  return true;
397  }
398  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
399  func->getLinkage() ==
400  llvm::GlobalValue::LinkageTypes::InternalLinkage ||
401  func->getLinkage() == llvm::GlobalValue::LinkageTypes::ExternalLinkage);
402  });
403  CodeGenerator::link_udf_module(rt_geos_module_copy,
404  *module,
405  cgen_state_.get(),
406  llvm::Linker::Flags::LinkOnlyNeeded);
407 #else
408  throw std::runtime_error("GEOS is disabled in this build");
409 #endif
410  }
411 
412  auto execution_engine =
413  CodeGenerator::generateNativeCPUCode(query_func, live_funcs, co);
414  auto cpu_compilation_context =
415  std::make_shared<CpuCompilationContext>(std::move(execution_engine));
416  cpu_compilation_context->setFunctionPointer(multifrag_query_func);
417  addCodeToCache(key, cpu_compilation_context, module, cpu_code_cache_);
418  return cpu_compilation_context;
419 }
420 
421 void CodeGenerator::link_udf_module(const std::unique_ptr<llvm::Module>& udf_module,
422  llvm::Module& module,
423  CgenState* cgen_state,
424  llvm::Linker::Flags flags) {
425  // Throw a runtime error if the target module already defines functions
426  // with the same names as functions in the UDF module.
427  for (auto& f : *udf_module.get()) {
428  auto func = module.getFunction(f.getName());
429  if (!(func == nullptr) && !f.isDeclaration() && flags == llvm::Linker::Flags::None) {
430  LOG(ERROR) << " Attempt to overwrite " << f.getName().str() << " in "
431  << module.getModuleIdentifier() << " from `"
432  << udf_module->getModuleIdentifier() << "`" << std::endl;
433  throw std::runtime_error(
434  "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
435  "function ***");
436  } else {
437  VLOG(1) << " Adding " << f.getName().str() << " to "
438  << module.getModuleIdentifier() << " from `"
439  << udf_module->getModuleIdentifier() << "`" << std::endl;
440  }
441  }
442 
443  std::unique_ptr<llvm::Module> udf_module_copy;
444 
445  udf_module_copy = llvm::CloneModule(
446 #if LLVM_VERSION_MAJOR >= 7
447  *udf_module.get(),
448 #else
449  udf_module.get(),
450 #endif
451  cgen_state->vmap_);
452 
453  udf_module_copy->setDataLayout(module.getDataLayout());
454  udf_module_copy->setTargetTriple(module.getTargetTriple());
455 
456  // Initialize linker with module for RuntimeFunctions.bc
457  llvm::Linker ld(module);
458  bool link_error = false;
459 
460  link_error = ld.linkInModule(std::move(udf_module_copy), flags);
461 
462  if (link_error) {
463  throw std::runtime_error("link_udf_module: *** error linking module ***");
464  }
465 }
466 
467 namespace {
468 
469 std::string cpp_to_llvm_name(const std::string& s) {
470  if (s == "int8_t") {
471  return "i8";
472  }
473  if (s == "int16_t") {
474  return "i16";
475  }
476  if (s == "int32_t") {
477  return "i32";
478  }
479  if (s == "int64_t") {
480  return "i64";
481  }
482  CHECK(s == "float" || s == "double");
483  return s;
484 }
485 
486 std::string gen_array_any_all_sigs() {
487  std::string result;
488  for (const std::string any_or_all : {"any", "all"}) {
489  for (const std::string elem_type :
490  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
491  for (const std::string needle_type :
492  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
493  for (const std::string op_name : {"eq", "ne", "lt", "le", "gt", "ge"}) {
494  result += ("declare i1 @array_" + any_or_all + "_" + op_name + "_" + elem_type +
495  "_" + needle_type + "(i8*, i64, " + cpp_to_llvm_name(needle_type) +
496  ", " + cpp_to_llvm_name(elem_type) + ");\n");
497  }
498  }
499  }
500  }
501  return result;
502 }
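
// A sample of the declarations generated above, for "any"/"eq" with an int32_t
// element type and an int64_t needle type:
//   declare i1 @array_any_eq_int32_t_int64_t(i8*, i64, i64, i32);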
503 
504 std::string gen_translate_null_key_sigs() {
505  std::string result;
506  for (const std::string key_type : {"int8_t", "int16_t", "int32_t", "int64_t"}) {
507  const auto key_llvm_type = cpp_to_llvm_name(key_type);
508  result += "declare i64 @translate_null_key_" + key_type + "(" + key_llvm_type + ", " +
509  key_llvm_type + ", i64);\n";
510  }
511  return result;
512 }
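
// A sample of the declarations generated above, for the int32_t key type:
//   declare i64 @translate_null_key_int32_t(i32, i32, i64);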
513 
514 const std::string cuda_rt_decls =
515  R"(
516 declare void @llvm.dbg.declare(metadata, metadata, metadata)
517 declare void @llvm.dbg.value(metadata, metadata, metadata)
518 declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
519 declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
520 declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
521 declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
522 declare i64 @get_thread_index();
523 declare i64 @get_block_index();
524 declare i32 @pos_start_impl(i32*);
525 declare i32 @group_buff_idx_impl();
526 declare i32 @pos_step_impl();
527 declare i8 @thread_warp_idx(i8);
528 declare i64* @init_shared_mem(i64*, i32);
529 declare i64* @init_shared_mem_nop(i64*, i32);
530 declare i64* @declare_dynamic_shared_memory();
531 declare void @write_back_nop(i64*, i64*, i32);
532 declare void @write_back_non_grouped_agg(i64*, i64*, i32);
533 declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8);
534 declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32, i64*);
535 declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32, i64*);
536 declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32);
537 declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32);
538 declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32);
539 declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32);
540 declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64);
541 declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64);
542 declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64);
543 declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64);
544 declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64);
545 declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double);
546 declare i64 @get_bucket_key_for_range_double(i8*, i64, double);
547 declare i32 @get_num_buckets_for_bounds(i8*, i32, double, double);
548 declare i64 @get_candidate_rows(i32*, i32, i8*, i32, double, double, i32, i64, i64*, i64, i64, i64);
549 declare i64 @agg_count_shared(i64*, i64);
550 declare i64 @agg_count_skip_val_shared(i64*, i64, i64);
551 declare i32 @agg_count_int32_shared(i32*, i32);
552 declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32);
553 declare i64 @agg_count_double_shared(i64*, double);
554 declare i64 @agg_count_double_skip_val_shared(i64*, double, double);
555 declare i32 @agg_count_float_shared(i32*, float);
556 declare i32 @agg_count_float_skip_val_shared(i32*, float, float);
557 declare i64 @agg_sum_shared(i64*, i64);
558 declare i64 @agg_sum_skip_val_shared(i64*, i64, i64);
559 declare i32 @agg_sum_int32_shared(i32*, i32);
560 declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32);
561 declare void @agg_sum_double_shared(i64*, double);
562 declare void @agg_sum_double_skip_val_shared(i64*, double, double);
563 declare void @agg_sum_float_shared(i32*, float);
564 declare void @agg_sum_float_skip_val_shared(i32*, float, float);
565 declare void @agg_max_shared(i64*, i64);
566 declare void @agg_max_skip_val_shared(i64*, i64, i64);
567 declare void @agg_max_int32_shared(i32*, i32);
568 declare void @agg_max_int32_skip_val_shared(i32*, i32, i32);
569 declare void @agg_max_int16_shared(i16*, i16);
570 declare void @agg_max_int16_skip_val_shared(i16*, i16, i16);
571 declare void @agg_max_int8_shared(i8*, i8);
572 declare void @agg_max_int8_skip_val_shared(i8*, i8, i8);
573 declare void @agg_max_double_shared(i64*, double);
574 declare void @agg_max_double_skip_val_shared(i64*, double, double);
575 declare void @agg_max_float_shared(i32*, float);
576 declare void @agg_max_float_skip_val_shared(i32*, float, float);
577 declare void @agg_min_shared(i64*, i64);
578 declare void @agg_min_skip_val_shared(i64*, i64, i64);
579 declare void @agg_min_int32_shared(i32*, i32);
580 declare void @agg_min_int32_skip_val_shared(i32*, i32, i32);
581 declare void @agg_min_int16_shared(i16*, i16);
582 declare void @agg_min_int16_skip_val_shared(i16*, i16, i16);
583 declare void @agg_min_int8_shared(i8*, i8);
584 declare void @agg_min_int8_skip_val_shared(i8*, i8, i8);
585 declare void @agg_min_double_shared(i64*, double);
586 declare void @agg_min_double_skip_val_shared(i64*, double, double);
587 declare void @agg_min_float_shared(i32*, float);
588 declare void @agg_min_float_skip_val_shared(i32*, float, float);
589 declare void @agg_id_shared(i64*, i64);
590 declare void @agg_id_int32_shared(i32*, i32);
591 declare void @agg_id_int16_shared(i16*, i16);
592 declare void @agg_id_int8_shared(i8*, i8);
593 declare void @agg_id_double_shared(i64*, double);
594 declare void @agg_id_double_shared_slow(i64*, double*);
595 declare void @agg_id_float_shared(i32*, float);
596 declare i32 @checked_single_agg_id_shared(i64*, i64, i64);
597 declare i32 @checked_single_agg_id_double_shared(i64*, double, double);
598 declare i32 @checked_single_agg_id_double_shared_slow(i64*, double*, double);
599 declare i32 @checked_single_agg_id_float_shared(i32*, float, float);
600 declare i1 @slotEmptyKeyCAS(i64*, i64, i64);
601 declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32);
602 declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16);
603 declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8);
604 declare i64 @datetrunc_century(i64);
605 declare i64 @datetrunc_day(i64);
606 declare i64 @datetrunc_decade(i64);
607 declare i64 @datetrunc_hour(i64);
608 declare i64 @datetrunc_millennium(i64);
609 declare i64 @datetrunc_minute(i64);
610 declare i64 @datetrunc_month(i64);
611 declare i64 @datetrunc_quarter(i64);
612 declare i64 @datetrunc_quarterday(i64);
613 declare i64 @datetrunc_week(i64);
614 declare i64 @datetrunc_year(i64);
615 declare i64 @extract_epoch(i64);
616 declare i64 @extract_dateepoch(i64);
617 declare i64 @extract_quarterday(i64);
618 declare i64 @extract_hour(i64);
619 declare i64 @extract_minute(i64);
620 declare i64 @extract_second(i64);
621 declare i64 @extract_millisecond(i64);
622 declare i64 @extract_microsecond(i64);
623 declare i64 @extract_nanosecond(i64);
624 declare i64 @extract_dow(i64);
625 declare i64 @extract_isodow(i64);
626 declare i64 @extract_day(i64);
627 declare i64 @extract_week(i64);
628 declare i64 @extract_day_of_year(i64);
629 declare i64 @extract_month(i64);
630 declare i64 @extract_quarter(i64);
631 declare i64 @extract_year(i64);
632 declare i64 @DateTruncateHighPrecisionToDate(i64, i64);
633 declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64);
634 declare i64 @DateDiff(i32, i64, i64);
635 declare i64 @DateDiffNullable(i32, i64, i64, i64);
636 declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i32);
637 declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i32, i64);
638 declare i64 @DateAdd(i32, i64, i64);
639 declare i64 @DateAddNullable(i32, i64, i64, i64);
640 declare i64 @DateAddHighPrecision(i32, i64, i64, i32);
641 declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i32, i64);
642 declare i64 @string_decode(i8*, i64);
643 declare i32 @array_size(i8*, i64, i32);
644 declare i32 @array_size_nullable(i8*, i64, i32, i32);
645 declare i32 @fast_fixlen_array_size(i8*, i32);
646 declare i1 @array_is_null(i8*, i64);
647 declare i1 @point_coord_array_is_null(i8*, i64);
648 declare i8* @array_buff(i8*, i64);
649 declare i8* @fast_fixlen_array_buff(i8*, i64);
650 declare i8 @array_at_int8_t(i8*, i64, i32);
651 declare i16 @array_at_int16_t(i8*, i64, i32);
652 declare i32 @array_at_int32_t(i8*, i64, i32);
653 declare i64 @array_at_int64_t(i8*, i64, i32);
654 declare float @array_at_float(i8*, i64, i32);
655 declare double @array_at_double(i8*, i64, i32);
656 declare i8 @varlen_array_at_int8_t(i8*, i64, i32);
657 declare i16 @varlen_array_at_int16_t(i8*, i64, i32);
658 declare i32 @varlen_array_at_int32_t(i8*, i64, i32);
659 declare i64 @varlen_array_at_int64_t(i8*, i64, i32);
660 declare float @varlen_array_at_float(i8*, i64, i32);
661 declare double @varlen_array_at_double(i8*, i64, i32);
662 declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32);
663 declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32);
664 declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32);
665 declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32);
666 declare float @varlen_notnull_array_at_float(i8*, i64, i32);
667 declare double @varlen_notnull_array_at_double(i8*, i64, i32);
668 declare i8 @array_at_int8_t_checked(i8*, i64, i64, i8);
669 declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16);
670 declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32);
671 declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64);
672 declare float @array_at_float_checked(i8*, i64, i64, float);
673 declare double @array_at_double_checked(i8*, i64, i64, double);
674 declare i32 @char_length(i8*, i32);
675 declare i32 @char_length_nullable(i8*, i32, i32);
676 declare i32 @char_length_encoded(i8*, i32);
677 declare i32 @char_length_encoded_nullable(i8*, i32, i32);
678 declare i32 @key_for_string_encoded(i32);
679 declare i1 @sample_ratio(double, i64);
680 declare i1 @string_like(i8*, i32, i8*, i32, i8);
681 declare i1 @string_ilike(i8*, i32, i8*, i32, i8);
682 declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8);
683 declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8);
684 declare i1 @string_like_simple(i8*, i32, i8*, i32);
685 declare i1 @string_ilike_simple(i8*, i32, i8*, i32);
686 declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8);
687 declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8);
688 declare i1 @string_lt(i8*, i32, i8*, i32);
689 declare i1 @string_le(i8*, i32, i8*, i32);
690 declare i1 @string_gt(i8*, i32, i8*, i32);
691 declare i1 @string_ge(i8*, i32, i8*, i32);
692 declare i1 @string_eq(i8*, i32, i8*, i32);
693 declare i1 @string_ne(i8*, i32, i8*, i32);
694 declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8);
695 declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8);
696 declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8);
697 declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8);
698 declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8);
699 declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8);
700 declare i1 @regexp_like(i8*, i32, i8*, i32, i8);
701 declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8);
702 declare void @linear_probabilistic_count(i8*, i32, i8*, i32);
703 declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, i64);
704 declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64);
705 declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64);
706 declare i32 @record_error_code(i32, i32*);
707 declare i1 @dynamic_watchdog();
708 declare i1 @check_interrupt();
709 declare void @force_sync();
710 declare void @sync_warp();
711 declare void @sync_warp_protected(i64, i64);
712 declare void @sync_threadblock();
713 declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32);
714 declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64);
715 declare i64* @get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, float, float);
716 declare i64* @get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double);
717 )" + gen_array_any_all_sigs() +
718  gen_translate_null_key_sigs();
719 
720 #ifdef HAVE_CUDA
721 std::string extension_function_decls(const std::unordered_set<std::string>& udf_decls) {
722  const auto decls = ExtensionFunctionsWhitelist::getLLVMDeclarations(udf_decls);
723  return boost::algorithm::join(decls, "\n");
724 }
725 
726 void legalize_nvvm_ir(llvm::Function* query_func) {
727  // optimizations might add attributes to the function
728  // and NVPTX doesn't understand all of them; play it
729  // safe and clear all attributes
730  clear_function_attributes(query_func);
731  verify_function_ir(query_func);
732 
733  std::vector<llvm::Instruction*> stackrestore_intrinsics;
734  std::vector<llvm::Instruction*> stacksave_intrinsics;
735  for (auto& BB : *query_func) {
736  for (llvm::Instruction& I : BB) {
737  if (const llvm::IntrinsicInst* II = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
738  if (II->getIntrinsicID() == llvm::Intrinsic::stacksave) {
739  stacksave_intrinsics.push_back(&I);
740  } else if (II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
741  stackrestore_intrinsics.push_back(&I);
742  }
743  }
744  }
745  }
746 
747  // stacksave and stackrestore intrinsics appear together, and
748  // stackrestore uses the stacksave result as its argument,
749  // so the stackrestore intrinsics are removed first.
750  for (auto& II : stackrestore_intrinsics) {
751  II->eraseFromParent();
752  }
753  for (auto& II : stacksave_intrinsics) {
754  II->eraseFromParent();
755  }
756 }
757 #endif // HAVE_CUDA
758 
759 } // namespace
760 
761 llvm::StringRef get_gpu_target_triple_string() {
762  return llvm::StringRef("nvptx64-nvidia-cuda");
763 }
764 
765 llvm::StringRef get_gpu_data_layout() {
766  return llvm::StringRef(
767  "e-p:64:64:64-i1:8:8-i8:8:8-"
768  "i16:16:16-i32:32:32-i64:64:64-"
769  "f32:32:32-f64:64:64-v16:16:16-"
770  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
771 }
772 
773 std::map<std::string, std::string> get_device_parameters(bool cpu_only) {
774  std::map<std::string, std::string> result;
775 
776  result.insert(std::make_pair("cpu_name", llvm::sys::getHostCPUName()));
777  result.insert(std::make_pair("cpu_triple", llvm::sys::getProcessTriple()));
778  result.insert(
779  std::make_pair("cpu_cores", std::to_string(llvm::sys::getHostNumPhysicalCores())));
780  result.insert(std::make_pair("cpu_threads", std::to_string(cpu_threads())));
781 
782  llvm::StringMap<bool> cpu_features;
783  if (llvm::sys::getHostCPUFeatures(cpu_features)) {
784  std::string features_str = "";
785  for (auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
786  features_str += (it->getValue() ? " +" : " -");
787  features_str += it->getKey().str();
788  }
789  result.insert(std::make_pair("cpu_features", features_str));
790  }
791 
792 #ifdef HAVE_CUDA
793  if (!cpu_only) {
794  int device_count = 0;
795  checkCudaErrors(cuDeviceGetCount(&device_count));
796  if (device_count) {
797  CUdevice device{};
798  char device_name[256];
799  int major = 0, minor = 0;
800  checkCudaErrors(cuDeviceGet(&device, 0)); // assuming homogeneous multi-GPU system
801  checkCudaErrors(cuDeviceGetName(device_name, 256, device));
802  checkCudaErrors(cuDeviceGetAttribute(
803  &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
804  checkCudaErrors(cuDeviceGetAttribute(
805  &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
806 
807  result.insert(std::make_pair("gpu_name", device_name));
808  result.insert(std::make_pair("gpu_count", std::to_string(device_count)));
809  result.insert(std::make_pair("gpu_compute_capability",
810  std::to_string(major) + "." + std::to_string(minor)));
811  result.insert(std::make_pair("gpu_triple", get_gpu_target_triple_string()));
812  result.insert(std::make_pair("gpu_datalayout", get_gpu_data_layout()));
813  }
814  }
815 #endif
816 
817  return result;
818 }
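
// For illustration only (hypothetical host and device), the returned map could
// contain entries such as:
//   {"cpu_name", "skylake"}, {"cpu_threads", "16"},
//   {"gpu_count", "1"}, {"gpu_compute_capability", "7.0"}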
819 
820 std::shared_ptr<GpuCompilationContext> CodeGenerator::generateNativeGPUCode(
821  llvm::Function* func,
822  llvm::Function* wrapper_func,
823  const std::unordered_set<llvm::Function*>& live_funcs,
824  const CompilationOptions& co,
825  const GPUTarget& gpu_target) {
826 #ifdef HAVE_CUDA
827  auto module = func->getParent();
828  /*
829  `func` is one of the following generated functions:
830  - `call_table_function(i8** %input_col_buffers, i64*
831  %input_row_count, i64** %output_buffers, i64* %output_row_count)`
832  that wraps the user-defined table function.
833  - `multifrag_query`
834  - `multifrag_query_hoisted_literals`
835  - ...
836 
837  `wrapper_func` is table_func_kernel(i32*, i8**, i64*, i64**,
838  i64*) that wraps `call_table_function`.
839 
840  `module` is from `build/QueryEngine/RuntimeFunctions.bc` and it
841  contains `func` and `wrapper_func`. `module` should also contain
842  the definitions of user-defined table functions.
843 
844  `live_funcs` contains table_func_kernel and call_table_function
845 
846  `gpu_target.cgen_state->module_` appears to be the same as `module`
847  */
848  CHECK(gpu_target.cgen_state->module_ == module);
849  module->setDataLayout(
850  "e-p:64:64:64-i1:8:8-i8:8:8-"
851  "i16:16:16-i32:32:32-i64:64:64-"
852  "f32:32:32-f64:64:64-v16:16:16-"
853  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
854  module->setTargetTriple("nvptx64-nvidia-cuda");
855  CHECK(gpu_target.nvptx_target_machine);
856  auto pass_manager_builder = llvm::PassManagerBuilder();
857 
858  pass_manager_builder.OptLevel = 0;
859  llvm::legacy::PassManager module_pass_manager;
860  pass_manager_builder.populateModulePassManager(module_pass_manager);
861 
862  bool requires_libdevice = check_module_requires_libdevice(module);
863 
864  if (requires_libdevice) {
865  // add nvvm reflect pass replacing any NVVM conditionals with constants
866  gpu_target.nvptx_target_machine->adjustPassManager(pass_manager_builder);
867  llvm::legacy::FunctionPassManager FPM(module);
868  pass_manager_builder.populateFunctionPassManager(FPM);
869 
870  // Run the NVVMReflectPass here rather than inside optimize_ir
871  FPM.doInitialization();
872  for (auto& F : *module) {
873  FPM.run(F);
874  }
875  FPM.doFinalization();
876  }
877 
878  // run optimizations
879  optimize_ir(func, module, module_pass_manager, live_funcs, co);
880  legalize_nvvm_ir(func);
881 
882  std::stringstream ss;
883  llvm::raw_os_ostream os(ss);
884 
885  llvm::LLVMContext& ctx = module->getContext();
886  // Get "nvvm.annotations" metadata node
887  llvm::NamedMDNode* md = module->getOrInsertNamedMetadata("nvvm.annotations");
888 
889  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
890  llvm::MDString::get(ctx, "kernel"),
891  llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
892  llvm::Type::getInt32Ty(ctx), 1))};
893 
894  // Append metadata to nvvm.annotations
895  md->addOperand(llvm::MDNode::get(ctx, md_vals));
896 
897  std::unordered_set<llvm::Function*> roots{wrapper_func, func};
898  if (gpu_target.row_func_not_inlined) {
900  roots.insert(gpu_target.cgen_state->row_func_);
901  if (gpu_target.cgen_state->filter_func_) {
902  roots.insert(gpu_target.cgen_state->filter_func_);
903  }
904  }
905 
906  // prevent helper functions from being removed
907  for (auto f : gpu_target.cgen_state->helper_functions_) {
908  roots.insert(f);
909  }
910 
911  // Prevent the UDF function(s) from being removed the way the runtime functions are
912  std::unordered_set<std::string> udf_declarations;
913  if (is_udf_module_present()) {
914  for (auto& f : udf_gpu_module->getFunctionList()) {
915  llvm::Function* udf_function = module->getFunction(f.getName());
916 
917  if (udf_function) {
918  legalize_nvvm_ir(udf_function);
919  roots.insert(udf_function);
920 
921  // If we have a UDF that declares an external function,
922  // note it so we can avoid duplicate declarations.
923  if (f.isDeclaration()) {
924  udf_declarations.insert(f.getName().str());
925  }
926  }
927  }
928  }
929 
930  if (is_rt_udf_module_present()) {
931  for (auto& f : rt_udf_gpu_module->getFunctionList()) {
932  llvm::Function* udf_function = module->getFunction(f.getName());
933  if (udf_function) {
934  legalize_nvvm_ir(udf_function);
935  roots.insert(udf_function);
936 
937  // If we have a UDF that declares an external function,
938  // note it so we can avoid duplicate declarations.
939  if (f.isDeclaration()) {
940  udf_declarations.insert(f.getName().str());
941  }
942  }
943  }
944  }
945 
946  std::vector<llvm::Function*> rt_funcs;
947  for (auto& Fn : *module) {
948  if (roots.count(&Fn)) {
949  continue;
950  }
951  rt_funcs.push_back(&Fn);
952  }
953  for (auto& pFn : rt_funcs) {
954  pFn->removeFromParent();
955  }
956 
957  if (requires_libdevice) {
958  add_intrinsics_to_module(module);
959  }
960 
961  module->print(os, nullptr);
962  os.flush();
963 
964  for (auto& pFn : rt_funcs) {
965  module->getFunctionList().push_back(pFn);
966  }
967  module->eraseNamedMetadata(md);
968 
969  auto cuda_llir = cuda_rt_decls + extension_function_decls(udf_declarations) + ss.str();
970  const auto ptx = generatePTX(
971  cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state->context_);
972 
973  LOG(PTX) << "PTX for the GPU:\n" << ptx << "\nEnd of PTX";
974 
975  auto cubin_result = ptx_to_cubin(ptx, gpu_target.block_size, gpu_target.cuda_mgr);
976  auto& option_keys = cubin_result.option_keys;
977  auto& option_values = cubin_result.option_values;
978  auto cubin = cubin_result.cubin;
979  auto link_state = cubin_result.link_state;
980  const auto num_options = option_keys.size();
981 
982  auto func_name = wrapper_func->getName().str();
983  auto gpu_compilation_context = std::make_shared<GpuCompilationContext>();
984  for (int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
985  ++device_id) {
986  gpu_compilation_context->addDeviceCode(
987  std::make_unique<GpuDeviceCompilationContext>(cubin,
988  func_name,
989  device_id,
990  gpu_target.cuda_mgr,
991  num_options,
992  &option_keys[0],
993  &option_values[0]));
994  }
995 
996  checkCudaErrors(cuLinkDestroy(link_state));
997  return gpu_compilation_context;
998 #else
999  return {};
1000 #endif
1001 }
1002 
1003 std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenGPU(
1004  llvm::Function* query_func,
1005  llvm::Function* multifrag_query_func,
1006  std::unordered_set<llvm::Function*>& live_funcs,
1007  const bool no_inline,
1008  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
1009  const CompilationOptions& co) {
1010 #ifdef HAVE_CUDA
1011  auto module = multifrag_query_func->getParent();
1012  CHECK(cuda_mgr);
1013  CodeCacheKey key{serialize_llvm_object(query_func),
1014  serialize_llvm_object(cgen_state_->row_func_)};
1015  if (cgen_state_->filter_func_) {
1016  key.push_back(serialize_llvm_object(cgen_state_->filter_func_));
1017  }
1018  for (const auto helper : cgen_state_->helper_functions_) {
1019  key.push_back(serialize_llvm_object(helper));
1020  }
1021  auto cached_code = getCodeFromCache(key, gpu_code_cache_);
1022  if (cached_code) {
1023  return cached_code;
1024  }
1025 
1026  bool row_func_not_inlined = false;
1027  if (no_inline) {
1028  for (auto it = llvm::inst_begin(cgen_state_->row_func_),
1029  e = llvm::inst_end(cgen_state_->row_func_);
1030  it != e;
1031  ++it) {
1032  if (llvm::isa<llvm::CallInst>(*it)) {
1033  auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
1034  if (get_gv_call.getCalledFunction()->getName() == "array_size" ||
1035  get_gv_call.getCalledFunction()->getName() == "linear_probabilistic_count") {
1036  mark_function_never_inline(cgen_state_->row_func_);
1037  row_func_not_inlined = true;
1038  break;
1039  }
1040  }
1041  }
1042  }
1043 
1044  initializeNVPTXBackend();
1045  CodeGenerator::GPUTarget gpu_target{nvptx_target_machine_.get(),
1046  cuda_mgr,
1047  blockSize(),
1048  cgen_state_.get(),
1049  row_func_not_inlined};
1050  std::shared_ptr<GpuCompilationContext> compilation_context;
1051 
1052  if (check_module_requires_libdevice(module)) {
1053  if (g_rt_libdevice_module == nullptr) {
1054  // raise error
1055  throw std::runtime_error(
1056  "libdevice library is not available but required by the UDF module");
1057  }
1058 
1059  // Bind libdevice to the current module
1060  CodeGenerator::link_udf_module(g_rt_libdevice_module,
1061  *module,
1062  cgen_state_.get(),
1063  llvm::Linker::Flags::OverrideFromSrc);
1064 
1065  // activate nvvm-reflect-ftz flag on the module
1066  module->addModuleFlag(llvm::Module::Override, "nvvm-reflect-ftz", (int)1);
1067  for (llvm::Function& fn : *module) {
1068  fn.addFnAttr("nvptx-f32ftz", "true");
1069  }
1070  }
1071 
1072  try {
1073  compilation_context = CodeGenerator::generateNativeGPUCode(
1074  query_func, multifrag_query_func, live_funcs, co, gpu_target);
1075  addCodeToCache(key, compilation_context, module, gpu_code_cache_);
1076  } catch (CudaMgr_Namespace::CudaErrorException& cuda_error) {
1077  if (cuda_error.getStatus() == CUDA_ERROR_OUT_OF_MEMORY) {
1078  // Thrown if memory could not be allocated on the GPU.
1079  // Retry once after evicting a portion of the code cache.
1080  LOG(WARNING) << "Failed to allocate GPU memory for generated code. Evicting "
1081  << g_fraction_code_cache_to_evict * 100.
1082  << "% of GPU code cache and re-trying.";
1083  gpu_code_cache_.evictFractionEntries(g_fraction_code_cache_to_evict);
1084  compilation_context = CodeGenerator::generateNativeGPUCode(
1085  query_func, multifrag_query_func, live_funcs, co, gpu_target);
1086  addCodeToCache(key, compilation_context, module, gpu_code_cache_);
1087  } else {
1088  throw;
1089  }
1090  }
1091  CHECK(compilation_context);
1092  return compilation_context;
1093 #else
1094  return nullptr;
1095 #endif
1096 }
1097 
1098 std::string CodeGenerator::generatePTX(const std::string& cuda_llir,
1099  llvm::TargetMachine* nvptx_target_machine,
1100  llvm::LLVMContext& context) {
1101  auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir, "", false);
1102 
1103  llvm::SMDiagnostic err;
1104 
1105  auto module = llvm::parseIR(mem_buff->getMemBufferRef(), err, context);
1106  if (!module) {
1107  LOG(FATAL) << err.getMessage().str();
1108  }
1109 
1110  llvm::SmallString<256> code_str;
1111  llvm::raw_svector_ostream formatted_os(code_str);
1112  CHECK(nvptx_target_machine);
1113  {
1114  llvm::legacy::PassManager ptxgen_pm;
1115  module->setDataLayout(nvptx_target_machine->createDataLayout());
1116 
1117 #if LLVM_VERSION_MAJOR >= 10
1118  nvptx_target_machine->addPassesToEmitFile(
1119  ptxgen_pm, formatted_os, nullptr, llvm::CGFT_AssemblyFile);
1120 #elif LLVM_VERSION_MAJOR >= 7
1121  nvptx_target_machine->addPassesToEmitFile(
1122  ptxgen_pm, formatted_os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
1123 #else
1124  nvptx_target_machine->addPassesToEmitFile(
1125  ptxgen_pm, formatted_os, llvm::TargetMachine::CGFT_AssemblyFile);
1126 #endif
1127  ptxgen_pm.run(*module);
1128  }
1129 
1130  return code_str.str();
1131 }
1132 
1133 std::unique_ptr<llvm::TargetMachine> CodeGenerator::initializeNVPTXBackend(
1134  const CudaMgr_Namespace::NvidiaDeviceArch arch) {
1135  llvm::InitializeAllTargets();
1136  llvm::InitializeAllTargetMCs();
1137  llvm::InitializeAllAsmPrinters();
1138  std::string err;
1139  auto target = llvm::TargetRegistry::lookupTarget("nvptx64", err);
1140  if (!target) {
1141  LOG(FATAL) << err;
1142  }
1143  return std::unique_ptr<llvm::TargetMachine>(
1144  target->createTargetMachine("nvptx64-nvidia-cuda",
1146  "",
1147  llvm::TargetOptions(),
1148  llvm::Reloc::Static));
1149 }
1150 
1151 std::string Executor::generatePTX(const std::string& cuda_llir) const {
1152  return CodeGenerator::generatePTX(
1153  cuda_llir, nvptx_target_machine_.get(), cgen_state_->context_);
1154 }
1155 
1156 void Executor::initializeNVPTXBackend() const {
1157  if (nvptx_target_machine_) {
1158  return;
1159  }
1160  const auto cuda_mgr = catalog_->getDataMgr().getCudaMgr();
1161  LOG_IF(FATAL, cuda_mgr == nullptr) << "No CudaMgr instantiated, unable to check device "
1162  "architecture or generate code for nvidia GPUs.";
1163  const auto arch = cuda_mgr->getDeviceArch();
1164  nvptx_target_machine_ = CodeGenerator::initializeNVPTXBackend(arch);
1165 }
1166 
1167 // A small number of runtime functions don't get through CgenState::emitCall. List them
1168 // explicitly here and always clone their implementation from the runtime module.
1169 bool CodeGenerator::alwaysCloneRuntimeFunction(const llvm::Function* func) {
1170  return func->getName() == "query_stub_hoisted_literals" ||
1171  func->getName() == "multifrag_query_hoisted_literals" ||
1172  func->getName() == "query_stub" || func->getName() == "multifrag_query" ||
1173  func->getName() == "fixed_width_int_decode" ||
1174  func->getName() == "fixed_width_unsigned_decode" ||
1175  func->getName() == "diff_fixed_width_int_decode" ||
1176  func->getName() == "fixed_width_double_decode" ||
1177  func->getName() == "fixed_width_float_decode" ||
1178  func->getName() == "fixed_width_small_date_decode" ||
1179  func->getName() == "record_error_code";
1180 }
1181 
1182 llvm::Module* read_template_module(llvm::LLVMContext& context) {
1183  llvm::SMDiagnostic err;
1184 
1185  auto buffer_or_error = llvm::MemoryBuffer::getFile(omnisci::get_root_abs_path() +
1186  "/QueryEngine/RuntimeFunctions.bc");
1187  CHECK(!buffer_or_error.getError());
1188  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1189 
1190  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1191  CHECK(!owner.takeError());
1192  auto module = owner.get().release();
1193  CHECK(module);
1194 
1195  return module;
1196 }
1197 
1198 #ifdef HAVE_CUDA
1199 llvm::Module* read_libdevice_module(llvm::LLVMContext& context) {
1200  llvm::SMDiagnostic err;
1201 
1202  const char* CUDA_DEFAULT_PATH = "/usr/local/cuda";
1203  const char* env = nullptr;
1204 
1205  if (!(env = getenv("CUDA_HOME")) && !(env = getenv("CUDA_DIR"))) {
1206  // check if the default CUDA directory exists: /usr/local/cuda
1207  if (boost::filesystem::exists(boost::filesystem::path(CUDA_DEFAULT_PATH)))
1208  env = CUDA_DEFAULT_PATH;
1209  }
1210 
1211  if (env == nullptr) {
1212  LOG(WARNING) << "Could not find CUDA installation path: environment variables "
1213  "CUDA_HOME or CUDA_DIR are not defined";
1214  return nullptr;
1215  }
1216 
1217  boost::filesystem::path cuda_path{env};
1218  cuda_path /= "nvvm";
1219  cuda_path /= "libdevice";
1220  cuda_path /= "libdevice.10.bc";
1221 
1222  if (!boost::filesystem::exists(cuda_path)) {
1223  LOG(WARNING) << "Could not find CUDA libdevice; support for some UDF "
1224  "functions might not be available.";
1225  return nullptr;
1226  }
1227 
1228  auto buffer_or_error = llvm::MemoryBuffer::getFile(cuda_path.c_str());
1229  CHECK(!buffer_or_error.getError());
1230  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1231 
1232  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1233  CHECK(!owner.takeError());
1234  auto module = owner.get().release();
1235  CHECK(module);
1236 
1237  return module;
1238 }
1239 #endif
1240 
1241 #ifdef ENABLE_GEOS
1242 llvm::Module* read_geos_module(llvm::LLVMContext& context) {
1243  llvm::SMDiagnostic err;
1244 
1245  auto buffer_or_error = llvm::MemoryBuffer::getFile(omnisci::get_root_abs_path() +
1246  "/QueryEngine/GeosRuntime.bc");
1247  CHECK(!buffer_or_error.getError());
1248  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
1249 
1250  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
1251  CHECK(!owner.takeError());
1252  auto module = owner.get().release();
1253  CHECK(module);
1254 
1255  return module;
1256 }
1257 #endif
1258 
1259 namespace {
1260 
1261 void bind_pos_placeholders(const std::string& pos_fn_name,
1262  const bool use_resume_param,
1263  llvm::Function* query_func,
1264  llvm::Module* module) {
1265  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
1266  ++it) {
1267  if (!llvm::isa<llvm::CallInst>(*it)) {
1268  continue;
1269  }
1270  auto& pos_call = llvm::cast<llvm::CallInst>(*it);
1271  if (std::string(pos_call.getCalledFunction()->getName()) == pos_fn_name) {
1272  if (use_resume_param) {
1273  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1274  llvm::ReplaceInstWithInst(
1275  &pos_call,
1276  llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl"),
1277  error_code_arg));
1278  } else {
1279  llvm::ReplaceInstWithInst(
1280  &pos_call,
1281  llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl")));
1282  }
1283  break;
1284  }
1285  }
1286 }
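
// For illustration (hypothetical caller): binding the "pos_start" placeholder with
// use_resume_param enabled rewrites a call to @pos_start() inside the query
// function into a call to the runtime implementation @pos_start_impl(i32* %error_code).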
1287 
1288 void set_row_func_argnames(llvm::Function* row_func,
1289  const size_t in_col_count,
1290  const size_t agg_col_count,
1291  const bool hoist_literals) {
1292  auto arg_it = row_func->arg_begin();
1293 
1294  if (agg_col_count) {
1295  for (size_t i = 0; i < agg_col_count; ++i) {
1296  arg_it->setName("out");
1297  ++arg_it;
1298  }
1299  } else {
1300  arg_it->setName("group_by_buff");
1301  ++arg_it;
1302  arg_it->setName("crt_matched");
1303  ++arg_it;
1304  arg_it->setName("total_matched");
1305  ++arg_it;
1306  arg_it->setName("old_total_matched");
1307  ++arg_it;
1308  arg_it->setName("max_matched");
1309  ++arg_it;
1310  }
1311 
1312  arg_it->setName("agg_init_val");
1313  ++arg_it;
1314 
1315  arg_it->setName("pos");
1316  ++arg_it;
1317 
1318  arg_it->setName("frag_row_off");
1319  ++arg_it;
1320 
1321  arg_it->setName("num_rows_per_scan");
1322  ++arg_it;
1323 
1324  if (hoist_literals) {
1325  arg_it->setName("literals");
1326  ++arg_it;
1327  }
1328 
1329  for (size_t i = 0; i < in_col_count; ++i) {
1330  arg_it->setName("col_buf" + std::to_string(i));
1331  ++arg_it;
1332  }
1333 
1334  arg_it->setName("join_hash_tables");
1335 }
1336 
1337 llvm::Function* create_row_function(const size_t in_col_count,
1338  const size_t agg_col_count,
1339  const bool hoist_literals,
1340  llvm::Module* module,
1341  llvm::LLVMContext& context) {
1342  std::vector<llvm::Type*> row_process_arg_types;
1343 
1344  if (agg_col_count) {
1345  // output (aggregate) arguments
1346  for (size_t i = 0; i < agg_col_count; ++i) {
1347  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1348  }
1349  } else {
1350  // group by buffer
1351  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1352  // current match count
1353  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1354  // total match count passed from the caller
1355  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1356  // old total match count returned to the caller
1357  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1358  // max matched (total number of slots in the output buffer)
1359  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1360  }
1361 
1362  // aggregate init values
1363  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1364 
1365  // position argument
1366  row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));
1367 
1368  // fragment row offset argument
1369  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1370 
1371  // number of rows for each scan
1372  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1373 
1374  // literals buffer argument
1375  if (hoist_literals) {
1376  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
1377  }
1378 
1379  // column buffer arguments
1380  for (size_t i = 0; i < in_col_count; ++i) {
1381  row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
1382  }
1383 
1384  // join hash table argument
1385  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1386 
1387  // generate the function
1388  auto ft =
1389  llvm::FunctionType::get(get_int_type(32, context), row_process_arg_types, false);
1390 
1391  auto row_func =
1392  llvm::Function::Create(ft, llvm::Function::ExternalLinkage, "row_func", module);
1393 
1394  // set the row function argument names; for debugging purposes only
1395  set_row_func_argnames(row_func, in_col_count, agg_col_count, hoist_literals);
1396 
1397  return row_func;
1398 }
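
// For illustration, with in_col_count = 2, agg_col_count = 1 and hoisted literals,
// the generated row function (after argument naming) is equivalent to:
//   i32 @row_func(i64* %out, i64* %agg_init_val, i64 %pos, i64* %frag_row_off,
//                 i64* %num_rows_per_scan, i8* %literals, i8* %col_buf0,
//                 i8* %col_buf1, i64* %join_hash_tables)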
1399 
1400 void bind_query(llvm::Function* query_func,
1401  const std::string& query_fname,
1402  llvm::Function* multifrag_query_func,
1403  llvm::Module* module) {
1404  std::vector<llvm::CallInst*> query_stubs;
1405  for (auto it = llvm::inst_begin(multifrag_query_func),
1406  e = llvm::inst_end(multifrag_query_func);
1407  it != e;
1408  ++it) {
1409  if (!llvm::isa<llvm::CallInst>(*it)) {
1410  continue;
1411  }
1412  auto& query_call = llvm::cast<llvm::CallInst>(*it);
1413  if (std::string(query_call.getCalledFunction()->getName()) == query_fname) {
1414  query_stubs.push_back(&query_call);
1415  }
1416  }
1417  for (auto& S : query_stubs) {
1418  std::vector<llvm::Value*> args;
1419  for (size_t i = 0; i < S->getNumArgOperands(); ++i) {
1420  args.push_back(S->getArgOperand(i));
1421  }
1422  llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args, ""));
1423  }
1424 }
1425 
1426 std::vector<std::string> get_agg_fnames(const std::vector<Analyzer::Expr*>& target_exprs,
1427  const bool is_group_by) {
1428  std::vector<std::string> result;
1429  for (size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
1430  ++target_idx, ++agg_col_idx) {
1431  const auto target_expr = target_exprs[target_idx];
1432  CHECK(target_expr);
1433  const auto target_type_info = target_expr->get_type_info();
1434  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
1435  const bool is_varlen =
1436  (target_type_info.is_string() &&
1437  target_type_info.get_compression() == kENCODING_NONE) ||
1438  target_type_info.is_array(); // TODO: should it use is_varlen_array() ?
1439  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
1440  result.emplace_back(target_type_info.is_fp() ? "agg_id_double" : "agg_id");
1441  if (is_varlen) {
1442  result.emplace_back("agg_id");
1443  }
1444  if (target_type_info.is_geometry()) {
1445  result.emplace_back("agg_id");
1446  for (auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
1447  result.emplace_back("agg_id");
1448  }
1449  }
1450  continue;
1451  }
1452  const auto agg_type = agg_expr->get_aggtype();
1453  const auto& agg_type_info =
1454  agg_type != kCOUNT ? agg_expr->get_arg()->get_type_info() : target_type_info;
1455  switch (agg_type) {
1456  case kAVG: {
1457  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1458  !agg_type_info.is_fp()) {
1459  throw std::runtime_error("AVG is only valid on integer and floating point");
1460  }
1461  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1462  ? "agg_sum"
1463  : "agg_sum_double");
1464  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1465  ? "agg_count"
1466  : "agg_count_double");
1467  break;
1468  }
1469  case kMIN: {
1470  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1471  agg_type_info.is_geometry()) {
1472  throw std::runtime_error(
1473  "MIN on strings, arrays or geospatial types not supported yet");
1474  }
1475  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1476  ? "agg_min"
1477  : "agg_min_double");
1478  break;
1479  }
1480  case kMAX: {
1481  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1482  agg_type_info.is_geometry()) {
1483  throw std::runtime_error(
1484  "MAX on strings, arrays or geospatial types not supported yet");
1485  }
1486  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1487  ? "agg_max"
1488  : "agg_max_double");
1489  break;
1490  }
1491  case kSUM: {
1492  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1493  !agg_type_info.is_fp()) {
1494  throw std::runtime_error("SUM is only valid on integer and floating point");
1495  }
1496  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1497  ? "agg_sum"
1498  : "agg_sum_double");
1499  break;
1500  }
1501  case kCOUNT:
1502  result.emplace_back(agg_expr->get_is_distinct() ? "agg_count_distinct"
1503  : "agg_count");
1504  break;
1505  case kSINGLE_VALUE: {
1506  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1507  break;
1508  }
1509  case kSAMPLE: {
1510  // Note that varlen SAMPLE arguments are handled separately above
1511  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1512  break;
1513  }
1514  case kAPPROX_COUNT_DISTINCT:
1515  result.emplace_back("agg_approximate_count_distinct");
1516  break;
1517  default:
1518  CHECK(false);
1519  }
1520  }
1521  return result;
1522 }
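
As a rough illustration of the naming scheme above (a standalone sketch using simplified types, not the Analyzer API): AVG expands to a sum/count pair, and floating-point arguments select the _double variants.

#include <string>
#include <vector>

enum class AggKind { kAvg, kMin, kMax, kSum, kCount };

// Mirrors the mapping performed by get_agg_fnames() for the simple cases.
std::vector<std::string> agg_fnames_sketch(const AggKind kind, const bool is_fp) {
  const std::string suffix = is_fp ? "_double" : "";
  switch (kind) {
    case AggKind::kAvg:
      return {"agg_sum" + suffix, "agg_count" + suffix};  // AVG = SUM plus COUNT
    case AggKind::kMin:
      return {"agg_min" + suffix};
    case AggKind::kMax:
      return {"agg_max" + suffix};
    case AggKind::kSum:
      return {"agg_sum" + suffix};
    case AggKind::kCount:
      return {"agg_count"};
  }
  return {};
}
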
1523 
1524 } // namespace
1525 
1526 std::unique_ptr<llvm::Module> g_rt_module(read_template_module(getGlobalLLVMContext()));
1527 
1528 #ifdef ENABLE_GEOS
1529 std::unique_ptr<llvm::Module> g_rt_geos_module(read_geos_module(getGlobalLLVMContext()));
1530 #endif
1531 
1532 #ifdef HAVE_CUDA
1533 std::unique_ptr<llvm::Module> g_rt_libdevice_module(
1534  read_libdevice_module(getGlobalLLVMContext()));
1535 #endif
1536 
1537 bool is_udf_module_present(bool cpu_only) {
1538  return (cpu_only || udf_gpu_module != nullptr) && (udf_cpu_module != nullptr);
1539 }
1540 
1541 bool is_rt_udf_module_present(bool cpu_only) {
1542  return (cpu_only || rt_udf_gpu_module != nullptr) && (rt_udf_cpu_module != nullptr);
1543 }
1544 
1545 void throw_parseIR_error(const llvm::SMDiagnostic& parse_error, std::string src = "") {
1546  std::string excname = "LLVM IR ParseError: ";
1547  llvm::raw_string_ostream ss(excname);
1548  parse_error.print(src.c_str(), ss, false, false);
1549  throw std::runtime_error(ss.str());
1550 }
1551 
1552 void read_udf_gpu_module(const std::string& udf_ir_filename) {
1553  llvm::SMDiagnostic parse_error;
1554 
1555  llvm::StringRef file_name_arg(udf_ir_filename);
1556 
1557  udf_gpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
1558  if (!udf_gpu_module) {
1559  throw_parseIR_error(parse_error, udf_ir_filename);
1560  }
1561 }
1562 
1563 void read_udf_cpu_module(const std::string& udf_ir_filename) {
1564  llvm::SMDiagnostic parse_error;
1565 
1566  llvm::StringRef file_name_arg(udf_ir_filename);
1567 
1568  udf_cpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
1569  if (!udf_cpu_module) {
1570  throw_parseIR_error(parse_error, udf_ir_filename);
1571  }
1572 }
1573 
1574 void read_rt_udf_gpu_module(const std::string& udf_ir_string) {
1575  llvm::SMDiagnostic parse_error;
1576 
1577  auto buf =
1578  std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for GPU");
1579 
1580  rt_udf_gpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
1581  if (!rt_udf_gpu_module) {
1582  throw_parseIR_error(parse_error);
1583  }
1584 }
1585 
1586 void read_rt_udf_cpu_module(const std::string& udf_ir_string) {
1587  llvm::SMDiagnostic parse_error;
1588 
1589  auto buf =
1590  std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for CPU");
1591 
1592  rt_udf_cpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
1593  if (!rt_udf_cpu_module) {
1594  throw_parseIR_error(parse_error);
1595  }
1596 }
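
The four readers above share one pattern: parse IR (from a file or a string) into a module owned by the global context and convert the parser diagnostic into an exception. A minimal self-contained version of the string path, assuming only standard LLVM headers, might look like:

#include <llvm/IR/LLVMContext.h>
#include <llvm/IR/Module.h>
#include <llvm/IRReader/IRReader.h>
#include <llvm/Support/MemoryBuffer.h>
#include <llvm/Support/SourceMgr.h>
#include <llvm/Support/raw_ostream.h>
#include <memory>
#include <stdexcept>
#include <string>

std::unique_ptr<llvm::Module> parse_ir_string_sketch(const std::string& ir,
                                                     llvm::LLVMContext& ctx) {
  llvm::SMDiagnostic parse_error;
  llvm::MemoryBufferRef buf(ir, "inline IR");
  auto module = llvm::parseIR(buf, parse_error, ctx);
  if (!module) {
    // Same idea as throw_parseIR_error(): turn the diagnostic into an exception.
    std::string msg{"LLVM IR ParseError: "};
    llvm::raw_string_ostream os(msg);
    parse_error.print("inline IR", os, /*ShowColors=*/false);
    throw std::runtime_error(os.str());
  }
  return module;
}
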
1597 
1598 std::unordered_set<llvm::Function*> CodeGenerator::markDeadRuntimeFuncs(
1599  llvm::Module& module,
1600  const std::vector<llvm::Function*>& roots,
1601  const std::vector<llvm::Function*>& leaves) {
1602  std::unordered_set<llvm::Function*> live_funcs;
1603  live_funcs.insert(roots.begin(), roots.end());
1604  live_funcs.insert(leaves.begin(), leaves.end());
1605 
1606  if (auto F = module.getFunction("init_shared_mem_nop")) {
1607  live_funcs.insert(F);
1608  }
1609  if (auto F = module.getFunction("write_back_nop")) {
1610  live_funcs.insert(F);
1611  }
1612 
1613  for (const llvm::Function* F : roots) {
1614  for (const llvm::BasicBlock& BB : *F) {
1615  for (const llvm::Instruction& I : BB) {
1616  if (const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
1617  live_funcs.insert(CI->getCalledFunction());
1618  }
1619  }
1620  }
1621  }
1622 
1623  for (llvm::Function& F : module) {
1624  if (!live_funcs.count(&F) && !F.isDeclaration()) {
1625  F.setLinkage(llvm::GlobalValue::InternalLinkage);
1626  }
1627  }
1628 
1629  return live_funcs;
1630 }
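
Demoting non-live runtime functions to internal linkage is what later lets LLVM's global dead-code elimination drop them; a minimal sketch of that follow-up step (using the legacy pass manager, as the rest of this file does):

#include <llvm/IR/LegacyPassManager.h>
#include <llvm/IR/Module.h>
#include <llvm/Transforms/IPO.h>

// After markDeadRuntimeFuncs() has internalized everything outside live_funcs,
// GlobalDCE is free to delete the now-unreferenced internal functions.
void drop_dead_runtime_funcs_sketch(llvm::Module& module) {
  llvm::legacy::PassManager pass_manager;
  pass_manager.add(llvm::createGlobalDCEPass());
  pass_manager.run(module);
}
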
1631 
1632 namespace {
1633 // searches for a particular variable within a specific basic block (or all if bb_name is
1634 // empty)
1635 template <typename InstType>
1636 llvm::Value* find_variable_in_basic_block(llvm::Function* func,
1637  std::string bb_name,
1638  std::string variable_name) {
1639  llvm::Value* result = nullptr;
1640  if (func == nullptr || variable_name.empty()) {
1641  return result;
1642  }
1643  bool is_found = false;
1644  for (auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
1645  if (!bb_name.empty() && bb_it->getName() != bb_name) {
1646  continue;
1647  }
1648  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
1649  if (llvm::isa<InstType>(*inst_it)) {
1650  if (inst_it->getName() == variable_name) {
1651  result = &*inst_it;
1652  is_found = true;
1653  break;
1654  }
1655  }
1656  }
1657  }
1658  return result;
1659 }
1660 }; // namespace
1661 
1662 void Executor::createErrorCheckControlFlow(llvm::Function* query_func,
1663  bool run_with_dynamic_watchdog,
1664  bool run_with_allowing_runtime_interrupt,
1665  ExecutorDeviceType device_type) {
1666  AUTOMATIC_IR_METADATA(cgen_state_.get());
1667 
1668  // check whether the row processing was successful; currently, it can
1669  // fail by running out of group by buffer slots
1670 
1671  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
1672  // when both the dynamic watchdog and the runtime interrupt are enabled,
1673  // we use the dynamic watchdog
1674  run_with_allowing_runtime_interrupt = false;
1675  }
1676 
1677  llvm::Value* row_count = nullptr;
1678  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1679  device_type == ExecutorDeviceType::GPU) {
1680  row_count =
1681  find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
1682  }
1683 
1684  bool done_splitting = false;
1685  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1686  ++bb_it) {
1687  llvm::Value* pos = nullptr;
1688  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1689  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1690  llvm::isa<llvm::PHINode>(*inst_it)) {
1691  if (inst_it->getName() == "pos") {
1692  pos = &*inst_it;
1693  }
1694  continue;
1695  }
1696  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1697  continue;
1698  }
1699  auto& row_func_call = llvm::cast<llvm::CallInst>(*inst_it);
1700  if (std::string(row_func_call.getCalledFunction()->getName()) == "row_process") {
1701  auto next_inst_it = inst_it;
1702  ++next_inst_it;
1703  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1704  auto& br_instr = bb_it->back();
1705  llvm::IRBuilder<> ir_builder(&br_instr);
1706  llvm::Value* err_lv = &*inst_it;
1707  if (run_with_dynamic_watchdog) {
1708  CHECK(pos);
1709  llvm::Value* call_watchdog_lv = nullptr;
1710  if (device_type == ExecutorDeviceType::GPU) {
1711  // In order to make sure all threads within a block see the same barrier,
1712  // only those blocks in which no thread has crossed the critical edge
1713  // go through the dynamic watchdog computation
1714  CHECK(row_count);
1715  auto crit_edge_rem =
1716  (blockSize() & (blockSize() - 1))
1717  ? ir_builder.CreateSRem(
1718  row_count,
1719  cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1720  : ir_builder.CreateAnd(
1721  row_count,
1722  cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1723  auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1724  crit_edge_threshold->setName("crit_edge_threshold");
1725 
1726  // only those threads where pos < crit_edge_threshold go through dynamic
1727  // watchdog call
1728  call_watchdog_lv =
1729  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
1730  } else {
1731  // CPU path: run watchdog for every 64th row
1732  auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1733  call_watchdog_lv = ir_builder.CreateICmp(
1734  llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1735  }
1736  CHECK(call_watchdog_lv);
1737  auto error_check_bb = bb_it->splitBasicBlock(
1738  llvm::BasicBlock::iterator(br_instr), ".error_check");
1739  auto& watchdog_br_instr = bb_it->back();
1740 
1741  auto watchdog_check_bb = llvm::BasicBlock::Create(
1742  cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
1743  llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1744  auto detected_timeout = watchdog_ir_builder.CreateCall(
1745  cgen_state_->module_->getFunction("dynamic_watchdog"), {});
1746  auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1747  detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
1748  watchdog_ir_builder.CreateBr(error_check_bb);
1749 
1750  llvm::ReplaceInstWithInst(
1751  &watchdog_br_instr,
1752  llvm::BranchInst::Create(
1753  watchdog_check_bb, error_check_bb, call_watchdog_lv));
1754  ir_builder.SetInsertPoint(&br_instr);
1755  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1756 
1757  unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1758  unified_err_lv->addIncoming(err_lv, &*bb_it);
1759  err_lv = unified_err_lv;
1760  } else if (run_with_allowing_runtime_interrupt) {
1761  CHECK(pos);
1762  llvm::Value* call_check_interrupt_lv = nullptr;
1763  if (device_type == ExecutorDeviceType::GPU) {
1764  // approximate how many times the %pos variable
1765  // is incremented, i.e., the number of iterations
1766  int32_t num_shift_by_gridDim = getExpOfTwo(gridSize());
1767  int32_t num_shift_by_blockDim = getExpOfTwo(blockSize());
1768  if (!isPowOfTwo(gridSize())) {
1769  num_shift_by_gridDim++;
1770  }
1771  if (!isPowOfTwo(blockSize())) {
1772  num_shift_by_blockDim++;
1773  }
1774  int total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
1775  // check the interrupt flag for every 64th iteration
1776  llvm::Value* pos_shifted_per_iteration =
1777  ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
1778  auto interrupt_predicate =
1779  ir_builder.CreateAnd(pos_shifted_per_iteration, uint64_t(0x3f));
1780  call_check_interrupt_lv =
1781  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1782  interrupt_predicate,
1783  cgen_state_->llInt(int64_t(0LL)));
1784  } else {
1785  // CPU path: run interrupt checker for every 64th row
1786  auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1787  call_check_interrupt_lv =
1788  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1789  interrupt_predicate,
1790  cgen_state_->llInt(int64_t(0LL)));
1791  }
1792  CHECK(call_check_interrupt_lv);
1793  auto error_check_bb = bb_it->splitBasicBlock(
1794  llvm::BasicBlock::iterator(br_instr), ".error_check");
1795  auto& check_interrupt_br_instr = bb_it->back();
1796 
1797  auto interrupt_check_bb = llvm::BasicBlock::Create(
1798  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
1799  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
1800  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
1801  cgen_state_->module_->getFunction("check_interrupt"), {});
1802  auto interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
1803  detected_interrupt, cgen_state_->llInt(Executor::ERR_INTERRUPTED), err_lv);
1804  interrupt_checker_ir_builder.CreateBr(error_check_bb);
1805 
1806  llvm::ReplaceInstWithInst(
1807  &check_interrupt_br_instr,
1808  llvm::BranchInst::Create(
1809  interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
1810  ir_builder.SetInsertPoint(&br_instr);
1811  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1812 
1813  unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
1814  unified_err_lv->addIncoming(err_lv, &*bb_it);
1815  err_lv = unified_err_lv;
1816  }
1817  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1818  err_lv =
1819  ir_builder.CreateCall(cgen_state_->module_->getFunction("record_error_code"),
1820  std::vector<llvm::Value*>{err_lv, error_code_arg});
1821  if (device_type == ExecutorDeviceType::GPU) {
1822  // let kernel execution finish as expected, regardless of the observed error,
1823  // unless it is from the dynamic watchdog where all threads within that block
1824  // return together.
1825  if (run_with_allowing_runtime_interrupt) {
1826  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1827  err_lv,
1828  cgen_state_->llInt(Executor::ERR_INTERRUPTED));
1829  } else {
1830  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1831  err_lv,
1832  cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
1833  }
1834 
1835  } else {
1836  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
1837  err_lv,
1838  cgen_state_->llInt(static_cast<int32_t>(0)));
1839  }
1840  auto error_bb = llvm::BasicBlock::Create(
1841  cgen_state_->context_, ".error_exit", query_func, new_bb);
1842  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
1843  llvm::ReplaceInstWithInst(&br_instr,
1844  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
1845  done_splitting = true;
1846  break;
1847  }
1848  }
1849  }
1850  CHECK(done_splitting);
1851 }
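
Stripped of the IR plumbing, the control flow built above amounts to probing the watchdog (or interrupt flag) on every 64th row and branching to an error exit when an error code is observed. A plain C++ analogue of the CPU path (the stubs and the error value are illustrative, not the real runtime symbols):

#include <cstdint>

bool dynamic_watchdog_stub();           // stand-in for the runtime's dynamic_watchdog()
int32_t row_process_stub(int64_t pos);  // stand-in for the generated row function

int32_t query_loop_sketch(const int64_t row_count) {
  constexpr int32_t kErrOutOfTime = 3;  // placeholder, not Executor::ERR_OUT_OF_TIME's value
  for (int64_t pos = 0; pos < row_count; ++pos) {
    int32_t err = row_process_stub(pos);
    // Probe the watchdog only on every 64th row, mirroring (pos & 0x3f) == 0.
    if ((pos & 0x3f) == 0 && dynamic_watchdog_stub()) {
      err = kErrOutOfTime;
    }
    if (err != 0) {
      return err;                       // corresponds to the .error_exit branch
    }
  }
  return 0;
}
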
1852 
1853 std::vector<llvm::Value*> Executor::inlineHoistedLiterals() {
1854  AUTOMATIC_IR_METADATA(cgen_state_.get());
1855 
1856  std::vector<llvm::Value*> hoisted_literals;
1857 
1858  // row_func_ uses literals whose definitions have been hoisted up to query_func_;
1859  // extend the row_func_ signature with extra arguments to pass these literal values.
1860  std::vector<llvm::Type*> row_process_arg_types;
1861 
1862  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1863  E = cgen_state_->row_func_->arg_end();
1864  I != E;
1865  ++I) {
1866  row_process_arg_types.push_back(I->getType());
1867  }
1868 
1869  for (auto& element : cgen_state_->query_func_literal_loads_) {
1870  for (auto value : element.second) {
1871  row_process_arg_types.push_back(value->getType());
1872  }
1873  }
1874 
1875  auto ft = llvm::FunctionType::get(
1876  get_int_type(32, cgen_state_->context_), row_process_arg_types, false);
1877  auto row_func_with_hoisted_literals =
1878  llvm::Function::Create(ft,
1879  llvm::Function::ExternalLinkage,
1880  "row_func_hoisted_literals",
1881  cgen_state_->row_func_->getParent());
1882 
1883  auto row_func_arg_it = row_func_with_hoisted_literals->arg_begin();
1884  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1885  E = cgen_state_->row_func_->arg_end();
1886  I != E;
1887  ++I) {
1888  if (I->hasName()) {
1889  row_func_arg_it->setName(I->getName());
1890  }
1891  ++row_func_arg_it;
1892  }
1893 
1894  decltype(row_func_with_hoisted_literals) filter_func_with_hoisted_literals{nullptr};
1895  decltype(row_func_arg_it) filter_func_arg_it{nullptr};
1896  if (cgen_state_->filter_func_) {
1897  // filter_func_ uses literals whose definitions have been hoisted up to row_func_;
1898  // extend the filter_func_ signature with extra arguments to pass these literal values.
1899  std::vector<llvm::Type*> filter_func_arg_types;
1900 
1901  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
1902  E = cgen_state_->filter_func_->arg_end();
1903  I != E;
1904  ++I) {
1905  filter_func_arg_types.push_back(I->getType());
1906  }
1907 
1908  for (auto& element : cgen_state_->query_func_literal_loads_) {
1909  for (auto value : element.second) {
1910  filter_func_arg_types.push_back(value->getType());
1911  }
1912  }
1913 
1914  auto ft2 = llvm::FunctionType::get(
1915  get_int_type(32, cgen_state_->context_), filter_func_arg_types, false);
1916  filter_func_with_hoisted_literals =
1917  llvm::Function::Create(ft2,
1918  llvm::Function::ExternalLinkage,
1919  "filter_func_hoisted_literals",
1920  cgen_state_->filter_func_->getParent());
1921 
1922  filter_func_arg_it = filter_func_with_hoisted_literals->arg_begin();
1923  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
1924  E = cgen_state_->filter_func_->arg_end();
1925  I != E;
1926  ++I) {
1927  if (I->hasName()) {
1928  filter_func_arg_it->setName(I->getName());
1929  }
1930  ++filter_func_arg_it;
1931  }
1932  }
1933 
1934  std::unordered_map<int, std::vector<llvm::Value*>>
1935  query_func_literal_loads_function_arguments,
1936  query_func_literal_loads_function_arguments2;
1937 
1938  for (auto& element : cgen_state_->query_func_literal_loads_) {
1939  std::vector<llvm::Value*> argument_values, argument_values2;
1940 
1941  for (auto value : element.second) {
1942  hoisted_literals.push_back(value);
1943  argument_values.push_back(&*row_func_arg_it);
1944  if (cgen_state_->filter_func_) {
1945  argument_values2.push_back(&*filter_func_arg_it);
1946  cgen_state_->filter_func_args_[&*row_func_arg_it] = &*filter_func_arg_it;
1947  }
1948  if (value->hasName()) {
1949  row_func_arg_it->setName("arg_" + value->getName());
1950  if (cgen_state_->filter_func_) {
1951  filter_func_arg_it->getContext();
1952  filter_func_arg_it->setName("arg_" + value->getName());
1953  }
1954  }
1955  ++row_func_arg_it;
1956  ++filter_func_arg_it;
1957  }
1958 
1959  query_func_literal_loads_function_arguments[element.first] = argument_values;
1960  query_func_literal_loads_function_arguments2[element.first] = argument_values2;
1961  }
1962 
1963  // copy the row_func function body over
1964  // see
1965  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
1966  row_func_with_hoisted_literals->getBasicBlockList().splice(
1967  row_func_with_hoisted_literals->begin(),
1968  cgen_state_->row_func_->getBasicBlockList());
1969 
1970  // also replace row_func arguments with the arguments from row_func_hoisted_literals
1971  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1972  E = cgen_state_->row_func_->arg_end(),
1973  I2 = row_func_with_hoisted_literals->arg_begin();
1974  I != E;
1975  ++I) {
1976  I->replaceAllUsesWith(&*I2);
1977  I2->takeName(&*I);
1978  cgen_state_->filter_func_args_.replace(&*I, &*I2);
1979  ++I2;
1980  }
1981 
1982  cgen_state_->row_func_ = row_func_with_hoisted_literals;
1983 
1984  // and finally replace literal placeholders
1985  std::vector<llvm::Instruction*> placeholders;
1986  std::string prefix("__placeholder__literal_");
1987  for (auto it = llvm::inst_begin(row_func_with_hoisted_literals),
1988  e = llvm::inst_end(row_func_with_hoisted_literals);
1989  it != e;
1990  ++it) {
1991  if (it->hasName() && it->getName().startswith(prefix)) {
1992  auto offset_and_index_entry =
1993  cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
1994  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
1995 
1996  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
1997  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
1998 
1999  it->replaceAllUsesWith(
2000  query_func_literal_loads_function_arguments[lit_off][lit_idx]);
2001  placeholders.push_back(&*it);
2002  }
2003  }
2004  for (auto placeholder : placeholders) {
2005  placeholder->removeFromParent();
2006  }
2007 
2008  if (cgen_state_->filter_func_) {
2009  // copy the filter_func function body over
2010  // see
2011  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
2012  filter_func_with_hoisted_literals->getBasicBlockList().splice(
2013  filter_func_with_hoisted_literals->begin(),
2014  cgen_state_->filter_func_->getBasicBlockList());
2015 
2016  // also replace filter_func arguments with the arguments from
2017  // filter_func_hoisted_literals
2018  for (llvm::Function::arg_iterator I = cgen_state_->filter_func_->arg_begin(),
2019  E = cgen_state_->filter_func_->arg_end(),
2020  I2 = filter_func_with_hoisted_literals->arg_begin();
2021  I != E;
2022  ++I) {
2023  I->replaceAllUsesWith(&*I2);
2024  I2->takeName(&*I);
2025  ++I2;
2026  }
2027 
2028  cgen_state_->filter_func_ = filter_func_with_hoisted_literals;
2029 
2030  // and finally replace literal placeholders
2031  std::vector<llvm::Instruction*> placeholders;
2032  std::string prefix("__placeholder__literal_");
2033  for (auto it = llvm::inst_begin(filter_func_with_hoisted_literals),
2034  e = llvm::inst_end(filter_func_with_hoisted_literals);
2035  it != e;
2036  ++it) {
2037  if (it->hasName() && it->getName().startswith(prefix)) {
2038  auto offset_and_index_entry = cgen_state_->row_func_hoisted_literals_.find(
2039  llvm::dyn_cast<llvm::Value>(&*it));
2040  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
2041 
2042  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
2043  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
2044 
2045  it->replaceAllUsesWith(
2046  query_func_literal_loads_function_arguments2[lit_off][lit_idx]);
2047  placeholders.push_back(&*it);
2048  }
2049  }
2050  for (auto placeholder : placeholders) {
2051  placeholder->removeFromParent();
2052  }
2053  }
2054 
2055  return hoisted_literals;
2056 }
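
The "move the body instead of cloning it" trick referenced above reduces to splicing the basic block list into the new function and retargeting argument uses; a minimal sketch, assuming new_func was created with the same leading parameters as old_func:

#include <llvm/IR/Function.h>

// Transfer old_func's body into new_func without cloning instructions, then
// rewire every use of the old arguments to the corresponding new arguments.
void move_function_body_sketch(llvm::Function* old_func, llvm::Function* new_func) {
  new_func->getBasicBlockList().splice(new_func->begin(),
                                       old_func->getBasicBlockList());
  auto new_arg_it = new_func->arg_begin();
  for (auto& old_arg : old_func->args()) {
    old_arg.replaceAllUsesWith(&*new_arg_it);
    new_arg_it->takeName(&old_arg);
    ++new_arg_it;
  }
}
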
2057 
2058 namespace {
2059 
2060 size_t get_shared_memory_size(const bool shared_mem_used,
2061  const QueryMemoryDescriptor* query_mem_desc_ptr) {
2062  return shared_mem_used
2063  ? (query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount())
2064  : 0;
2065 }
2066 
2067 bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor* query_mem_desc_ptr,
2068  const RelAlgExecutionUnit& ra_exe_unit,
2069  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
2070  const ExecutorDeviceType device_type,
2071  const unsigned gpu_blocksize,
2072  const unsigned num_blocks_per_mp) {
2073  if (device_type == ExecutorDeviceType::CPU) {
2074  return false;
2075  }
2076  if (query_mem_desc_ptr->didOutputColumnar()) {
2077  return false;
2078  }
2079  CHECK(query_mem_desc_ptr);
2080  CHECK(cuda_mgr);
2081  /*
2082  * We only use the shared memory strategy if the GPU hardware provides native
2083  * shared memory atomics support. From the CUDA Toolkit documentation:
2084  * https://docs.nvidia.com/cuda/pascal-tuning-guide/index.html#atomic-ops "Like
2085  * Maxwell, Pascal [and Volta] provides native shared memory atomic operations
2086  * for 32-bit integer arithmetic, along with native 32 or 64-bit compare-and-swap
2087  * (CAS)."
2088  *
2089  **/
2090  if (!cuda_mgr->isArchMaxwellOrLaterForAll()) {
2091  return false;
2092  }
2093 
2094  if (query_mem_desc_ptr->getQueryDescriptionType() ==
2095  QueryDescriptionType::NonGroupedAggregate &&
2096  g_enable_smem_non_grouped_agg &&
2097  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty()) {
2098  // TODO: relax this, if necessary
2099  if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
2100  return false;
2101  }
2102  // skip shared memory usage when dealing with 1) variable length targets, or
2103  // 2) aggregates other than COUNT
2104  const auto target_infos =
2105  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
2106  std::unordered_set<SQLAgg> supported_aggs{kCOUNT};
2107  if (std::find_if(target_infos.begin(),
2108  target_infos.end(),
2109  [&supported_aggs](const TargetInfo& ti) {
2110  if (ti.sql_type.is_varlen() ||
2111  !supported_aggs.count(ti.agg_kind)) {
2112  return true;
2113  } else {
2114  return false;
2115  }
2116  }) == target_infos.end()) {
2117  return true;
2118  }
2119  }
2120  if (query_mem_desc_ptr->getQueryDescriptionType() ==
2131  if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
2132  return false;
2133  }
2134 
2135  // Fundamentally, we should use shared memory whenever the output buffer
2136  // is small enough so that we can fit it in the shared memory and yet expect
2137  // good occupancy.
2138  // For now, we allow keyless, row-wise layout, and only for perfect hash
2139  // group by operations.
2140  if (query_mem_desc_ptr->hasKeylessHash() &&
2141  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty() &&
2142  !query_mem_desc_ptr->useStreamingTopN()) {
2143  const size_t shared_memory_threshold_bytes = std::min(
2144  g_gpu_smem_threshold == 0 ? SIZE_MAX : g_gpu_smem_threshold,
2145  cuda_mgr->getMinSharedMemoryPerBlockForAllDevices() / num_blocks_per_mp);
2146  const auto output_buffer_size =
2147  query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount();
2148  if (output_buffer_size > shared_memory_threshold_bytes) {
2149  return false;
2150  }
2151 
2152  // skip shared memory usage when dealing with 1) variable length targets, or
2153  // 2) aggregates other than the basic ones (COUNT, SUM, MIN, MAX, AVG)
2154  // TODO: relax this if necessary
2155  const auto target_infos =
2156  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
2157  std::unordered_set<SQLAgg> supported_aggs{kCOUNT};
2158  if (g_enable_smem_grouped_non_count_agg) {
2159  supported_aggs = {kCOUNT, kMIN, kMAX, kSUM, kAVG};
2160  }
2161  if (std::find_if(target_infos.begin(),
2162  target_infos.end(),
2163  [&supported_aggs](const TargetInfo& ti) {
2164  if (ti.sql_type.is_varlen() ||
2165  !supported_aggs.count(ti.agg_kind)) {
2166  return true;
2167  } else {
2168  return false;
2169  }
2170  }) == target_infos.end()) {
2171  return true;
2172  }
2173  }
2174  }
2175  return false;
2176 }
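
The perfect-hash size check above is simple arithmetic: the whole row-wise output buffer must fit under the smaller of the configured threshold and the per-block shared memory budget spread over the resident blocks. Restated on its own (inputs are illustrative; the real values come from QueryMemoryDescriptor and CudaMgr):

#include <algorithm>
#include <cstddef>
#include <cstdint>

bool output_fits_in_shared_memory_sketch(const size_t row_size,
                                         const size_t entry_count,
                                         const size_t min_smem_per_block,
                                         const unsigned num_blocks_per_mp,
                                         const size_t configured_threshold) {
  // A configured threshold of 0 means "no explicit limit", as in the code above.
  const size_t threshold_bytes =
      std::min(configured_threshold == 0 ? SIZE_MAX : configured_threshold,
               min_smem_per_block / num_blocks_per_mp);
  return row_size * entry_count <= threshold_bytes;
}
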
2177 
2178 #ifndef NDEBUG
2179 std::string serialize_llvm_metadata_footnotes(llvm::Function* query_func,
2180  CgenState* cgen_state) {
2181  std::string llvm_ir;
2182  std::unordered_set<llvm::MDNode*> md;
2183 
2184  // Loop over all instructions in the query function.
2185  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
2186  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2187  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2188  instr_it->getAllMetadata(imd);
2189  for (auto [kind, node] : imd) {
2190  md.insert(node);
2191  }
2192  }
2193  }
2194 
2195  // Loop over all instructions in the row function.
2196  for (auto bb_it = cgen_state->row_func_->begin(); bb_it != cgen_state->row_func_->end();
2197  ++bb_it) {
2198  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2199  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2200  instr_it->getAllMetadata(imd);
2201  for (auto [kind, node] : imd) {
2202  md.insert(node);
2203  }
2204  }
2205  }
2206 
2207  // Loop over all instructions in the filter function.
2208  if (cgen_state->filter_func_) {
2209  for (auto bb_it = cgen_state->filter_func_->begin();
2210  bb_it != cgen_state->filter_func_->end();
2211  ++bb_it) {
2212  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2213  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2214  instr_it->getAllMetadata(imd);
2215  for (auto [kind, node] : imd) {
2216  md.insert(node);
2217  }
2218  }
2219  }
2220  }
2221 
2222  // Sort the metadata by canonical number and convert to text.
2223  if (!md.empty()) {
2224  std::map<size_t, std::string> sorted_strings;
2225  for (auto p : md) {
2226  std::string str;
2227  llvm::raw_string_ostream os(str);
2228  p->print(os, cgen_state->module_, true);
2229  os.flush();
2230  auto fields = split(str, {}, 1);
2231  if (fields.empty() || fields[0].empty()) {
2232  continue;
2233  }
2234  sorted_strings.emplace(std::stoul(fields[0].substr(1)), str);
2235  }
2236  llvm_ir += "\n";
2237  for (auto [id, text] : sorted_strings) {
2238  llvm_ir += text;
2239  llvm_ir += "\n";
2240  }
2241  }
2242 
2243  return llvm_ir;
2244 }
2245 #endif // NDEBUG
2246 
2247 } // namespace
2248 
2249 std::tuple<CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
2250 Executor::compileWorkUnit(const std::vector<InputTableInfo>& query_infos,
2251  const PlanState::DeletedColumnsMap& deleted_cols_map,
2252  const RelAlgExecutionUnit& ra_exe_unit,
2253  const CompilationOptions& co,
2254  const ExecutionOptions& eo,
2255  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
2256  const bool allow_lazy_fetch,
2257  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
2258  const size_t max_groups_buffer_entry_guess,
2259  const int8_t crt_min_byte_width,
2260  const bool has_cardinality_estimation,
2261  ColumnCacheMap& column_cache,
2262  RenderInfo* render_info) {
2263  auto timer = DEBUG_TIMER(__func__);
2264 
2265 #ifndef NDEBUG
2266  static std::uint64_t counter = 0;
2267  ++counter;
2268  VLOG(1) << "CODEGEN #" << counter << ":";
2269  LOG(IR) << "CODEGEN #" << counter << ":";
2270  LOG(PTX) << "CODEGEN #" << counter << ":";
2271  LOG(ASM) << "CODEGEN #" << counter << ":";
2272 #endif
2273 
2274  nukeOldState(allow_lazy_fetch, query_infos, deleted_cols_map, &ra_exe_unit);
2275 
2276  GroupByAndAggregate group_by_and_aggregate(
2277  this,
2278  co.device_type,
2279  ra_exe_unit,
2280  query_infos,
2281  row_set_mem_owner,
2282  has_cardinality_estimation ? std::optional<int64_t>(max_groups_buffer_entry_guess)
2283  : std::nullopt);
2284  auto query_mem_desc =
2285  group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
2286  max_groups_buffer_entry_guess,
2287  crt_min_byte_width,
2288  render_info,
2289  eo.output_columnar_hint);
2290 
2291  if (query_mem_desc->getQueryDescriptionType() ==
2292  QueryDescriptionType::GroupByBaselineHash &&
2293  !has_cardinality_estimation &&
2294  (!render_info || !render_info->isPotentialInSituRender()) && !eo.just_explain) {
2295  const auto col_range_info = group_by_and_aggregate.getColRangeInfo();
2296  throw CardinalityEstimationRequired(col_range_info.max - col_range_info.min);
2297  }
2298 
2299  const bool output_columnar = query_mem_desc->didOutputColumnar();
2300  const bool gpu_shared_mem_optimization =
2301  is_gpu_shared_mem_supported(query_mem_desc.get(),
2302  ra_exe_unit,
2303  cuda_mgr,
2304  co.device_type,
2305  cuda_mgr ? this->blockSize() : 1,
2306  cuda_mgr ? this->numBlocksPerMP() : 1);
2307  if (gpu_shared_mem_optimization) {
2308  // disable interleaved bins optimization on the GPU
2309  query_mem_desc->setHasInterleavedBinsOnGpu(false);
2310  LOG(DEBUG1) << "GPU shared memory is used for the " +
2311  query_mem_desc->queryDescTypeToString() + " query(" +
2312  std::to_string(get_shared_memory_size(gpu_shared_mem_optimization,
2313  query_mem_desc.get())) +
2314  " out of " + std::to_string(g_gpu_smem_threshold) + " bytes).";
2315  }
2316 
2317  const GpuSharedMemoryContext gpu_smem_context(
2318  get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));
2319 
2321  const size_t num_count_distinct_descs =
2322  query_mem_desc->getCountDistinctDescriptorsSize();
2323  for (size_t i = 0; i < num_count_distinct_descs; i++) {
2324  const auto& count_distinct_descriptor =
2325  query_mem_desc->getCountDistinctDescriptor(i);
2326  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::StdSet ||
2327  (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
2328  !co.hoist_literals)) {
2329  throw QueryMustRunOnCpu();
2330  }
2331  }
2332  }
2333 
2334  // Read the module template and target either CPU or GPU
2335  // by binding the stream position functions to the right implementation:
2336  // stride access for GPU, contiguous for CPU
2337  auto rt_module_copy = llvm::CloneModule(
2338 #if LLVM_VERSION_MAJOR >= 7
2339  *g_rt_module.get(),
2340 #else
2341  g_rt_module.get(),
2342 #endif
2343  cgen_state_->vmap_,
2344  [](const llvm::GlobalValue* gv) {
2345  auto func = llvm::dyn_cast<llvm::Function>(gv);
2346  if (!func) {
2347  return true;
2348  }
2349  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
2350  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage ||
2351  CodeGenerator::alwaysCloneRuntimeFunction(func));
2352  });
2353 
2354  if (co.device_type == ExecutorDeviceType::CPU) {
2355  if (is_udf_module_present(true)) {
2356  CodeGenerator::link_udf_module(udf_cpu_module, *rt_module_copy, cgen_state_.get());
2357  }
2358  if (is_rt_udf_module_present(true)) {
2359  CodeGenerator::link_udf_module(
2360  rt_udf_cpu_module, *rt_module_copy, cgen_state_.get());
2361  }
2362  } else {
2363  rt_module_copy->setDataLayout(get_gpu_data_layout());
2364  rt_module_copy->setTargetTriple(get_gpu_target_triple_string());
2365 
2366  if (is_udf_module_present()) {
2367  llvm::Triple gpu_triple(udf_gpu_module->getTargetTriple());
2368 
2369  if (!gpu_triple.isNVPTX()) {
2370  throw QueryMustRunOnCpu();
2371  }
2372 
2373  CodeGenerator::link_udf_module(udf_gpu_module, *rt_module_copy, cgen_state_.get());
2374  }
2375  if (is_rt_udf_module_present()) {
2376  CodeGenerator::link_udf_module(
2377  rt_udf_gpu_module, *rt_module_copy, cgen_state_.get());
2378  }
2379  }
2380 
2381  cgen_state_->module_ = rt_module_copy.release();
2382  AUTOMATIC_IR_METADATA(cgen_state_.get());
2383 
2384  auto agg_fnames =
2385  get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
2386 
2387  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();
2388 
2389  const bool is_group_by{query_mem_desc->isGroupBy()};
2390  auto [query_func, row_func_call] = is_group_by
2391  ? query_group_by_template(cgen_state_->module_,
2392  co.hoist_literals,
2393  *query_mem_desc,
2394  co.device_type,
2395  ra_exe_unit.scan_limit,
2396  gpu_smem_context)
2397  : query_template(cgen_state_->module_,
2398  agg_slot_count,
2399  co.hoist_literals,
2400  !!ra_exe_unit.estimator,
2401  gpu_smem_context);
2402  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
2403  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
2404  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);
2405 
2406  cgen_state_->query_func_ = query_func;
2407  cgen_state_->row_func_call_ = row_func_call;
2408  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
2409  &query_func->getEntryBlock().front());
2410 
2411  // Generate the function signature and column head fetches s.t.
2412  // double indirection isn't needed in the inner loop
2413  auto& fetch_bb = query_func->front();
2414  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
2415  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
2416  auto col_heads = generate_column_heads_load(ra_exe_unit.input_col_descs.size(),
2417  query_func->args().begin(),
2418  fetch_ir_builder,
2419  cgen_state_->context_);
2420  CHECK_EQ(ra_exe_unit.input_col_descs.size(), col_heads.size());
2421 
2422  cgen_state_->row_func_ = create_row_function(ra_exe_unit.input_col_descs.size(),
2423  is_group_by ? 0 : agg_slot_count,
2424  co.hoist_literals,
2425  cgen_state_->module_,
2426  cgen_state_->context_);
2427  CHECK(cgen_state_->row_func_);
2428  cgen_state_->row_func_bb_ =
2429  llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
2430 
2431  if (g_enable_filter_function) {
2432  auto filter_func_ft =
2433  llvm::FunctionType::get(get_int_type(32, cgen_state_->context_), {}, false);
2434  cgen_state_->filter_func_ = llvm::Function::Create(filter_func_ft,
2435  llvm::Function::ExternalLinkage,
2436  "filter_func",
2437  cgen_state_->module_);
2438  CHECK(cgen_state_->filter_func_);
2439  cgen_state_->filter_func_bb_ = llvm::BasicBlock::Create(
2440  cgen_state_->context_, "entry", cgen_state_->filter_func_);
2441  }
2442 
2443  cgen_state_->current_func_ = cgen_state_->row_func_;
2444  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2445 
2446  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
2447  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
2448  const auto join_loops =
2449  buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
2450 
2451  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
2452  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
2453  if (is_not_deleted_bb) {
2454  cgen_state_->row_func_bb_ = is_not_deleted_bb;
2455  }
2456  if (!join_loops.empty()) {
2457  codegenJoinLoops(join_loops,
2458  body_execution_unit,
2459  group_by_and_aggregate,
2460  query_func,
2461  cgen_state_->row_func_bb_,
2462  *(query_mem_desc.get()),
2463  co,
2464  eo);
2465  } else {
2466  const bool can_return_error = compileBody(
2467  ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
2468  if (can_return_error || cgen_state_->needs_error_check_ || eo.with_dynamic_watchdog ||
2469  eo.allow_runtime_query_interrupt) {
2470  createErrorCheckControlFlow(query_func,
2471  eo.with_dynamic_watchdog,
2472  eo.allow_runtime_query_interrupt,
2473  co.device_type);
2474  }
2475  }
2476  std::vector<llvm::Value*> hoisted_literals;
2477 
2478  if (co.hoist_literals) {
2479  VLOG(1) << "number of hoisted literals: "
2480  << cgen_state_->query_func_literal_loads_.size()
2481  << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
2482  << " bytes";
2483  }
2484 
2485  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
2486  // we have some hoisted literals...
2487  hoisted_literals = inlineHoistedLiterals();
2488  }
2489 
2490  // replace the row func placeholder call with the call to the actual row func
2491  std::vector<llvm::Value*> row_func_args;
2492  for (size_t i = 0; i < cgen_state_->row_func_call_->getNumArgOperands(); ++i) {
2493  row_func_args.push_back(cgen_state_->row_func_call_->getArgOperand(i));
2494  }
2495  row_func_args.insert(row_func_args.end(), col_heads.begin(), col_heads.end());
2496  row_func_args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
2497  // push the hoisted literal arguments, if any
2498  row_func_args.insert(
2499  row_func_args.end(), hoisted_literals.begin(), hoisted_literals.end());
2500  llvm::ReplaceInstWithInst(
2501  cgen_state_->row_func_call_,
2502  llvm::CallInst::Create(cgen_state_->row_func_, row_func_args, ""));
2503 
2504  // replace the filter func placeholder call with the call to the actual filter func
2505  if (cgen_state_->filter_func_) {
2506  std::vector<llvm::Value*> filter_func_args;
2507  for (auto arg_it = cgen_state_->filter_func_args_.begin();
2508  arg_it != cgen_state_->filter_func_args_.end();
2509  ++arg_it) {
2510  filter_func_args.push_back(arg_it->first);
2511  }
2512  llvm::ReplaceInstWithInst(
2513  cgen_state_->filter_func_call_,
2514  llvm::CallInst::Create(cgen_state_->filter_func_, filter_func_args, ""));
2515  }
2516 
2517  // Aggregate
2518  plan_state_->init_agg_vals_ =
2519  init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);
2520 
2521  /*
2522  * If we have decided to use GPU shared memory (the decision is not made here), we
2523  * generate the extra components it needs (buffer initialization and GPU reduction
2524  * from shared memory to global memory). We then inject these functions into the
2525  * already compiled query_func, replacing the two placeholders write_back_nop and
2526  * init_smem_nop. The rest of the code is unchanged (row_func, etc.).
2527  */
2528  if (gpu_smem_context.isSharedMemoryUsed()) {
2529  if (query_mem_desc->getQueryDescriptionType() ==
2530  QueryDescriptionType::GroupByPerfectHash) {
2531  GpuSharedMemCodeBuilder gpu_smem_code(
2532  cgen_state_->module_,
2533  cgen_state_->context_,
2534  *query_mem_desc,
2535  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc),
2536  plan_state_->init_agg_vals_);
2537  gpu_smem_code.codegen();
2538  gpu_smem_code.injectFunctionsInto(query_func);
2539 
2540  // helper functions are used for caching purposes later
2541  cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
2542  cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
2543  LOG(IR) << gpu_smem_code.toString();
2544  }
2545  }
2546 
2547  auto multifrag_query_func = cgen_state_->module_->getFunction(
2548  "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
2549  CHECK(multifrag_query_func);
2550 
2551  bind_query(query_func,
2552  "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
2553  multifrag_query_func,
2554  cgen_state_->module_);
2555 
2556  std::vector<llvm::Function*> root_funcs{query_func, cgen_state_->row_func_};
2557  if (cgen_state_->filter_func_) {
2558  root_funcs.push_back(cgen_state_->filter_func_);
2559  }
2560  auto live_funcs = CodeGenerator::markDeadRuntimeFuncs(
2561  *cgen_state_->module_, root_funcs, {multifrag_query_func});
2562 
2563  // Always inline the row function and the filter function.
2564  // We don't want register spills in the inner loops.
2565  // LLVM seems to correctly free up alloca instructions
2566  // in these functions even when they are inlined.
2567  mark_function_always_inline(cgen_state_->row_func_);
2568  if (cgen_state_->filter_func_) {
2569  mark_function_always_inline(cgen_state_->filter_func_);
2570  }
2571 
2572 #ifndef NDEBUG
2573  // Add helpful metadata to the LLVM IR for debugging.
2574  AUTOMATIC_IR_METADATA_DONE();
2575 #endif
2576 
2577  // Serialize the important LLVM IR functions to text for SQL EXPLAIN.
2578  std::string llvm_ir;
2579  if (eo.just_explain) {
2580  if (co.explain_type == ExecutorExplainType::Optimized) {
2581 #ifdef WITH_JIT_DEBUG
2582  throw std::runtime_error(
2583  "Explain optimized not available when JIT runtime debug symbols are enabled");
2584 #else
2585  // Note that we don't run the NVVM reflect pass here. Use LOG(IR) to get the
2586  // optimized IR after NVVM reflect
2587  llvm::legacy::PassManager pass_manager;
2588  optimize_ir(query_func, cgen_state_->module_, pass_manager, live_funcs, co);
2589 #endif // WITH_JIT_DEBUG
2590  }
2591  llvm_ir =
2592  serialize_llvm_object(query_func) +
2593  serialize_llvm_object(cgen_state_->row_func_) +
2594  (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2595  : "");
2596 
2597 #ifndef NDEBUG
2598  llvm_ir += serialize_llvm_metadata_footnotes(query_func, cgen_state_.get());
2599 #endif
2600  }
2601 
2602  LOG(IR) << "\n\n" << query_mem_desc->toString() << "\n";
2603  LOG(IR) << "IR for the "
2604  << (co.device_type == ExecutorDeviceType::CPU ? "CPU:\n" : "GPU:\n");
2605 #ifdef NDEBUG
2606  LOG(IR) << serialize_llvm_object(query_func)
2607  << serialize_llvm_object(cgen_state_->row_func_)
2608  << (cgen_state_->filter_func_ ? serialize_llvm_object(cgen_state_->filter_func_)
2609  : "")
2610  << "\nEnd of IR";
2611 #else
2612  LOG(IR) << serialize_llvm_object(cgen_state_->module_) << "\nEnd of IR";
2613 #endif
2614 
2615  // Run some basic validation checks on the LLVM IR before code is generated below.
2616  verify_function_ir(cgen_state_->row_func_);
2617  if (cgen_state_->filter_func_) {
2618  verify_function_ir(cgen_state_->filter_func_);
2619  }
2620 
2621  // Generate final native code from the LLVM IR.
2622  return std::make_tuple(
2623  CompilationResult{
2624  co.device_type == ExecutorDeviceType::CPU
2625  ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
2626  : optimizeAndCodegenGPU(query_func,
2627  multifrag_query_func,
2628  live_funcs,
2629  is_group_by || ra_exe_unit.estimator,
2630  cuda_mgr,
2631  co),
2632  cgen_state_->getLiterals(),
2633  output_columnar,
2634  llvm_ir,
2635  std::move(gpu_smem_context)},
2636  std::move(query_mem_desc));
2637 }
2638 
2639 llvm::BasicBlock* Executor::codegenSkipDeletedOuterTableRow(
2640  const RelAlgExecutionUnit& ra_exe_unit,
2641  const CompilationOptions& co) {
2642  AUTOMATIC_IR_METADATA(cgen_state_.get());
2643  if (!co.filter_on_deleted_column) {
2644  return nullptr;
2645  }
2646  CHECK(!ra_exe_unit.input_descs.empty());
2647  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
2648  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
2649  return nullptr;
2650  }
2651  const auto deleted_cd =
2652  plan_state_->getDeletedColForTable(outer_input_desc.getTableId());
2653  if (!deleted_cd) {
2654  return nullptr;
2655  }
2656  CHECK(deleted_cd->columnType.is_boolean());
2657  const auto deleted_expr =
2658  makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
2659  outer_input_desc.getTableId(),
2660  deleted_cd->columnId,
2661  outer_input_desc.getNestLevel());
2662  CodeGenerator code_generator(this);
2663  const auto is_deleted =
2664  code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
2665  const auto is_deleted_bb = llvm::BasicBlock::Create(
2666  cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
2667  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
2668  cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
2669  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
2670  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
2671  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2672  cgen_state_->ir_builder_.SetInsertPoint(bb);
2673  return bb;
2674 }
2675 
2676 bool Executor::compileBody(const RelAlgExecutionUnit& ra_exe_unit,
2677  GroupByAndAggregate& group_by_and_aggregate,
2678  const QueryMemoryDescriptor& query_mem_desc,
2679  const CompilationOptions& co,
2680  const GpuSharedMemoryContext& gpu_smem_context) {
2681  AUTOMATIC_IR_METADATA(cgen_state_.get());
2682 
2683  // Switch the code generation into a separate filter function if enabled.
2684  // Note that accesses to function arguments are still codegenned from the
2685  // row function's arguments, then later automatically forwarded and
2686  // remapped into filter function arguments by redeclareFilterFunction().
2687  cgen_state_->row_func_bb_ = cgen_state_->ir_builder_.GetInsertBlock();
2688  llvm::Value* loop_done{nullptr};
2689  std::unique_ptr<Executor::FetchCacheAnchor> fetch_cache_anchor;
2690  if (cgen_state_->filter_func_) {
2691  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2692  auto row_func_entry_bb = &cgen_state_->row_func_->getEntryBlock();
2693  cgen_state_->ir_builder_.SetInsertPoint(row_func_entry_bb,
2694  row_func_entry_bb->begin());
2695  loop_done = cgen_state_->ir_builder_.CreateAlloca(
2696  get_int_type(1, cgen_state_->context_), nullptr, "loop_done");
2697  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2698  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(true), loop_done);
2699  }
2700  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->filter_func_bb_);
2701  cgen_state_->current_func_ = cgen_state_->filter_func_;
2702  fetch_cache_anchor = std::make_unique<Executor::FetchCacheAnchor>(cgen_state_.get());
2703  }
2704 
2705  // generate the code for the filter
2706  std::vector<Analyzer::Expr*> primary_quals;
2707  std::vector<Analyzer::Expr*> deferred_quals;
2708  bool short_circuited =
2709  CodeGenerator::prioritizeQuals(ra_exe_unit, primary_quals, deferred_quals);
2710  if (short_circuited) {
2711  VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
2712  << "short-circuited and deferred " << std::to_string(deferred_quals.size())
2713  << " quals";
2714  }
2715  llvm::Value* filter_lv = cgen_state_->llBool(true);
2716  CodeGenerator code_generator(this);
2717  for (auto expr : primary_quals) {
2718  // Generate the filter for primary quals
2719  auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
2720  filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
2721  }
2722  CHECK(filter_lv->getType()->isIntegerTy(1));
2723  llvm::BasicBlock* sc_false{nullptr};
2724  if (!deferred_quals.empty()) {
2725  auto sc_true = llvm::BasicBlock::Create(
2726  cgen_state_->context_, "sc_true", cgen_state_->current_func_);
2727  sc_false = llvm::BasicBlock::Create(
2728  cgen_state_->context_, "sc_false", cgen_state_->current_func_);
2729  cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
2730  cgen_state_->ir_builder_.SetInsertPoint(sc_false);
2731  if (ra_exe_unit.join_quals.empty()) {
2732  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
2733  }
2734  cgen_state_->ir_builder_.SetInsertPoint(sc_true);
2735  filter_lv = cgen_state_->llBool(true);
2736  }
2737  for (auto expr : deferred_quals) {
2738  filter_lv = cgen_state_->ir_builder_.CreateAnd(
2739  filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
2740  }
2741 
2742  CHECK(filter_lv->getType()->isIntegerTy(1));
2743  auto ret = group_by_and_aggregate.codegen(
2744  filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);
2745 
2746  // Switch the code generation back to the row function if a filter
2747  // function was enabled.
2748  if (cgen_state_->filter_func_) {
2749  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2750  cgen_state_->ir_builder_.CreateStore(cgen_state_->llBool(false), loop_done);
2751  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2752  }
2753 
2754  redeclareFilterFunction();
2755 
2756  cgen_state_->ir_builder_.SetInsertPoint(cgen_state_->row_func_bb_);
2757  cgen_state_->current_func_ = cgen_state_->row_func_;
2758  cgen_state_->filter_func_call_ =
2759  cgen_state_->ir_builder_.CreateCall(cgen_state_->filter_func_, {});
2760 
2761  if (cgen_state_->row_func_bb_->getName() == "loop_body") {
2762  auto loop_done_true = llvm::BasicBlock::Create(
2763  cgen_state_->context_, "loop_done_true", cgen_state_->row_func_);
2764  auto loop_done_false = llvm::BasicBlock::Create(
2765  cgen_state_->context_, "loop_done_false", cgen_state_->row_func_);
2766  auto loop_done_flag = cgen_state_->ir_builder_.CreateLoad(loop_done);
2767  cgen_state_->ir_builder_.CreateCondBr(
2768  loop_done_flag, loop_done_true, loop_done_false);
2769  cgen_state_->ir_builder_.SetInsertPoint(loop_done_true);
2770  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
2771  cgen_state_->ir_builder_.SetInsertPoint(loop_done_false);
2772  } else {
2773  cgen_state_->ir_builder_.CreateRet(cgen_state_->filter_func_call_);
2774  }
2775  }
2776  return ret;
2777 }
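
The primary/deferred split produced by CodeGenerator::prioritizeQuals simply evaluates the cheap predicates first and defers the expensive ones to rows that survive them, which is what the sc_true/sc_false blocks implement. A plain C++ analogue (the cost split itself is assumed, not the actual heuristic):

#include <cstdint>
#include <functional>
#include <vector>

using Qual = std::function<bool(int64_t /*row*/)>;

bool passes_quals_sketch(const int64_t row,
                         const std::vector<Qual>& primary_quals,
                         const std::vector<Qual>& deferred_quals) {
  for (const auto& qual : primary_quals) {
    if (!qual(row)) {
      return false;  // short-circuit: deferred quals are never evaluated
    }
  }
  for (const auto& qual : deferred_quals) {
    if (!qual(row)) {
      return false;
    }
  }
  return true;
}
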
2778 
2779 std::unique_ptr<llvm::Module> runtime_module_shallow_copy(CgenState* cgen_state) {
2780  return llvm::CloneModule(
2781 #if LLVM_VERSION_MAJOR >= 7
2782  *g_rt_module.get(),
2783 #else
2784  g_rt_module.get(),
2785 #endif
2786  cgen_state->vmap_,
2787  [](const llvm::GlobalValue* gv) {
2788  auto func = llvm::dyn_cast<llvm::Function>(gv);
2789  if (!func) {
2790  return true;
2791  }
2792  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
2793  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage);
2794  });
2795 }
2796 
2797 std::vector<llvm::Value*> generate_column_heads_load(const int num_columns,
2798  llvm::Value* byte_stream_arg,
2799  llvm::IRBuilder<>& ir_builder,
2800  llvm::LLVMContext& ctx) {
2801  CHECK(byte_stream_arg);
2802  const auto max_col_local_id = num_columns - 1;
2803 
2804  std::vector<llvm::Value*> col_heads;
2805  for (int col_id = 0; col_id <= max_col_local_id; ++col_id) {
2806  col_heads.emplace_back(ir_builder.CreateLoad(ir_builder.CreateGEP(
2807  byte_stream_arg, llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), col_id))));
2808  }
2809  return col_heads;
2810 }
std::vector< llvm::Function * > helper_functions_
Definition: CgenState.h:338
std::tuple< llvm::Function *, llvm::CallInst * > query_group_by_template(llvm::Module *module, const bool hoist_literals, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const bool check_scan_limit, const GpuSharedMemoryContext &gpu_smem_context)
std::vector< Analyzer::Expr * > target_exprs
std::string generatePTX(const std::string &) const
#define CHECK_EQ(x, y)
Definition: Logger.h:205
bool is_udf_module_present(bool cpu_only)
std::string filename(char const *path)
Definition: Logger.cpp:62
int64_t * src
llvm::Value * find_variable_in_basic_block(llvm::Function *func, std::string bb_name, std::string variable_name)
llvm::Module * read_template_module(llvm::LLVMContext &context)
std::unique_ptr< llvm::Module > rt_udf_cpu_module
const int64_t const uint32_t const uint32_t const uint32_t agg_col_count
bool g_enable_smem_group_by
llvm::Function * getReductionFunction() const
static const int32_t ERR_INTERRUPTED
Definition: Execute.h:989
std::unique_ptr< llvm::Module > udf_cpu_module
void mark_function_never_inline(llvm::Function *func)
static ExecutionEngineWrapper generateNativeCPUCode(llvm::Function *func, const std::unordered_set< llvm::Function *> &live_funcs, const CompilationOptions &co)
ExecutorDeviceType
std::unordered_map< int, std::unordered_map< int, std::shared_ptr< const ColumnarResults > >> ColumnCacheMap
void initializeNVPTXBackend() const
Streaming Top N algorithm.
#define LOG(tag)
Definition: Logger.h:188
std::unique_ptr< llvm::Module > rt_udf_gpu_module
void add_intrinsics_to_module(llvm::Module *module)
void checkCudaErrors(CUresult err)
Definition: sample.cpp:38
void mark_function_always_inline(llvm::Function *func)
static std::unordered_set< llvm::Function * > markDeadRuntimeFuncs(llvm::Module &module, const std::vector< llvm::Function *> &roots, const std::vector< llvm::Function *> &leaves)
ColRangeInfo getColRangeInfo()
llvm::StringRef get_gpu_data_layout()
void verify_function_ir(const llvm::Function *func)
bool codegen(llvm::Value *filter_result, llvm::BasicBlock *sc_false, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context)
std::string join(T const &container, std::string const &delim)
std::unique_ptr< llvm::Module > runtime_module_shallow_copy(CgenState *cgen_state)
std::vector< InputDescriptor > input_descs
std::string serialize_llvm_metadata_footnotes(llvm::Function *query_func, CgenState *cgen_state)
bool isArchMaxwellOrLaterForAll() const
Definition: CudaMgr.cpp:288
std::vector< std::string > CodeCacheKey
Definition: CodeCache.h:25
static std::shared_ptr< GpuCompilationContext > generateNativeGPUCode(llvm::Function *func, llvm::Function *wrapper_func, const std::unordered_set< llvm::Function *> &live_funcs, const CompilationOptions &co, const GPUTarget &gpu_target)
ExecutorOptLevel opt_level
const std::list< std::shared_ptr< Analyzer::Expr > > groupby_exprs
int getDeviceCount() const
Definition: CudaMgr.h:86
std::unique_ptr< llvm::Module > udf_gpu_module
llvm::Type * get_int_type(const int width, llvm::LLVMContext &context)
static std::string generatePTX(const std::string &cuda_llir, llvm::TargetMachine *nvptx_target_machine, llvm::LLVMContext &context)
static void show_defined(std::unique_ptr< llvm::Module > &module)
unsigned getExpOfTwo(unsigned n)
Definition: MathUtils.h:24
ExecutionEngineWrapper & operator=(const ExecutionEngineWrapper &other)=delete
std::vector< llvm::Value * > generate_column_heads_load(const int num_columns, llvm::Value *byte_stream_arg, llvm::IRBuilder<> &ir_builder, llvm::LLVMContext &ctx)
std::string to_string(char const *&&v)
std::vector< std::string > split(std::string_view str, std::string_view delim, std::optional< size_t > maxsplit)
split apart a string into a vector of substrings
const CudaMgr_Namespace::CudaMgr * cuda_mgr
Definition: CodeGenerator.h:89
#define LOG_IF(severity, condition)
Definition: Logger.h:287
void eliminate_dead_self_recursive_funcs(llvm::Module &M, const std::unordered_set< llvm::Function *> &live_funcs)
std::string assemblyForCPU(ExecutionEngineWrapper &execution_engine, llvm::Module *module)
llvm::Function * row_func_
Definition: CgenState.h:331
std::shared_ptr< CompilationContext > getCodeFromCache(const CodeCacheKey &, const CodeCache &)
bool g_enable_smem_non_grouped_agg
Definition: Execute.cpp:119
Definition: sqldefs.h:73
bool check_module_requires_libdevice(llvm::Module *module)
llvm::TargetMachine * nvptx_target_machine
Definition: CodeGenerator.h:88
bool isPowOfTwo(unsigned n)
Definition: MathUtils.h:20
bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor *query_mem_desc_ptr, const RelAlgExecutionUnit &ra_exe_unit, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const ExecutorDeviceType device_type, const unsigned gpu_blocksize, const unsigned num_blocks_per_mp)
bool isPotentialInSituRender() const
Definition: RenderInfo.cpp:64
llvm::StringRef get_gpu_target_triple_string()
llvm::Module * module_
Definition: CgenState.h:330
bool compileBody(const RelAlgExecutionUnit &ra_exe_unit, GroupByAndAggregate &group_by_and_aggregate, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context={})
std::tuple< CompilationResult, std::unique_ptr< QueryMemoryDescriptor > > compileWorkUnit(const std::vector< InputTableInfo > &query_infos, const PlanState::DeletedColumnsMap &deleted_cols_map, const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co, const ExecutionOptions &eo, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const bool allow_lazy_fetch, std::shared_ptr< RowSetMemoryOwner >, const size_t max_groups_buffer_entry_count, const int8_t crt_min_byte_width, const bool has_cardinality_estimation, ColumnCacheMap &column_cache, RenderInfo *render_info=nullptr)
llvm::LLVMContext & context_
Definition: CgenState.h:339
const bool allow_multifrag
std::shared_ptr< CompilationContext > optimizeAndCodegenCPU(llvm::Function *, llvm::Function *, const std::unordered_set< llvm::Function *> &, const CompilationOptions &)
const_list_iterator_t find(const key_t &key) const
Definition: LruCache.hpp:49
llvm::Value * get_arg_by_name(llvm::Function *func, const std::string &name)
Definition: Execute.h:129
std::unique_ptr< llvm::JITEventListener > intel_jit_listener_
const bool with_dynamic_watchdog
std::unordered_map< TableId, const ColumnDescriptor * > DeletedColumnsMap
Definition: PlanState.h:44
const JoinQualsPerNestingLevel join_quals
bool countDistinctDescriptorsLogicallyEmpty() const
void read_rt_udf_gpu_module(const std::string &udf_ir_string)
ExecutorExplainType explain_type
const_list_iterator_t cend() const
Definition: LruCache.hpp:58
static const int32_t ERR_OUT_OF_TIME
Definition: Execute.h:988
Definition: sqldefs.h:75
static void link_udf_module(const std::unique_ptr< llvm::Module > &udf_module, llvm::Module &module, CgenState *cgen_state, llvm::Linker::Flags flags=llvm::Linker::Flags::None)
const std::shared_ptr< Analyzer::Estimator > estimator
#define AUTOMATIC_IR_METADATA(CGENSTATE)
std::vector< std::string > get_agg_fnames(const std::vector< Analyzer::Expr *> &target_exprs, const bool is_group_by)
std::unique_ptr< llvm::Module > g_rt_module
std::map< std::string, std::string > get_device_parameters(bool cpu_only)
static std::string deviceArchToSM(const NvidiaDeviceArch arch)
Definition: CudaMgr.h:148
static void addCodeToCache(const CodeCacheKey &, std::shared_ptr< CompilationContext >, llvm::Module *, CodeCache &)
#define AUTOMATIC_IR_METADATA_DONE()
ExecutorDeviceType device_type
static bool prioritizeQuals(const RelAlgExecutionUnit &ra_exe_unit, std::vector< Analyzer::Expr *> &primary_quals, std::vector< Analyzer::Expr *> &deferred_quals)
Definition: LogicalIR.cpp:157
void read_udf_gpu_module(const std::string &udf_ir_filename)
std::shared_ptr< CompilationContext > optimizeAndCodegenGPU(llvm::Function *, llvm::Function *, std::unordered_set< llvm::Function *> &, const bool no_inline, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const CompilationOptions &)
std::vector< llvm::Value * > codegen(const Analyzer::Expr *, const bool fetch_columns, const CompilationOptions &)
Definition: IRCodegen.cpp:26
void optimize_ir(llvm::Function *query_func, llvm::Module *module, llvm::legacy::PassManager &pass_manager, const std::unordered_set< llvm::Function *> &live_funcs, const CompilationOptions &co)
llvm::Function * filter_func_
Definition: CgenState.h:332
std::unique_ptr< llvm::ExecutionEngine > execution_engine_
llvm::BasicBlock * codegenSkipDeletedOuterTableRow(const RelAlgExecutionUnit &ra_exe_unit, const CompilationOptions &co)
llvm::Function * getInitFunction() const
void set_row_func_argnames(llvm::Function *row_func, const size_t in_col_count, const size_t agg_col_count, const bool hoist_literals)
std::string cpp_to_llvm_name(const std::string &s)
std::string serialize_llvm_object(const T *llvm_obj)
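serialize_llvm_object is a small template utility; one way such a helper can be written, shown here as a sketch using llvm::raw_string_ostream rather than the actual definition in this file:

  #include <llvm/Support/raw_ostream.h>
  #include <string>

  // Sketch: stream the IR object's textual form into a std::string.
  template <class T>
  std::string serialize_llvm_object_sketch(const T* llvm_obj) {
    std::string str;
    llvm::raw_string_ostream os(str);
    os << *llvm_obj;  // operator<< is provided for llvm::Value, llvm::Function and llvm::Module
    return os.str();
  }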
void clear_function_attributes(llvm::Function *func)
std::unique_ptr< QueryMemoryDescriptor > initQueryMemoryDescriptor(const bool allow_multifrag, const size_t max_groups_buffer_entry_count, const int8_t crt_min_byte_width, RenderInfo *render_info, const bool output_columnar_hint)
SQLAgg get_aggtype() const
Definition: Analyzer.h:1095
bool g_enable_smem_grouped_non_count_agg
Definition: Execute.cpp:116
void read_rt_udf_cpu_module(const std::string &udf_ir_string)
static bool alwaysCloneRuntimeFunction(const llvm::Function *func)
void createErrorCheckControlFlow(llvm::Function *query_func, bool run_with_dynamic_watchdog, bool run_with_allowing_runtime_interrupt, ExecutorDeviceType device_type)
int CUdevice
Definition: nocuda.h:20
bool g_enable_filter_function
Definition: Execute.cpp:77
void bind_pos_placeholders(const std::string &pos_fn_name, const bool use_resume_param, llvm::Function *query_func, llvm::Module *module)
llvm::Value * toBool(llvm::Value *)
Definition: LogicalIR.cpp:335
llvm::LLVMContext & getGlobalLLVMContext()
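One hedged way a process-wide LLVM context accessor such as getGlobalLLVMContext could be structured, assuming a function-local static; the accessor in this codebase may manage its context differently.

  #include <llvm/IR/LLVMContext.h>

  // Sketch only: lazily constructed, process-wide context.
  llvm::LLVMContext& get_global_llvm_context_sketch() {
    static llvm::LLVMContext ctx;
    return ctx;
  }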
float g_fraction_code_cache_to_evict
std::vector< TargetInfo > target_exprs_to_infos(const std::vector< Analyzer::Expr *> &targets, const QueryMemoryDescriptor &query_mem_desc)
static std::vector< std::string > getLLVMDeclarations(const std::unordered_set< std::string > &udf_decls)
std::tuple< llvm::Function *, llvm::CallInst * > query_template(llvm::Module *module, const size_t aggr_col_count, const bool hoist_literals, const bool is_estimate_query, const GpuSharedMemoryContext &gpu_smem_context)
std::list< std::shared_ptr< Analyzer::Expr > > quals
size_t getMinSharedMemoryPerBlockForAllDevices() const
Definition: CudaMgr.h:114
#define CHECK(condition)
Definition: Logger.h:197
#define DEBUG_TIMER(name)
Definition: Logger.h:313
llvm::ValueToValueMapTy vmap_
Definition: CgenState.h:340
std::string get_root_abs_path()
std::vector< llvm::Value * > inlineHoistedLiterals()
llvm::Function * create_row_function(const size_t in_col_count, const size_t agg_col_count, const bool hoist_literals, llvm::Module *module, llvm::LLVMContext &context)
void injectFunctionsInto(llvm::Function *query_func)
std::list< std::shared_ptr< const InputColDescriptor > > input_col_descs
void throw_parseIR_error(const llvm::SMDiagnostic &parse_error, std::string src="")
void read_udf_cpu_module(const std::string &udf_ir_filename)
const bool allow_runtime_query_interrupt
QueryDescriptionType getQueryDescriptionType() const
int cpu_threads()
Definition: thread_count.h:24
std::vector< int64_t > init_agg_val_vec(const std::vector< TargetInfo > &targets, const QueryMemoryDescriptor &query_mem_desc)
CubinResult ptx_to_cubin(const std::string &ptx, const unsigned block_size, const CudaMgr_Namespace::CudaMgr *cuda_mgr)
void put(const key_t &key, value_t &&value)
Definition: LruCache.hpp:27
#define VLOG(n)
Definition: Logger.h:291
size_t get_shared_memory_size(const bool shared_mem_used, const QueryMemoryDescriptor *query_mem_desc_ptr)
void bind_query(llvm::Function *query_func, const std::string &query_fname, llvm::Function *multifrag_query_func, llvm::Module *module)
static std::unique_ptr< llvm::TargetMachine > initializeNVPTXBackend(const CudaMgr_Namespace::NvidiaDeviceArch arch)
bool is_rt_udf_module_present(bool cpu_only)
size_t g_gpu_smem_threshold
Definition: Execute.cpp:111