OmniSciDB  72180abbfe
NativeCodegen.cpp
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CodeGenerator.h"
18 #include "Execute.h"
20 #include "GpuSharedMemoryUtils.h"
23 #include "QueryTemplateGenerator.h"
24 
25 #include "Shared/MathUtils.h"
26 #include "Shared/mapdpath.h"
27 #include "StreamingTopN.h"
28 
29 #if LLVM_VERSION_MAJOR < 4
30 static_assert(false, "LLVM Version >= 4 is required.");
31 #endif
32 
33 #include <llvm/Bitcode/BitcodeReader.h>
34 #include <llvm/Bitcode/BitcodeWriter.h>
35 #include <llvm/ExecutionEngine/MCJIT.h>
36 #include <llvm/IR/Attributes.h>
37 #include <llvm/IR/GlobalValue.h>
38 #include <llvm/IR/InstIterator.h>
39 #include <llvm/IR/LegacyPassManager.h>
40 #include <llvm/IR/Verifier.h>
41 #include <llvm/IRReader/IRReader.h>
42 #include <llvm/Support/Casting.h>
43 #include <llvm/Support/FileSystem.h>
44 #include <llvm/Support/FormattedStream.h>
45 #include <llvm/Support/MemoryBuffer.h>
46 #include <llvm/Support/SourceMgr.h>
47 #include <llvm/Support/TargetRegistry.h>
48 #include <llvm/Support/TargetSelect.h>
49 #include <llvm/Support/raw_os_ostream.h>
50 #include <llvm/Transforms/IPO.h>
51 #include <llvm/Transforms/IPO/AlwaysInliner.h>
52 #include <llvm/Transforms/IPO/PassManagerBuilder.h>
53 #include <llvm/Transforms/InstCombine/InstCombine.h>
54 #include <llvm/Transforms/Instrumentation.h>
55 #include <llvm/Transforms/Scalar.h>
56 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
57 #include <llvm/Transforms/Utils/Cloning.h>
58 #include "llvm/IR/IntrinsicInst.h"
59 #include "llvm/IR/Intrinsics.h"
60 
61 #if LLVM_VERSION_MAJOR >= 7
62 #include <llvm/Transforms/Scalar/InstSimplifyPass.h>
63 #include <llvm/Transforms/Utils.h>
64 #endif
65 #include <llvm/IRReader/IRReader.h>
66 #include <llvm/Linker/Linker.h>
67 #include <llvm/Support/SourceMgr.h>
68 #include <llvm/Support/raw_ostream.h>
69 
71 
72 std::unique_ptr<llvm::Module> udf_gpu_module;
73 std::unique_ptr<llvm::Module> udf_cpu_module;
74 std::unique_ptr<llvm::Module> rt_udf_gpu_module;
75 std::unique_ptr<llvm::Module> rt_udf_cpu_module;
76 
77 extern std::unique_ptr<llvm::Module> g_rt_module;
78 
79 #ifdef ENABLE_GEOS
80 extern std::unique_ptr<llvm::Module> g_rt_geos_module;
81 
82 #include <llvm/Support/DynamicLibrary.h>
83 
84 #ifndef GEOS_LIBRARY_FILENAME
85 #error Configuration should include GEOS library file name
86 #endif
87 std::unique_ptr<std::string> g_libgeos_so_filename(
88  new std::string(GEOS_LIBRARY_FILENAME));
89 static llvm::sys::DynamicLibrary geos_dynamic_library;
90 static std::mutex geos_init_mutex;
91 
92 namespace {
93 
94 void load_geos_dynamic_library() {
95  std::lock_guard<std::mutex> guard(geos_init_mutex);
96 
97  if (!geos_dynamic_library.isValid()) {
98  if (!g_libgeos_so_filename || g_libgeos_so_filename->empty()) {
99  LOG(WARNING) << "Misconfigured GEOS library file name, trying 'libgeos_c.so'";
100  g_libgeos_so_filename.reset(new std::string("libgeos_c.so"));
101  }
102  auto filename = *g_libgeos_so_filename;
103  std::string error_message;
104  geos_dynamic_library =
105  llvm::sys::DynamicLibrary::getPermanentLibrary(filename.c_str(), &error_message);
106  if (!geos_dynamic_library.isValid()) {
107  LOG(ERROR) << "Failed to load GEOS library '" + filename + "'";
108  std::string exception_message = "Failed to load GEOS library: " + error_message;
109  throw std::runtime_error(exception_message.c_str());
110  } else {
111  LOG(INFO) << "Loaded GEOS library '" + filename + "'";
112  }
113  }
114 }
115 
116 } // namespace
117 #endif
118 
119 namespace {
120 
121 #if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
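// Erases functions that are not in the live set and whose only uses are calls
// made from within their own body, i.e. dead self-recursive helpers.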
122 void eliminate_dead_self_recursive_funcs(
123  llvm::Module& M,
124  const std::unordered_set<llvm::Function*>& live_funcs) {
125  std::vector<llvm::Function*> dead_funcs;
126  for (auto& F : M) {
127  bool bAlive = false;
128  if (live_funcs.count(&F)) {
129  continue;
130  }
131  for (auto U : F.users()) {
132  auto* C = llvm::dyn_cast<const llvm::CallInst>(U);
133  if (!C || C->getParent()->getParent() != &F) {
134  bAlive = true;
135  break;
136  }
137  }
138  if (!bAlive) {
139  dead_funcs.push_back(&F);
140  }
141  }
142  for (auto pFn : dead_funcs) {
143  pFn->eraseFromParent();
144  }
145 }
146 
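// Standard optimization pipeline run over the generated module: always-inline,
// mem2reg, instruction simplification/combining, global optimization, LICM and
// (conditionally) loop strength reduction, followed by removal of dead
// self-recursive functions.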
147 void optimize_ir(llvm::Function* query_func,
148  llvm::Module* module,
149  llvm::legacy::PassManager& pass_manager,
150  const std::unordered_set<llvm::Function*>& live_funcs,
151  const CompilationOptions& co) {
152  pass_manager.add(llvm::createAlwaysInlinerLegacyPass());
153  pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
154 #if LLVM_VERSION_MAJOR >= 7
155  pass_manager.add(llvm::createInstSimplifyLegacyPass());
156 #else
157  pass_manager.add(llvm::createInstructionSimplifierPass());
158 #endif
159  pass_manager.add(llvm::createInstructionCombiningPass());
160  pass_manager.add(llvm::createGlobalOptimizerPass());
161 
162  pass_manager.add(llvm::createLICMPass());
163  if (co.opt_level_ == ExecutorOptLevel::LoopStrengthReduction) {
164  pass_manager.add(llvm::createLoopStrengthReducePass());
165  }
166  pass_manager.run(*module);
167 
168  eliminate_dead_self_recursive_funcs(*module, live_funcs);
169 }
170 #endif
171 
172 } // namespace
173 
174 ExecutionEngineWrapper::ExecutionEngineWrapper() {}
175 
176 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine)
177  : execution_engine_(execution_engine) {}
178 
179 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine,
180  const CompilationOptions& co)
181  : execution_engine_(execution_engine) {
182  if (execution_engine_) {
183  if (co.register_intel_jit_listener_) {
184  intel_jit_listener_.reset(llvm::JITEventListener::createIntelJITEventListener());
185  CHECK(intel_jit_listener_);
186  execution_engine_->RegisterJITEventListener(intel_jit_listener_.get());
187  LOG(INFO) << "Registered IntelJITEventListener";
188  }
189  }
190 }
191 
192 ExecutionEngineWrapper& ExecutionEngineWrapper::operator=(
193  llvm::ExecutionEngine* execution_engine) {
194  execution_engine_.reset(execution_engine);
195  intel_jit_listener_ = nullptr;
196  return *this;
197 }
198 
199 void verify_function_ir(const llvm::Function* func) {
200  std::stringstream err_ss;
201  llvm::raw_os_ostream err_os(err_ss);
202  if (llvm::verifyFunction(*func, &err_os)) {
203  func->print(llvm::outs());
204  LOG(FATAL) << err_ss.str();
205  }
206 }
207 
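// Compiled kernels are cached keyed on the serialized LLVM IR of the generated
// functions; a cache hit replaces the current CgenState module with the cached one.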
208 std::shared_ptr<CompilationContext> Executor::getCodeFromCache(const CodeCacheKey& key,
209  const CodeCache& cache) {
210  auto it = cache.find(key);
211  if (it != cache.cend()) {
212  delete cgen_state_->module_;
213  cgen_state_->module_ = it->second.second;
214  return it->second.first;
215  }
216  return {};
217 }
218 
219 void Executor::addCodeToCache(const CodeCacheKey& key,
220  std::shared_ptr<CompilationContext> compilation_context,
221  llvm::Module* module,
222  CodeCache& cache) {
223  cache.put(key,
224  std::make_pair<std::shared_ptr<CompilationContext>, decltype(module)>(
225  std::move(compilation_context), std::move(module)));
226 }
227 
228 ExecutionEngineWrapper CodeGenerator::generateNativeCPUCode(
229  llvm::Function* func,
230  const std::unordered_set<llvm::Function*>& live_funcs,
231  const CompilationOptions& co) {
232  auto module = func->getParent();
233  // run optimizations
234 #ifndef WITH_JIT_DEBUG
235  llvm::legacy::PassManager pass_manager;
236  optimize_ir(func, module, pass_manager, live_funcs, co);
237 #endif // WITH_JIT_DEBUG
238 
239  auto init_err = llvm::InitializeNativeTarget();
240  CHECK(!init_err);
241 
242  llvm::InitializeAllTargetMCs();
243  llvm::InitializeNativeTargetAsmPrinter();
244  llvm::InitializeNativeTargetAsmParser();
245 
246  std::string err_str;
247  std::unique_ptr<llvm::Module> owner(module);
248  llvm::EngineBuilder eb(std::move(owner));
249  eb.setErrorStr(&err_str);
250  eb.setEngineKind(llvm::EngineKind::JIT);
251  llvm::TargetOptions to;
252  to.EnableFastISel = true;
253  eb.setTargetOptions(to);
254  if (co.opt_level_ == ExecutorOptLevel::ReductionJIT) {
255  eb.setOptLevel(llvm::CodeGenOpt::None);
256  }
257 
258  ExecutionEngineWrapper execution_engine(eb.create(), co);
259  CHECK(execution_engine.get());
260 
261  execution_engine->finalizeObject();
262 
263  return execution_engine;
264 }
265 
266 std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenCPU(
267  llvm::Function* query_func,
268  llvm::Function* multifrag_query_func,
269  const std::unordered_set<llvm::Function*>& live_funcs,
270  const CompilationOptions& co) {
271  auto module = multifrag_query_func->getParent();
272  CodeCacheKey key{serialize_llvm_object(query_func),
273  serialize_llvm_object(cgen_state_->row_func_)};
274  for (const auto helper : cgen_state_->helper_functions_) {
275  key.push_back(serialize_llvm_object(helper));
276  }
277  auto cached_code = getCodeFromCache(key, cpu_code_cache_);
278  if (cached_code) {
279  return cached_code;
280  }
281 
282  if (cgen_state_->needs_geos_) {
283 #ifdef ENABLE_GEOS
284  load_geos_dynamic_library();
285 
286  // Read geos runtime module and bind GEOS API function references to GEOS library
287  auto rt_geos_module_copy = llvm::CloneModule(
288 #if LLVM_VERSION_MAJOR >= 7
289  *g_rt_geos_module.get(),
290 #else
291  g_rt_geos_module.get(),
292 #endif
293  cgen_state_->vmap_,
294  [](const llvm::GlobalValue* gv) {
295  auto func = llvm::dyn_cast<llvm::Function>(gv);
296  if (!func) {
297  return true;
298  }
299  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
300  func->getLinkage() ==
301  llvm::GlobalValue::LinkageTypes::InternalLinkage ||
302  func->getLinkage() == llvm::GlobalValue::LinkageTypes::ExternalLinkage);
303  });
304  CodeGenerator::link_udf_module(rt_geos_module_copy,
305  *module,
306  cgen_state_.get(),
307  llvm::Linker::Flags::LinkOnlyNeeded);
308 #else
309  throw std::runtime_error("GEOS is disabled in this build");
310 #endif
311  }
312 
313  auto execution_engine =
314  CodeGenerator::generateNativeCPUCode(query_func, live_funcs, co);
315  auto cpu_compilation_context =
316  std::make_shared<CpuCompilationContext>(std::move(execution_engine));
317  cpu_compilation_context->setFunctionPointer(multifrag_query_func);
318  addCodeToCache(key, cpu_compilation_context, module, cpu_code_cache_);
319  return cpu_compilation_context;
320 }
321 
322 void CodeGenerator::link_udf_module(const std::unique_ptr<llvm::Module>& udf_module,
323  llvm::Module& module,
324  CgenState* cgen_state,
325  llvm::Linker::Flags flags) {
326  // throw a runtime error if the target module contains functions
327  // with the same name as functions in the UDF module.
328  for (auto& f : *udf_module.get()) {
329  auto func = module.getFunction(f.getName());
330  if (!(func == nullptr) && !f.isDeclaration() && flags == llvm::Linker::Flags::None) {
331  LOG(ERROR) << " Attempt to overwrite " << f.getName().str() << " in "
332  << module.getModuleIdentifier() << " from `"
333  << udf_module->getModuleIdentifier() << "`" << std::endl;
334  throw std::runtime_error(
335  "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
336  "function ***");
337  } else {
338  VLOG(1) << " Adding " << f.getName().str() << " to "
339  << module.getModuleIdentifier() << " from `"
340  << udf_module->getModuleIdentifier() << "`" << std::endl;
341  }
342  }
343 
344  std::unique_ptr<llvm::Module> udf_module_copy;
345 
346  udf_module_copy = llvm::CloneModule(
347 #if LLVM_VERSION_MAJOR >= 7
348  *udf_module.get(),
349 #else
350  udf_module.get(),
351 #endif
352  cgen_state->vmap_);
353 
354  udf_module_copy->setDataLayout(module.getDataLayout());
355  udf_module_copy->setTargetTriple(module.getTargetTriple());
356 
357  // Initialize linker with module for RuntimeFunctions.bc
358  llvm::Linker ld(module);
359  bool link_error = false;
360 
361  link_error = ld.linkInModule(std::move(udf_module_copy), flags);
362 
363  if (link_error) {
364  throw std::runtime_error("link_udf_module: *** error linking module ***");
365  }
366 }
367 
368 namespace {
369 
370 std::string cpp_to_llvm_name(const std::string& s) {
371  if (s == "int8_t") {
372  return "i8";
373  }
374  if (s == "int16_t") {
375  return "i16";
376  }
377  if (s == "int32_t") {
378  return "i32";
379  }
380  if (s == "int64_t") {
381  return "i64";
382  }
383  CHECK(s == "float" || s == "double");
384  return s;
385 }
386 
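// Emits LLVM IR declarations for the array_any/array_all comparison helpers,
// one per (any|all, element type, needle type, comparison op) combination.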
387 std::string gen_array_any_all_sigs() {
388  std::string result;
389  for (const std::string any_or_all : {"any", "all"}) {
390  for (const std::string elem_type :
391  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
392  for (const std::string needle_type :
393  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
394  for (const std::string op_name : {"eq", "ne", "lt", "le", "gt", "ge"}) {
395  result += ("declare i1 @array_" + any_or_all + "_" + op_name + "_" + elem_type +
396  "_" + needle_type + "(i8*, i64, " + cpp_to_llvm_name(needle_type) +
397  ", " + cpp_to_llvm_name(elem_type) + ");\n");
398  }
399  }
400  }
401  }
402  return result;
403 }
404 
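// Emits LLVM IR declarations for the translate_null_key_<type> helpers,
// one per integer key type.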
405 std::string gen_translate_null_key_sigs() {
406  std::string result;
407  for (const std::string key_type : {"int8_t", "int16_t", "int32_t", "int64_t"}) {
408  const auto key_llvm_type = cpp_to_llvm_name(key_type);
409  result += "declare i64 @translate_null_key_" + key_type + "(" + key_llvm_type + ", " +
410  key_llvm_type + ", i64);\n";
411  }
412  return result;
413 }
414 
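// LLVM IR declarations for the device runtime functions that generated GPU
// code may call; the generated array and null-key helper declarations above
// are appended to this string.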
415 const std::string cuda_rt_decls =
416  R"( declare void @llvm.dbg.declare(metadata, metadata, metadata) declare void @llvm.dbg.value(metadata, metadata, metadata) declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind declare i64 @get_thread_index(); declare i64 @get_block_index(); declare i32 @pos_start_impl(i32*); declare i32 @group_buff_idx_impl(); declare i32 @pos_step_impl(); declare i8 @thread_warp_idx(i8); declare i64* @init_shared_mem(i64*, i32); declare i64* @init_shared_mem_nop(i64*, i32); declare i64* @declare_dynamic_shared_memory(); declare void @write_back_nop(i64*, i64*, i32); declare void @write_back_non_grouped_agg(i64*, i64*, i32); declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8); declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32, i64*); declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32, i64*); declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32); declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32); declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32); declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32); declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64); declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64); declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64); declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64); declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64); declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double); declare i64 @get_bucket_key_for_range_double(i8*, i64, double); declare i32 @get_num_buckets_for_bounds(i8*, i32, double, double); declare i64 @get_candidate_rows(i32*, i32, i8*, i32, double, double, i32, i64, i64*, i64, i64, i64); declare i64 @agg_count_shared(i64*, i64); declare i64 @agg_count_skip_val_shared(i64*, i64, i64); declare i32 @agg_count_int32_shared(i32*, i32); declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32); declare i64 @agg_count_double_shared(i64*, double); declare i64 @agg_count_double_skip_val_shared(i64*, double, double); declare i32 @agg_count_float_shared(i32*, float); declare i32 @agg_count_float_skip_val_shared(i32*, float, float); declare i64 @agg_sum_shared(i64*, i64); declare i64 @agg_sum_skip_val_shared(i64*, i64, i64); declare i32 @agg_sum_int32_shared(i32*, i32); declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32); declare void @agg_sum_double_shared(i64*, double); declare void @agg_sum_double_skip_val_shared(i64*, double, double); declare void @agg_sum_float_shared(i32*, float); declare void @agg_sum_float_skip_val_shared(i32*, float, float); declare void @agg_max_shared(i64*, i64); declare void @agg_max_skip_val_shared(i64*, i64, i64); declare void @agg_max_int32_shared(i32*, i32); declare void @agg_max_int32_skip_val_shared(i32*, i32, i32); declare void @agg_max_int16_shared(i16*, i16); declare void @agg_max_int16_skip_val_shared(i16*, i16, i16); declare void @agg_max_int8_shared(i8*, i8); declare void @agg_max_int8_skip_val_shared(i8*, i8, i8); declare void @agg_max_double_shared(i64*, double); declare void @agg_max_double_skip_val_shared(i64*, double, double); declare void @agg_max_float_shared(i32*, float); declare void @agg_max_float_skip_val_shared(i32*, float, float); declare void @agg_min_shared(i64*, i64); declare void 
@agg_min_skip_val_shared(i64*, i64, i64); declare void @agg_min_int32_shared(i32*, i32); declare void @agg_min_int32_skip_val_shared(i32*, i32, i32); declare void @agg_min_int16_shared(i16*, i16); declare void @agg_min_int16_skip_val_shared(i16*, i16, i16); declare void @agg_min_int8_shared(i8*, i8); declare void @agg_min_int8_skip_val_shared(i8*, i8, i8); declare void @agg_min_double_shared(i64*, double); declare void @agg_min_double_skip_val_shared(i64*, double, double); declare void @agg_min_float_shared(i32*, float); declare void @agg_min_float_skip_val_shared(i32*, float, float); declare void @agg_id_shared(i64*, i64); declare void @agg_id_int32_shared(i32*, i32); declare void @agg_id_int16_shared(i16*, i16); declare void @agg_id_int8_shared(i8*, i8); declare void @agg_id_double_shared(i64*, double); declare void @agg_id_double_shared_slow(i64*, double*); declare void @agg_id_float_shared(i32*, float); declare i32 @checked_single_agg_id_shared(i64*, i64, i64); declare i32 @checked_single_agg_id_double_shared(i64*, double, double); declare i32 @checked_single_agg_id_double_shared_slow(i64*, double*, double); declare i32 @checked_single_agg_id_float_shared(i32*, float, float); declare i1 @slotEmptyKeyCAS(i64*, i64, i64); declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32); declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16); declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8); declare i64 @extract_epoch(i64); declare i64 @extract_dateepoch(i64); declare i64 @extract_quarterday(i64); declare i64 @extract_hour(i64); declare i64 @extract_minute(i64); declare i64 @extract_second(i64); declare i64 @extract_millisecond(i64); declare i64 @extract_microsecond(i64); declare i64 @extract_nanosecond(i64); declare i64 @extract_dow(i64); declare i64 @extract_isodow(i64); declare i64 @extract_day(i64); declare i64 @extract_week(i64); declare i64 @extract_day_of_week(i64); declare i64 @extract_month(i64); declare i64 @extract_quarter(i64); declare i64 @extract_year(i64); declare i64 @DateTruncate(i32, i64); declare i64 @DateTruncateNullable(i32, i64, i64); declare i64 @DateTruncateHighPrecisionToDate(i64, i64); declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64); declare i64 @DateDiff(i32, i64, i64); declare i64 @DateDiffNullable(i32, i64, i64, i64); declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i64, i64, i64); declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i64, i64, i64, i64); declare i64 @DateAdd(i32, i64, i64); declare i64 @DateAddNullable(i32, i64, i64, i64); declare i64 @DateAddHighPrecision(i32, i64, i64, i64); declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i64, i64); declare i64 @string_decode(i8*, i64); declare i32 @array_size(i8*, i64, i32); declare i32 @array_size_nullable(i8*, i64, i32, i32); declare i32 @fast_fixlen_array_size(i8*, i32); declare i1 @array_is_null(i8*, i64); declare i1 @point_coord_array_is_null(i8*, i64); declare i8* @array_buff(i8*, i64); declare i8* @fast_fixlen_array_buff(i8*, i64); declare i8 @array_at_int8_t(i8*, i64, i32); declare i16 @array_at_int16_t(i8*, i64, i32); declare i32 @array_at_int32_t(i8*, i64, i32); declare i64 @array_at_int64_t(i8*, i64, i32); declare float @array_at_float(i8*, i64, i32); declare double @array_at_double(i8*, i64, i32); declare i8 @varlen_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_array_at_int64_t(i8*, i64, i32); declare float @varlen_array_at_float(i8*, i64, i32); declare double 
@varlen_array_at_double(i8*, i64, i32); declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32); declare float @varlen_notnull_array_at_float(i8*, i64, i32); declare double @varlen_notnull_array_at_double(i8*, i64, i32); declare i8 @array_at_int8_t_checked(i8*, i64, i64, i8); declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16); declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32); declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64); declare float @array_at_float_checked(i8*, i64, i64, float); declare double @array_at_double_checked(i8*, i64, i64, double); declare i32 @char_length(i8*, i32); declare i32 @char_length_nullable(i8*, i32, i32); declare i32 @char_length_encoded(i8*, i32); declare i32 @char_length_encoded_nullable(i8*, i32, i32); declare i32 @key_for_string_encoded(i32); declare i1 @string_like(i8*, i32, i8*, i32, i8); declare i1 @string_ilike(i8*, i32, i8*, i32, i8); declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8); declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8); declare i1 @string_like_simple(i8*, i32, i8*, i32); declare i1 @string_ilike_simple(i8*, i32, i8*, i32); declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8); declare i1 @string_lt(i8*, i32, i8*, i32); declare i1 @string_le(i8*, i32, i8*, i32); declare i1 @string_gt(i8*, i32, i8*, i32); declare i1 @string_ge(i8*, i32, i8*, i32); declare i1 @string_eq(i8*, i32, i8*, i32); declare i1 @string_ne(i8*, i32, i8*, i32); declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8); declare i1 @regexp_like(i8*, i32, i8*, i32, i8); declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8); declare void @linear_probabilistic_count(i8*, i32, i8*, i32); declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, i64); declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64); declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64); declare i32 @record_error_code(i32, i32*); declare i1 @dynamic_watchdog(); declare i1 @check_interrupt(); declare void @force_sync(); declare void @sync_warp(); declare void @sync_warp_protected(i64, i64); declare void @sync_threadblock(); declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32); declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64); declare i64* @get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, float, float); declare i64* @get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double); )" + gen_array_any_all_sigs() +
417  gen_translate_null_key_sigs();
418 
419 #ifdef HAVE_CUDA
420 std::string extension_function_decls(const std::unordered_set<std::string>& udf_decls) {
421  const auto decls = ExtensionFunctionsWhitelist::getLLVMDeclarations(udf_decls);
422  return boost::algorithm::join(decls, "\n");
423 }
424 
425 void legalize_nvvm_ir(llvm::Function* query_func) {
426  // optimizations might add attributes to the function
427  // and NVPTX doesn't understand all of them; play it
428  // safe and clear all attributes
429  clear_function_attributes(query_func);
430  verify_function_ir(query_func);
431 
432  std::vector<llvm::Instruction*> stackrestore_intrinsics;
433  std::vector<llvm::Instruction*> stacksave_intrinsics;
434  for (auto& BB : *query_func) {
435  for (llvm::Instruction& I : BB) {
436  if (const llvm::IntrinsicInst* II = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
437  if (II->getIntrinsicID() == llvm::Intrinsic::stacksave) {
438  stacksave_intrinsics.push_back(&I);
439  } else if (II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
440  stackrestore_intrinsics.push_back(&I);
441  }
442  }
443  }
444  }
445 
446  // stacksave and stackrestore intrinsics appear together, and
447  // stackrestore uses the stacksave result as its argument,
448  // so it should be removed first.
449  for (auto& II : stackrestore_intrinsics) {
450  II->eraseFromParent();
451  }
452  for (auto& II : stacksave_intrinsics) {
453  II->eraseFromParent();
454  }
455 }
456 #endif // HAVE_CUDA
457 
458 } // namespace
459 
460 llvm::StringRef get_gpu_target_triple_string() {
461  return llvm::StringRef("nvptx64-nvidia-cuda");
462 }
463 
464 llvm::StringRef get_gpu_data_layout() {
465  return llvm::StringRef(
466  "e-p:64:64:64-i1:8:8-i8:8:8-"
467  "i16:16:16-i32:32:32-i64:64:64-"
468  "f32:32:32-f64:64:64-v16:16:16-"
469  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
470 }
471 
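// Collects host CPU name, triple, core/thread counts and feature flags, plus
// (when built with CUDA) the GPU name, count, compute capability, target
// triple and data layout, into a string map.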
472 std::map<std::string, std::string> get_device_parameters() {
473  std::map<std::string, std::string> result;
474 
475  result.insert(std::make_pair("cpu_name", llvm::sys::getHostCPUName()));
476  result.insert(std::make_pair("cpu_triple", llvm::sys::getProcessTriple()));
477  result.insert(
478  std::make_pair("cpu_cores", std::to_string(llvm::sys::getHostNumPhysicalCores())));
479  result.insert(std::make_pair("cpu_threads", std::to_string(cpu_threads())));
480 
481  llvm::StringMap<bool> cpu_features;
482  if (llvm::sys::getHostCPUFeatures(cpu_features)) {
483  std::string features_str = "";
484  for (auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
485  features_str += (it->getValue() ? " +" : " -");
486  features_str += it->getKey().str();
487  }
488  result.insert(std::make_pair("cpu_features", features_str));
489  }
490 
491 #ifdef HAVE_CUDA
492  int device_count = 0;
493  checkCudaErrors(cuDeviceGetCount(&device_count));
494  if (device_count) {
495  CUdevice device{};
496  char device_name[256];
497  int major = 0, minor = 0;
498  checkCudaErrors(cuDeviceGet(&device, 0)); // assuming homogeneous multi-GPU system
499  checkCudaErrors(cuDeviceGetName(device_name, 256, device));
500  checkCudaErrors(cuDeviceGetAttribute(
501  &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
502  checkCudaErrors(cuDeviceGetAttribute(
503  &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
504 
505  result.insert(std::make_pair("gpu_name", device_name));
506  result.insert(std::make_pair("gpu_count", std::to_string(device_count)));
507  result.insert(std::make_pair("gpu_compute_capability",
508  std::to_string(major) + "." + std::to_string(minor)));
509  result.insert(std::make_pair("gpu_triple", get_gpu_target_triple_string()));
510  result.insert(std::make_pair("gpu_datalayout", get_gpu_data_layout()));
511  }
512 #endif
513 
514  return result;
515 }
516 
517 std::shared_ptr<GpuCompilationContext> CodeGenerator::generateNativeGPUCode(
518  llvm::Function* func,
519  llvm::Function* wrapper_func,
520  const std::unordered_set<llvm::Function*>& live_funcs,
521  const CompilationOptions& co,
522  const GPUTarget& gpu_target) {
523 #ifdef HAVE_CUDA
524  auto module = func->getParent();
525  module->setDataLayout(
526  "e-p:64:64:64-i1:8:8-i8:8:8-"
527  "i16:16:16-i32:32:32-i64:64:64-"
528  "f32:32:32-f64:64:64-v16:16:16-"
529  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
530  module->setTargetTriple("nvptx64-nvidia-cuda");
531  CHECK(gpu_target.nvptx_target_machine);
532  auto pass_manager_builder = llvm::PassManagerBuilder();
533  // add nvvm reflect pass replacing any NVVM conditionals with constants
534  gpu_target.nvptx_target_machine->adjustPassManager(pass_manager_builder);
535  pass_manager_builder.OptLevel = 0;
536 
537  llvm::legacy::PassManager pass_manager;
538  pass_manager_builder.populateModulePassManager(pass_manager);
539  // run optimizations
540  optimize_ir(func, module, pass_manager, live_funcs, co);
541  legalize_nvvm_ir(func);
542 
543  std::stringstream ss;
544  llvm::raw_os_ostream os(ss);
545 
546  llvm::LLVMContext& ctx = module->getContext();
547  // Get "nvvm.annotations" metadata node
548  llvm::NamedMDNode* md = module->getOrInsertNamedMetadata("nvvm.annotations");
549 
550  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
551  llvm::MDString::get(ctx, "kernel"),
552  llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
553  llvm::Type::getInt32Ty(ctx), 1))};
554 
555  // Append metadata to nvvm.annotations
556  md->addOperand(llvm::MDNode::get(ctx, md_vals));
557 
558  std::unordered_set<llvm::Function*> roots{wrapper_func, func};
559  if (gpu_target.row_func_not_inlined) {
560  clear_function_attributes(gpu_target.cgen_state->row_func_);
561  roots.insert(gpu_target.cgen_state->row_func_);
562  }
563 
564  // prevent helper functions from being removed
565  for (auto f : gpu_target.cgen_state->helper_functions_) {
566  roots.insert(f);
567  }
568 
569  // Prevent the udf function(s) from being removed the way the runtime functions are
570 
571  std::unordered_set<std::string> udf_declarations;
572  if (is_udf_module_present()) {
573  for (auto& f : udf_gpu_module->getFunctionList()) {
574  llvm::Function* udf_function = module->getFunction(f.getName());
575 
576  if (udf_function) {
577  legalize_nvvm_ir(udf_function);
578  roots.insert(udf_function);
579 
580  // If we have a udf that declares an external function
581  // note it so we can avoid duplicate declarations
582  if (f.isDeclaration()) {
583  udf_declarations.insert(f.getName().str());
584  }
585  }
586  }
587  }
588 
589  if (is_rt_udf_module_present()) {
590  for (auto& f : rt_udf_gpu_module->getFunctionList()) {
591  llvm::Function* udf_function = module->getFunction(f.getName());
592  if (udf_function) {
593  legalize_nvvm_ir(udf_function);
594  roots.insert(udf_function);
595 
596  // If we have a udf that declares an external function
597  // note it so we can avoid duplicate declarations
598  if (f.isDeclaration()) {
599  udf_declarations.insert(f.getName().str());
600  }
601  }
602  }
603  }
604 
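// Temporarily detach every function that is not a root so that only the
// kernel wrapper, the query function and their helpers are printed into the
// NVVM IR string; the detached functions are re-attached afterwards.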
605  std::vector<llvm::Function*> rt_funcs;
606  for (auto& Fn : *module) {
607  if (roots.count(&Fn)) {
608  continue;
609  }
610  rt_funcs.push_back(&Fn);
611  }
612  for (auto& pFn : rt_funcs) {
613  pFn->removeFromParent();
614  }
615  module->print(os, nullptr);
616  os.flush();
617 
618  for (auto& pFn : rt_funcs) {
619  module->getFunctionList().push_back(pFn);
620  }
621  module->eraseNamedMetadata(md);
622 
623  auto cuda_llir = cuda_rt_decls + extension_function_decls(udf_declarations) + ss.str();
624  const auto ptx = generatePTX(
625  cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state->context_);
626 
627  LOG(PTX) << "PTX for the GPU:\n" << ptx << "\nEnd of PTX";
628 
629  auto cubin_result = ptx_to_cubin(ptx, gpu_target.block_size, gpu_target.cuda_mgr);
630  auto& option_keys = cubin_result.option_keys;
631  auto& option_values = cubin_result.option_values;
632  auto cubin = cubin_result.cubin;
633  auto link_state = cubin_result.link_state;
634  const auto num_options = option_keys.size();
635 
636  auto func_name = wrapper_func->getName().str();
637  auto gpu_compilation_context = std::make_shared<GpuCompilationContext>();
638  for (int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
639  ++device_id) {
640  gpu_compilation_context->addDeviceCode(
641  std::make_unique<GpuDeviceCompilationContext>(cubin,
642  func_name,
643  device_id,
644  gpu_target.cuda_mgr,
645  num_options,
646  &option_keys[0],
647  &option_values[0]));
648  }
649 
650  checkCudaErrors(cuLinkDestroy(link_state));
651  return gpu_compilation_context;
652 #else
653  return {};
654 #endif
655 }
657 std::shared_ptr<CompilationContext> Executor::optimizeAndCodegenGPU(
658  llvm::Function* query_func,
659  llvm::Function* multifrag_query_func,
660  std::unordered_set<llvm::Function*>& live_funcs,
661  const bool no_inline,
662  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
663  const CompilationOptions& co) {
664 #ifdef HAVE_CUDA
665  auto module = multifrag_query_func->getParent();
666  CHECK(cuda_mgr);
667  CodeCacheKey key{serialize_llvm_object(query_func),
668  serialize_llvm_object(cgen_state_->row_func_)};
669 
670  for (const auto helper : cgen_state_->helper_functions_) {
671  key.push_back(serialize_llvm_object(helper));
672  }
673  auto cached_code = getCodeFromCache(key, gpu_code_cache_);
674  if (cached_code) {
675  return cached_code;
676  }
677 
678  bool row_func_not_inlined = false;
679  if (no_inline) {
680  for (auto it = llvm::inst_begin(cgen_state_->row_func_),
681  e = llvm::inst_end(cgen_state_->row_func_);
682  it != e;
683  ++it) {
684  if (llvm::isa<llvm::CallInst>(*it)) {
685  auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
686  if (get_gv_call.getCalledFunction()->getName() == "array_size" ||
687  get_gv_call.getCalledFunction()->getName() == "linear_probabilistic_count") {
688  mark_function_never_inline(cgen_state_->row_func_);
689  row_func_not_inlined = true;
690  break;
691  }
692  }
693  }
694  }
695 
696  initializeNVPTXBackend();
697  CodeGenerator::GPUTarget gpu_target{nvptx_target_machine_.get(),
698  cuda_mgr,
699  blockSize(),
700  cgen_state_.get(),
701  row_func_not_inlined};
702  std::shared_ptr<GpuCompilationContext> compilation_context;
703  try {
704  compilation_context = CodeGenerator::generateNativeGPUCode(
705  query_func, multifrag_query_func, live_funcs, co, gpu_target);
706  addCodeToCache(key, compilation_context, module, gpu_code_cache_);
707  } catch (CudaMgr_Namespace::CudaErrorException& cuda_error) {
708  if (cuda_error.getStatus() == CUDA_ERROR_OUT_OF_MEMORY) {
709  // Thrown if memory not able to be allocated on gpu
710  // Retry once after evicting portion of code cache
711  LOG(WARNING) << "Failed to allocate GPU memory for generated code. Evicting "
712  << g_fraction_code_cache_to_evict * 100.
713  << "% of GPU code cache and re-trying.";
714  gpu_code_cache_.evictFractionEntries(g_fraction_code_cache_to_evict);
715  compilation_context = CodeGenerator::generateNativeGPUCode(
716  query_func, multifrag_query_func, live_funcs, co, gpu_target);
717  addCodeToCache(key, compilation_context, module, gpu_code_cache_);
718  } else {
719  throw;
720  }
721  }
722  CHECK(compilation_context);
723  return compilation_context;
724 #else
725  return nullptr;
726 #endif
727 }
728 
729 std::string CodeGenerator::generatePTX(const std::string& cuda_llir,
730  llvm::TargetMachine* nvptx_target_machine,
731  llvm::LLVMContext& context) {
732  auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir, "", false);
733 
734  llvm::SMDiagnostic err;
735 
736  auto module = llvm::parseIR(mem_buff->getMemBufferRef(), err, context);
737  if (!module) {
738  LOG(FATAL) << err.getMessage().str();
739  }
740 
741  llvm::SmallString<256> code_str;
742  llvm::raw_svector_ostream formatted_os(code_str);
743  CHECK(nvptx_target_machine);
744  {
745  llvm::legacy::PassManager ptxgen_pm;
746  module->setDataLayout(nvptx_target_machine->createDataLayout());
747 
748 #if LLVM_VERSION_MAJOR >= 10
749  nvptx_target_machine->addPassesToEmitFile(
750  ptxgen_pm, formatted_os, nullptr, llvm::CGFT_AssemblyFile);
751 #elif LLVM_VERSION_MAJOR >= 7
752  nvptx_target_machine->addPassesToEmitFile(
753  ptxgen_pm, formatted_os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
754 #else
755  nvptx_target_machine->addPassesToEmitFile(
756  ptxgen_pm, formatted_os, llvm::TargetMachine::CGFT_AssemblyFile);
757 #endif
758  ptxgen_pm.run(*module);
759  }
760 
761  return code_str.str();
762 }
763 
764 std::unique_ptr<llvm::TargetMachine> CodeGenerator::initializeNVPTXBackend(
765  const CudaMgr_Namespace::NvidiaDeviceArch arch) {
766  llvm::InitializeAllTargets();
767  llvm::InitializeAllTargetMCs();
768  llvm::InitializeAllAsmPrinters();
769  std::string err;
770  auto target = llvm::TargetRegistry::lookupTarget("nvptx64", err);
771  if (!target) {
772  LOG(FATAL) << err;
773  }
774  return std::unique_ptr<llvm::TargetMachine>(
775  target->createTargetMachine("nvptx64-nvidia-cuda",
776  CudaMgr_Namespace::CudaMgr::deviceArchToSM(arch),
777  "",
778  llvm::TargetOptions(),
779  llvm::Reloc::Static));
780 }
781 
782 std::string Executor::generatePTX(const std::string& cuda_llir) const {
783  return CodeGenerator::generatePTX(
784  cuda_llir, nvptx_target_machine_.get(), cgen_state_->context_);
785 }
786 
787 void Executor::initializeNVPTXBackend() const {
788  if (nvptx_target_machine_) {
789  return;
790  }
791  const auto cuda_mgr = catalog_->getDataMgr().getCudaMgr();
792  LOG_IF(FATAL, cuda_mgr == nullptr) << "No CudaMgr instantiated, unable to check device "
793  "architecture or generate code for nvidia GPUs.";
794  const auto arch = cuda_mgr->getDeviceArch();
795  nvptx_target_machine_ = CodeGenerator::initializeNVPTXBackend(arch);
796 }
797 
798 // A small number of runtime functions don't get through CgenState::emitCall. List them
799 // explicitly here and always clone their implementation from the runtime module.
800 bool CodeGenerator::alwaysCloneRuntimeFunction(const llvm::Function* func) {
801  return func->getName() == "query_stub_hoisted_literals" ||
802  func->getName() == "multifrag_query_hoisted_literals" ||
803  func->getName() == "query_stub" || func->getName() == "multifrag_query" ||
804  func->getName() == "fixed_width_int_decode" ||
805  func->getName() == "fixed_width_unsigned_decode" ||
806  func->getName() == "diff_fixed_width_int_decode" ||
807  func->getName() == "fixed_width_double_decode" ||
808  func->getName() == "fixed_width_float_decode" ||
809  func->getName() == "fixed_width_small_date_decode" ||
810  func->getName() == "record_error_code";
811 }
812 
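// Loads and parses the precompiled runtime bitcode (QueryEngine/RuntimeFunctions.bc)
// that generated queries are linked against; the result initializes g_rt_module below.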
813 llvm::Module* read_template_module(llvm::LLVMContext& context) {
814  llvm::SMDiagnostic err;
815 
816  auto buffer_or_error = llvm::MemoryBuffer::getFile(mapd_root_abs_path() +
817  "/QueryEngine/RuntimeFunctions.bc");
818  CHECK(!buffer_or_error.getError());
819  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
820 
821  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
822  CHECK(!owner.takeError());
823  auto module = owner.get().release();
824  CHECK(module);
825 
826  return module;
827 }
828 
829 #ifdef ENABLE_GEOS
830 llvm::Module* read_geos_module(llvm::LLVMContext& context) {
831  llvm::SMDiagnostic err;
832 
833  auto buffer_or_error =
834  llvm::MemoryBuffer::getFile(mapd_root_abs_path() + "/QueryEngine/GeosRuntime.bc");
835  CHECK(!buffer_or_error.getError());
836  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
837 
838  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
839  CHECK(!owner.takeError());
840  auto module = owner.get().release();
841  CHECK(module);
842 
843  return module;
844 }
845 #endif
846 
847 namespace {
848 
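// Replaces the positional placeholder call named pos_fn_name in the query
// template with a call to the corresponding <name>_impl runtime function,
// forwarding the error_code argument when resuming is requested.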
849 void bind_pos_placeholders(const std::string& pos_fn_name,
850  const bool use_resume_param,
851  llvm::Function* query_func,
852  llvm::Module* module) {
853  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
854  ++it) {
855  if (!llvm::isa<llvm::CallInst>(*it)) {
856  continue;
857  }
858  auto& pos_call = llvm::cast<llvm::CallInst>(*it);
859  if (std::string(pos_call.getCalledFunction()->getName()) == pos_fn_name) {
860  if (use_resume_param) {
861  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
862  llvm::ReplaceInstWithInst(
863  &pos_call,
864  llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl"),
865  error_code_arg));
866  } else {
867  llvm::ReplaceInstWithInst(
868  &pos_call,
869  llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl")));
870  }
871  break;
872  }
873  }
874 }
875 
876 void set_row_func_argnames(llvm::Function* row_func,
877  const size_t in_col_count,
878  const size_t agg_col_count,
879  const bool hoist_literals) {
880  auto arg_it = row_func->arg_begin();
881 
882  if (agg_col_count) {
883  for (size_t i = 0; i < agg_col_count; ++i) {
884  arg_it->setName("out");
885  ++arg_it;
886  }
887  } else {
888  arg_it->setName("group_by_buff");
889  ++arg_it;
890  arg_it->setName("crt_matched");
891  ++arg_it;
892  arg_it->setName("total_matched");
893  ++arg_it;
894  arg_it->setName("old_total_matched");
895  ++arg_it;
896  arg_it->setName("max_matched");
897  ++arg_it;
898  }
899 
900  arg_it->setName("agg_init_val");
901  ++arg_it;
902 
903  arg_it->setName("pos");
904  ++arg_it;
905 
906  arg_it->setName("frag_row_off");
907  ++arg_it;
908 
909  arg_it->setName("num_rows_per_scan");
910  ++arg_it;
911 
912  if (hoist_literals) {
913  arg_it->setName("literals");
914  ++arg_it;
915  }
916 
917  for (size_t i = 0; i < in_col_count; ++i) {
918  arg_it->setName("col_buf" + std::to_string(i));
919  ++arg_it;
920  }
922  arg_it->setName("join_hash_tables");
923 }
924 
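// Builds the row_func signature: output or group-by buffers, aggregate init
// values, position, fragment row offsets, rows per scan, optional hoisted
// literals, one i8* per input column and the join hash tables pointer. The
// column-buffer head loads are hoisted into the query function's entry block
// so the row function receives plain pointers without double indirection.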
925 std::pair<llvm::Function*, std::vector<llvm::Value*>> create_row_function(
926  const size_t in_col_count,
927  const size_t agg_col_count,
928  const bool hoist_literals,
929  llvm::Function* query_func,
930  llvm::Module* module,
931  llvm::LLVMContext& context) {
932  std::vector<llvm::Type*> row_process_arg_types;
933 
934  if (agg_col_count) {
935  // output (aggregate) arguments
936  for (size_t i = 0; i < agg_col_count; ++i) {
937  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
938  }
939  } else {
940  // group by buffer
941  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
942  // current match count
943  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
944  // total match count passed from the caller
945  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
946  // old total match count returned to the caller
947  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
948  // max matched (total number of slots in the output buffer)
949  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
950  }
951 
952  // aggregate init values
953  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
954 
955  // position argument
956  row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));
957 
958  // fragment row offset argument
959  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
960 
961  // number of rows for each scan
962  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
963 
964  // literals buffer argument
965  if (hoist_literals) {
966  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
967  }
968 
969  // Generate the function signature and column head fetches s.t.
970  // double indirection isn't needed in the inner loop
971  auto& fetch_bb = query_func->front();
972  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
973  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
974  auto col_heads = generate_column_heads_load(
975  in_col_count, query_func->args().begin(), fetch_ir_builder, context);
976  CHECK_EQ(in_col_count, col_heads.size());
977 
978  // column buffer arguments
979  for (size_t i = 0; i < in_col_count; ++i) {
980  row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
981  }
982 
983  // join hash table argument
984  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
985 
986  // generate the function
987  auto ft =
988  llvm::FunctionType::get(get_int_type(32, context), row_process_arg_types, false);
989 
990  auto row_func =
991  llvm::Function::Create(ft, llvm::Function::ExternalLinkage, "row_func", module);
993  // set the row function argument names; for debugging purposes only
994  set_row_func_argnames(row_func, in_col_count, agg_col_count, hoist_literals);
995 
996  return std::make_pair(row_func, col_heads);
997 }
998 
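// Rewrites every call to the query stub inside the multi-fragment kernel into
// a direct call to the generated query function, preserving the arguments.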
999 void bind_query(llvm::Function* query_func,
1000  const std::string& query_fname,
1001  llvm::Function* multifrag_query_func,
1002  llvm::Module* module) {
1003  std::vector<llvm::CallInst*> query_stubs;
1004  for (auto it = llvm::inst_begin(multifrag_query_func),
1005  e = llvm::inst_end(multifrag_query_func);
1006  it != e;
1007  ++it) {
1008  if (!llvm::isa<llvm::CallInst>(*it)) {
1009  continue;
1010  }
1011  auto& query_call = llvm::cast<llvm::CallInst>(*it);
1012  if (std::string(query_call.getCalledFunction()->getName()) == query_fname) {
1013  query_stubs.push_back(&query_call);
1014  }
1015  }
1016  for (auto& S : query_stubs) {
1017  std::vector<llvm::Value*> args;
1018  for (size_t i = 0; i < S->getNumArgOperands(); ++i) {
1019  args.push_back(S->getArgOperand(i));
1020  }
1021  llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args, ""));
1022  }
1023 }
1024 
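// Maps each target expression to the runtime aggregate function name(s) it
// needs (e.g. AVG expands into a sum and a count); projected/SAMPLE varlen and
// geometry targets get additional agg_id slots for their extra physical columns.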
1025 std::vector<std::string> get_agg_fnames(const std::vector<Analyzer::Expr*>& target_exprs,
1026  const bool is_group_by) {
1027  std::vector<std::string> result;
1028  for (size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
1029  ++target_idx, ++agg_col_idx) {
1030  const auto target_expr = target_exprs[target_idx];
1031  CHECK(target_expr);
1032  const auto target_type_info = target_expr->get_type_info();
1033  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
1034  const bool is_varlen =
1035  (target_type_info.is_string() &&
1036  target_type_info.get_compression() == kENCODING_NONE) ||
1037  target_type_info.is_array(); // TODO: should it use is_varlen_array() ?
1038  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
1039  result.emplace_back(target_type_info.is_fp() ? "agg_id_double" : "agg_id");
1040  if (is_varlen) {
1041  result.emplace_back("agg_id");
1042  }
1043  if (target_type_info.is_geometry()) {
1044  result.emplace_back("agg_id");
1045  for (auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
1046  result.emplace_back("agg_id");
1047  }
1048  }
1049  continue;
1050  }
1051  const auto agg_type = agg_expr->get_aggtype();
1052  const auto& agg_type_info =
1053  agg_type != kCOUNT ? agg_expr->get_arg()->get_type_info() : target_type_info;
1054  switch (agg_type) {
1055  case kAVG: {
1056  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1057  !agg_type_info.is_fp()) {
1058  throw std::runtime_error("AVG is only valid on integer and floating point");
1059  }
1060  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1061  ? "agg_sum"
1062  : "agg_sum_double");
1063  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1064  ? "agg_count"
1065  : "agg_count_double");
1066  break;
1067  }
1068  case kMIN: {
1069  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1070  agg_type_info.is_geometry()) {
1071  throw std::runtime_error(
1072  "MIN on strings, arrays or geospatial types not supported yet");
1073  }
1074  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1075  ? "agg_min"
1076  : "agg_min_double");
1077  break;
1078  }
1079  case kMAX: {
1080  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1081  agg_type_info.is_geometry()) {
1082  throw std::runtime_error(
1083  "MAX on strings, arrays or geospatial types not supported yet");
1084  }
1085  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1086  ? "agg_max"
1087  : "agg_max_double");
1088  break;
1089  }
1090  case kSUM: {
1091  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1092  !agg_type_info.is_fp()) {
1093  throw std::runtime_error("SUM is only valid on integer and floating point");
1094  }
1095  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1096  ? "agg_sum"
1097  : "agg_sum_double");
1098  break;
1099  }
1100  case kCOUNT:
1101  result.emplace_back(agg_expr->get_is_distinct() ? "agg_count_distinct"
1102  : "agg_count");
1103  break;
1104  case kSINGLE_VALUE: {
1105  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1106  break;
1107  }
1108  case kSAMPLE: {
1109  // Note that varlen SAMPLE arguments are handled separately above
1110  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1111  break;
1112  }
1113  case kAPPROX_COUNT_DISTINCT:
1114  result.emplace_back("agg_approximate_count_distinct");
1115  break;
1116  default:
1117  CHECK(false);
1118  }
1119  }
1120  return result;
1121 }
1122 
1123 } // namespace
1124 
1125 std::unique_ptr<llvm::Module> g_rt_module(read_template_module(getGlobalLLVMContext()));
1126 
1127 #ifdef ENABLE_GEOS
1128 std::unique_ptr<llvm::Module> g_rt_geos_module(read_geos_module(getGlobalLLVMContext()));
1129 #endif
1130 
1131 bool is_udf_module_present(bool cpu_only) {
1132  return (cpu_only || udf_gpu_module != nullptr) && (udf_cpu_module != nullptr);
1133 }
1134 
1135 bool is_rt_udf_module_present(bool cpu_only) {
1136  return (cpu_only || rt_udf_gpu_module != nullptr) && (rt_udf_cpu_module != nullptr);
1137 }
1138 
1139 void throw_parseIR_error(const llvm::SMDiagnostic& parse_error, std::string src = "") {
1140  std::string excname = "LLVM IR ParseError: ";
1141  llvm::raw_string_ostream ss(excname);
1142  parse_error.print(src.c_str(), ss, false, false);
1143  throw std::runtime_error(ss.str());
1144 }
1145 
1146 void read_udf_gpu_module(const std::string& udf_ir_filename) {
1147  llvm::SMDiagnostic parse_error;
1148 
1149  llvm::StringRef file_name_arg(udf_ir_filename);
1150 
1151  udf_gpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
1152  if (!udf_gpu_module) {
1153  throw_parseIR_error(parse_error, udf_ir_filename);
1154  }
1155 }
1156 
1157 void read_udf_cpu_module(const std::string& udf_ir_filename) {
1158  llvm::SMDiagnostic parse_error;
1159 
1160  llvm::StringRef file_name_arg(udf_ir_filename);
1161 
1162  udf_cpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
1163  if (!udf_cpu_module) {
1164  throw_parseIR_error(parse_error, udf_ir_filename);
1165  }
1166 }
1167 
1168 void read_rt_udf_gpu_module(const std::string& udf_ir_string) {
1169  llvm::SMDiagnostic parse_error;
1170 
1171  auto buf =
1172  std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for GPU");
1173 
1174  rt_udf_gpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
1175  if (!rt_udf_gpu_module) {
1176  throw_parseIR_error(parse_error);
1177  }
1178 }
1179 
1180 void read_rt_udf_cpu_module(const std::string& udf_ir_string) {
1181  llvm::SMDiagnostic parse_error;
1182 
1183  auto buf =
1184  std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for CPU");
1185 
1186  rt_udf_cpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
1187  if (!rt_udf_cpu_module) {
1188  throw_parseIR_error(parse_error);
1189  }
1190 }
1192 std::unordered_set<llvm::Function*> CodeGenerator::markDeadRuntimeFuncs(
1193  llvm::Module& module,
1194  const std::vector<llvm::Function*>& roots,
1195  const std::vector<llvm::Function*>& leaves) {
1196  std::unordered_set<llvm::Function*> live_funcs;
1197  live_funcs.insert(roots.begin(), roots.end());
1198  live_funcs.insert(leaves.begin(), leaves.end());
1199 
1200  if (auto F = module.getFunction("init_shared_mem_nop")) {
1201  live_funcs.insert(F);
1202  }
1203  if (auto F = module.getFunction("write_back_nop")) {
1204  live_funcs.insert(F);
1205  }
1206 
1207  for (const llvm::Function* F : roots) {
1208  for (const llvm::BasicBlock& BB : *F) {
1209  for (const llvm::Instruction& I : BB) {
1210  if (const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
1211  live_funcs.insert(CI->getCalledFunction());
1212  }
1213  }
1214  }
1215  }
1216 
1217  for (llvm::Function& F : module) {
1218  if (!live_funcs.count(&F) && !F.isDeclaration()) {
1219  F.setLinkage(llvm::GlobalValue::InternalLinkage);
1220  }
1221  }
1222 
1223  return live_funcs;
1224 }
1225 
1226 namespace {
1227 // searches for a particular variable within a specific basic block (or all if bb_name is
1228 // empty)
1229 template <typename InstType>
1230 llvm::Value* find_variable_in_basic_block(llvm::Function* func,
1231  std::string bb_name,
1232  std::string variable_name) {
1233  llvm::Value* result = nullptr;
1234  if (func == nullptr || variable_name.empty()) {
1235  return result;
1236  }
1237  bool is_found = false;
1238  for (auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
1239  if (!bb_name.empty() && bb_it->getName() != bb_name) {
1240  continue;
1241  }
1242  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
1243  if (llvm::isa<InstType>(*inst_it)) {
1244  if (inst_it->getName() == variable_name) {
1245  result = &*inst_it;
1246  is_found = true;
1247  break;
1248  }
1249  }
1250  }
1251  }
1252  return result;
1253 }
1254 }; // namespace
1255 
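// Splits each basic block after its row_process call and wires in the error
// handling: an optional dynamic watchdog or runtime-interrupt check runs
// roughly every 64 rows (or iterations on GPU), and the resulting error value
// is recorded via record_error_code.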
1256 void Executor::createErrorCheckControlFlow(llvm::Function* query_func,
1257  bool run_with_dynamic_watchdog,
1258  bool run_with_allowing_runtime_interrupt,
1259  ExecutorDeviceType device_type) {
1260  // check whether the row processing was successful; currently, it can
1261  // fail by running out of group by buffer slots
1262 
1263  if (run_with_dynamic_watchdog && run_with_allowing_runtime_interrupt) {
1264  // when both the dynamic watchdog and runtime interrupt are turned on,
1265  // we use the dynamic watchdog
1266  run_with_allowing_runtime_interrupt = false;
1267  }
1268 
1269  llvm::Value* row_count = nullptr;
1270  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1271  device_type == ExecutorDeviceType::GPU) {
1272  row_count =
1273  find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
1274  }
1275 
1276  bool done_splitting = false;
1277  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1278  ++bb_it) {
1279  llvm::Value* pos = nullptr;
1280  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1281  if ((run_with_dynamic_watchdog || run_with_allowing_runtime_interrupt) &&
1282  llvm::isa<llvm::PHINode>(*inst_it)) {
1283  if (inst_it->getName() == "pos") {
1284  pos = &*inst_it;
1285  }
1286  continue;
1287  }
1288  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1289  continue;
1290  }
1291  auto& filter_call = llvm::cast<llvm::CallInst>(*inst_it);
1292  if (std::string(filter_call.getCalledFunction()->getName()) == "row_process") {
1293  auto next_inst_it = inst_it;
1294  ++next_inst_it;
1295  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1296  auto& br_instr = bb_it->back();
1297  llvm::IRBuilder<> ir_builder(&br_instr);
1298  llvm::Value* err_lv = &*inst_it;
1299  if (run_with_dynamic_watchdog) {
1300  CHECK(pos);
1301  llvm::Value* call_watchdog_lv = nullptr;
1302  if (device_type == ExecutorDeviceType::GPU) {
1303  // In order to make sure all threads within a block see the same barrier,
1304  // only those blocks in which no thread has crossed the critical edge
1305  // will go through the dynamic watchdog computation
1306  CHECK(row_count);
1307  auto crit_edge_rem =
1308  (blockSize() & (blockSize() - 1))
1309  ? ir_builder.CreateSRem(
1310  row_count,
1311  cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1312  : ir_builder.CreateAnd(
1313  row_count,
1314  cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1315  auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1316  crit_edge_threshold->setName("crit_edge_threshold");
1317 
1318  // only those threads where pos < crit_edge_threshold go through dynamic
1319  // watchdog call
1320  call_watchdog_lv =
1321  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
1322  } else {
1323  // CPU path: run watchdog for every 64th row
1324  auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1325  call_watchdog_lv = ir_builder.CreateICmp(
1326  llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1327  }
1328  CHECK(call_watchdog_lv);
1329  auto error_check_bb = bb_it->splitBasicBlock(
1330  llvm::BasicBlock::iterator(br_instr), ".error_check");
1331  auto& watchdog_br_instr = bb_it->back();
1332 
1333  auto watchdog_check_bb = llvm::BasicBlock::Create(
1334  cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
1335  llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1336  auto detected_timeout = watchdog_ir_builder.CreateCall(
1337  cgen_state_->module_->getFunction("dynamic_watchdog"), {});
1338  auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1339  detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
1340  watchdog_ir_builder.CreateBr(error_check_bb);
1341 
1342  llvm::ReplaceInstWithInst(
1343  &watchdog_br_instr,
1344  llvm::BranchInst::Create(
1345  watchdog_check_bb, error_check_bb, call_watchdog_lv));
1346  ir_builder.SetInsertPoint(&br_instr);
1347  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1348 
1349  unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1350  unified_err_lv->addIncoming(err_lv, &*bb_it);
1351  err_lv = unified_err_lv;
1352  } else if (run_with_allowing_runtime_interrupt) {
1353  CHECK(pos);
1354  llvm::Value* call_check_interrupt_lv = nullptr;
1355  if (device_type == ExecutorDeviceType::GPU) {
1356  // approximate how many times the %pos variable
1357  // has been incremented, i.e., the number of iterations
1358  int32_t num_shift_by_gridDim = getExpOfTwo(gridSize());
1359  int32_t num_shift_by_blockDim = getExpOfTwo(blockSize());
1360  if (!isPowOfTwo(gridSize())) {
1361  num_shift_by_gridDim++;
1362  }
1363  if (!isPowOfTwo(blockSize())) {
1364  num_shift_by_blockDim++;
1365  }
1366  int total_num_shift = num_shift_by_gridDim + num_shift_by_blockDim;
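  // pos advances by gridDim * blockDim per loop iteration on the GPU, so shifting it
  // right by roughly ceil(log2(gridSize())) + ceil(log2(blockSize())) bits
  // approximates the current iteration number.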
1367  // check the interrupt flag for every 64th iteration
1368  llvm::Value* pos_shifted_per_iteration =
1369  ir_builder.CreateLShr(pos, cgen_state_->llInt(total_num_shift));
1370  auto interrupt_predicate =
1371  ir_builder.CreateAnd(pos_shifted_per_iteration, uint64_t(0x3f));
1372  call_check_interrupt_lv =
1373  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1374  interrupt_predicate,
1375  cgen_state_->llInt(int64_t(0LL)));
1376  } else {
1377  // CPU path: run interrupt checker for every 64th row
1378  auto interrupt_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1379  call_check_interrupt_lv =
1380  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1381  interrupt_predicate,
1382  cgen_state_->llInt(int64_t(0LL)));
1383  }
1384  CHECK(call_check_interrupt_lv);
1385  auto error_check_bb = bb_it->splitBasicBlock(
1386  llvm::BasicBlock::iterator(br_instr), ".error_check");
1387  auto& check_interrupt_br_instr = bb_it->back();
1388 
1389  auto interrupt_check_bb = llvm::BasicBlock::Create(
1390  cgen_state_->context_, ".interrupt_check", query_func, error_check_bb);
1391  llvm::IRBuilder<> interrupt_checker_ir_builder(interrupt_check_bb);
1392  auto detected_interrupt = interrupt_checker_ir_builder.CreateCall(
1393  cgen_state_->module_->getFunction("check_interrupt"), {});
1394  auto interrupt_err_lv = interrupt_checker_ir_builder.CreateSelect(
1395  detected_interrupt, cgen_state_->llInt(Executor::ERR_INTERRUPTED), err_lv);
1396  interrupt_checker_ir_builder.CreateBr(error_check_bb);
1397 
1398  llvm::ReplaceInstWithInst(
1399  &check_interrupt_br_instr,
1400  llvm::BranchInst::Create(
1401  interrupt_check_bb, error_check_bb, call_check_interrupt_lv));
1402  ir_builder.SetInsertPoint(&br_instr);
1403  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1404 
1405  unified_err_lv->addIncoming(interrupt_err_lv, interrupt_check_bb);
1406  unified_err_lv->addIncoming(err_lv, &*bb_it);
1407  err_lv = unified_err_lv;
1408  }
1409  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1410  err_lv =
1411  ir_builder.CreateCall(cgen_state_->module_->getFunction("record_error_code"),
1412  std::vector<llvm::Value*>{err_lv, error_code_arg});
1413  if (device_type == ExecutorDeviceType::GPU) {
1414  // let kernel execution finish as expected regardless of the observed error,
1415  // unless the error comes from the dynamic watchdog, in which case all threads
1416  // within that block return together.
1417  if (run_with_allowing_runtime_interrupt) {
1418  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1419  err_lv,
1420  cgen_state_->llInt(Executor::ERR_INTERRUPTED));
1421  } else {
1422  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1423  err_lv,
1424  cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
1425  }
1426 
1427  } else {
1428  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
1429  err_lv,
1430  cgen_state_->llInt(static_cast<int32_t>(0)));
1431  }
1432  auto error_bb = llvm::BasicBlock::Create(
1433  cgen_state_->context_, ".error_exit", query_func, new_bb);
1434  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
1435  llvm::ReplaceInstWithInst(&br_instr,
1436  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
1437  done_splitting = true;
1438  break;
1439  }
1440  }
1441  }
1442  CHECK(done_splitting);
1443 }
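// Rough sketch of the control flow produced above for the block containing the
// row_process() call (the watchdog/interrupt blocks are only emitted when requested):
//
//   <filter block>: err = row_process(...)
//       |--> .watchdog_check / .interrupt_check  (may override err via a select)
//       v
//   .error_check:   err = record_error_code(err, error_code)
//                   br <abort condition on err> ? .error_exit : <continuation (new_bb)>
//   .error_exit:    ret
//
// On the GPU only ERR_OUT_OF_TIME (or ERR_INTERRUPTED) aborts early; on the CPU any
// non-zero error code does.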
1444 
1445 std::vector<llvm::Value*> Executor::inlineHoistedLiterals() {
1446  std::vector<llvm::Value*> hoisted_literals;
1447 
1448  // row_func_ uses literals whose definitions have been hoisted up to query_func_;
1449  // extend the row_func_ signature with extra arguments to pass these literal values.
1450  std::vector<llvm::Type*> row_process_arg_types;
1451 
1452  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1453  E = cgen_state_->row_func_->arg_end();
1454  I != E;
1455  ++I) {
1456  row_process_arg_types.push_back(I->getType());
1457  }
1458 
1459  for (auto& element : cgen_state_->query_func_literal_loads_) {
1460  for (auto value : element.second) {
1461  row_process_arg_types.push_back(value->getType());
1462  }
1463  }
1464 
1465  auto ft = llvm::FunctionType::get(
1466  get_int_type(32, cgen_state_->context_), row_process_arg_types, false);
1467  auto row_func_with_hoisted_literals =
1468  llvm::Function::Create(ft,
1469  llvm::Function::ExternalLinkage,
1470  "row_func_hoisted_literals",
1471  cgen_state_->row_func_->getParent());
1472 
1473  // make sure it's inlined; we don't want register spills in the inner loop
1474  mark_function_always_inline(row_func_with_hoisted_literals);
1475 
1476  auto arg_it = row_func_with_hoisted_literals->arg_begin();
1477  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1478  E = cgen_state_->row_func_->arg_end();
1479  I != E;
1480  ++I) {
1481  if (I->hasName()) {
1482  arg_it->setName(I->getName());
1483  }
1484  ++arg_it;
1485  }
1486 
1487  std::unordered_map<int, std::vector<llvm::Value*>>
1488  query_func_literal_loads_function_arguments;
1489 
1490  for (auto& element : cgen_state_->query_func_literal_loads_) {
1491  std::vector<llvm::Value*> argument_values;
1492 
1493  for (auto value : element.second) {
1494  hoisted_literals.push_back(value);
1495  argument_values.push_back(&*arg_it);
1496  if (value->hasName()) {
1497  arg_it->setName("arg_" + value->getName());
1498  }
1499  ++arg_it;
1500  }
1501 
1502  query_func_literal_loads_function_arguments[element.first] = argument_values;
1503  }
1504 
1505  // copy the row_func function body over
1506  // see
1507  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
1508  row_func_with_hoisted_literals->getBasicBlockList().splice(
1509  row_func_with_hoisted_literals->begin(),
1510  cgen_state_->row_func_->getBasicBlockList());
1511 
1512  // also replace row_func arguments with the arguments from row_func_hoisted_literals
1513  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1514  E = cgen_state_->row_func_->arg_end(),
1515  I2 = row_func_with_hoisted_literals->arg_begin();
1516  I != E;
1517  ++I) {
1518  I->replaceAllUsesWith(&*I2);
1519  I2->takeName(&*I);
1520  ++I2;
1521  }
1522 
1523  cgen_state_->row_func_ = row_func_with_hoisted_literals;
1524 
1525  // and finally replace literal placeholders
1526  std::vector<llvm::Instruction*> placeholders;
1527  std::string prefix("__placeholder__literal_");
1528  for (auto it = llvm::inst_begin(row_func_with_hoisted_literals),
1529  e = llvm::inst_end(row_func_with_hoisted_literals);
1530  it != e;
1531  ++it) {
1532  if (it->hasName() && it->getName().startswith(prefix)) {
1533  auto offset_and_index_entry =
1534  cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
1535  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
1536 
1537  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
1538  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
1539 
1540  it->replaceAllUsesWith(
1541  query_func_literal_loads_function_arguments[lit_off][lit_idx]);
1542  placeholders.push_back(&*it);
1543  }
1544  }
1545  for (auto placeholder : placeholders) {
1546  placeholder->removeFromParent();
1547  }
1548 
1549  return hoisted_literals;
1550 }
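// Roughly, the transformation above turns
//   row_func(col_args..., agg_args..., pos, ...)
// into
//   row_func_hoisted_literals(col_args..., agg_args..., pos, ..., lit_0, lit_1, ...),
// moves the original body over, and replaces every "__placeholder__literal_*"
// instruction with the matching new literal argument; the returned hoisted_literals
// vector is what compileWorkUnit() later appends to the row_process call site.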
1551 
1552 namespace {
1553 
1554 size_t get_shared_memory_size(const bool shared_mem_used,
1555  const QueryMemoryDescriptor* query_mem_desc_ptr) {
1556  return shared_mem_used
1557  ? (query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount())
1558  : 0;
1559 }
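// For example, a row size of 8 bytes and an entry count of 2048 requests
// 8 * 2048 = 16 KiB of shared memory per block when the shared memory path is taken.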
1560 
1561 bool is_gpu_shared_mem_supported(const QueryMemoryDescriptor* query_mem_desc_ptr,
1562  const RelAlgExecutionUnit& ra_exe_unit,
1563  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
1564  const ExecutorDeviceType device_type,
1565  const unsigned gpu_blocksize,
1566  const unsigned num_blocks_per_mp) {
1567  if (device_type == ExecutorDeviceType::CPU) {
1568  return false;
1569  }
1570  if (query_mem_desc_ptr->didOutputColumnar()) {
1571  return false;
1572  }
1573  CHECK(query_mem_desc_ptr);
1574  CHECK(cuda_mgr);
1575  /*
1576  * We only use the shared memory strategy if the GPU hardware provides native shared
1577  * memory atomics support. From CUDA Toolkit documentation:
1578  * https://docs.nvidia.com/cuda/pascal-tuning-guide/index.html#atomic-ops "Like
1579  * Maxwell, Pascal [and Volta] provides native shared memory atomic operations
1580  * for 32-bit integer arithmetic, along with native 32 or 64-bit compare-and-swap
1581  * (CAS)."
1582  *
1583  **/
1584  if (!cuda_mgr->isArchMaxwellOrLaterForAll()) {
1585  return false;
1586  }
1587 
1588  if (query_mem_desc_ptr->getQueryDescriptionType() ==
1589  QueryDescriptionType::NonGroupedAggregate &&
1590  g_enable_smem_non_grouped_agg &&
1591  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty()) {
1592  // TODO: relax this, if necessary
1593  if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
1594  return false;
1595  }
1596  // skip shared memory usage when dealing with 1) variable length targets, 2)
1597  // aggregates other than COUNT
1598  const auto target_infos =
1599  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
1600  std::unordered_set<SQLAgg> supported_aggs{kCOUNT};
1601  if (std::find_if(target_infos.begin(),
1602  target_infos.end(),
1603  [&supported_aggs](const TargetInfo& ti) {
1604  if (ti.sql_type.is_varlen() ||
1605  !supported_aggs.count(ti.agg_kind)) {
1606  return true;
1607  } else {
1608  return false;
1609  }
1610  }) == target_infos.end()) {
1611  return true;
1612  }
1613  }
1614  if (query_mem_desc_ptr->getQueryDescriptionType() ==
1615  QueryDescriptionType::GroupByPerfectHash &&
1616  g_enable_smem_group_by) {
1625  if (gpu_blocksize < query_mem_desc_ptr->getEntryCount()) {
1626  return false;
1627  }
1628 
1629  // Fundamentally, we should use shared memory whenever the output buffer
1630  // is small enough so that we can fit it in the shared memory and yet expect
1631  // good occupancy.
1632  // For now, we allow keyless, row-wise layout, and only for perfect hash
1633  // group by operations.
1634  if (query_mem_desc_ptr->hasKeylessHash() &&
1635  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty() &&
1636  !query_mem_desc_ptr->useStreamingTopN()) {
1637  const size_t shared_memory_threshold_bytes = std::min(
1638  g_gpu_smem_threshold == 0 ? SIZE_MAX : g_gpu_smem_threshold,
1639  cuda_mgr->getMinSharedMemoryPerBlockForAllDevices() / num_blocks_per_mp);
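  // g_gpu_smem_threshold == 0 means "no explicit cap" (SIZE_MAX); otherwise the budget
  // is the smaller of the configured threshold and the per-block shared memory divided
  // across the blocks resident on a multiprocessor. For instance, with a (hypothetical)
  // 48 KiB per block and num_blocks_per_mp == 2, the budget would be 24 KiB.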
1640  const auto output_buffer_size =
1641  query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount();
1642  if (output_buffer_size > shared_memory_threshold_bytes) {
1643  return false;
1644  }
1645 
1646  // skip shared memory usage when dealing with 1) variable length targets, 2)
1647  // aggregates other than the basic ones (COUNT, SUM, MIN, MAX, AVG)
1648  // TODO: relax this if necessary
1649  const auto target_infos =
1650  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
1651  std::unordered_set<SQLAgg> supported_aggs{kCOUNT};
1652  if (g_enable_smem_grouped_non_count_agg) {
1653  supported_aggs = {kCOUNT, kMIN, kMAX, kSUM, kAVG};
1654  }
1655  if (std::find_if(target_infos.begin(),
1656  target_infos.end(),
1657  [&supported_aggs](const TargetInfo& ti) {
1658  if (ti.sql_type.is_varlen() ||
1659  !supported_aggs.count(ti.agg_kind)) {
1660  return true;
1661  } else {
1662  return false;
1663  }
1664  }) == target_infos.end()) {
1665  return true;
1666  }
1667  }
1668  }
1669  return false;
1670 }
1671 
1672 } // namespace
1673 
1674 std::tuple<Executor::CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
1675 Executor::compileWorkUnit(const std::vector<InputTableInfo>& query_infos,
1676  const RelAlgExecutionUnit& ra_exe_unit,
1677  const CompilationOptions& co,
1678  const ExecutionOptions& eo,
1679  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
1680  const bool allow_lazy_fetch,
1681  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
1682  const size_t max_groups_buffer_entry_guess,
1683  const int8_t crt_min_byte_width,
1684  const bool has_cardinality_estimation,
1685  ColumnCacheMap& column_cache,
1686  RenderInfo* render_info) {
1687  auto timer = DEBUG_TIMER(__func__);
1688  nukeOldState(allow_lazy_fetch, query_infos, &ra_exe_unit);
1689 
1690  GroupByAndAggregate group_by_and_aggregate(
1691  this, co.device_type, ra_exe_unit, query_infos, row_set_mem_owner);
1692  auto query_mem_desc =
1693  group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
1694  max_groups_buffer_entry_guess,
1695  crt_min_byte_width,
1696  render_info,
1697  eo.output_columnar_hint);
1698 
1699  if (query_mem_desc->getQueryDescriptionType() ==
1700  QueryDescriptionType::GroupByBaselineHash &&
1701  !has_cardinality_estimation &&
1702  (!render_info || !render_info->isPotentialInSituRender()) && !eo.just_explain) {
1703  throw CardinalityEstimationRequired();
1704  }
1705 
1706  const bool output_columnar = query_mem_desc->didOutputColumnar();
1707  const bool gpu_shared_mem_optimization =
1708  is_gpu_shared_mem_supported(query_mem_desc.get(),
1709  ra_exe_unit,
1710  cuda_mgr,
1711  co.device_type,
1712  cuda_mgr ? this->blockSize() : 1,
1713  cuda_mgr ? this->numBlocksPerMP() : 1);
1714  if (gpu_shared_mem_optimization) {
1715  // disable interleaved bins optimization on the GPU
1716  query_mem_desc->setHasInterleavedBinsOnGpu(false);
1717  LOG(DEBUG1) << "GPU shared memory is used for the " +
1718  query_mem_desc->queryDescTypeToString() + " query(" +
1719  std::to_string(get_shared_memory_size(gpu_shared_mem_optimization,
1720  query_mem_desc.get())) +
1721  " out of " + std::to_string(g_gpu_smem_threshold) + " bytes).";
1722  }
1723 
1724  const GpuSharedMemoryContext gpu_smem_context(
1725  get_shared_memory_size(gpu_shared_mem_optimization, query_mem_desc.get()));
1726 
1727  if (co.device_type == ExecutorDeviceType::GPU) {
1728  const size_t num_count_distinct_descs =
1729  query_mem_desc->getCountDistinctDescriptorsSize();
1730  for (size_t i = 0; i < num_count_distinct_descs; i++) {
1731  const auto& count_distinct_descriptor =
1732  query_mem_desc->getCountDistinctDescriptor(i);
1733  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::StdSet ||
1734  (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
1735  !co.hoist_literals)) {
1736  throw QueryMustRunOnCpu();
1737  }
1738  }
1739  }
1740 
1741  // Read the module template and target either CPU or GPU
1742  // by binding the stream position functions to the right implementation:
1743  // stride access for GPU, contiguous for CPU
1744  auto rt_module_copy = llvm::CloneModule(
1745 #if LLVM_VERSION_MAJOR >= 7
1746  *g_rt_module.get(),
1747 #else
1748  g_rt_module.get(),
1749 #endif
1750  cgen_state_->vmap_,
1751  [](const llvm::GlobalValue* gv) {
1752  auto func = llvm::dyn_cast<llvm::Function>(gv);
1753  if (!func) {
1754  return true;
1755  }
1756  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
1757  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage ||
1758  CodeGenerator::alwaysCloneRuntimeFunction(func));
1759  });
1760 
1761  if (co.device_type == ExecutorDeviceType::CPU) {
1762  if (is_udf_module_present(true)) {
1763  CodeGenerator::link_udf_module(udf_cpu_module, *rt_module_copy, cgen_state_.get());
1764  }
1765  if (is_rt_udf_module_present(true)) {
1766  CodeGenerator::link_udf_module(
1767  rt_udf_cpu_module, *rt_module_copy, cgen_state_.get());
1768  }
1769  } else {
1770  rt_module_copy->setDataLayout(get_gpu_data_layout());
1771  rt_module_copy->setTargetTriple(get_gpu_target_triple_string());
1772 
1773  if (is_udf_module_present()) {
1774  llvm::Triple gpu_triple(udf_gpu_module->getTargetTriple());
1775 
1776  if (!gpu_triple.isNVPTX()) {
1777  throw QueryMustRunOnCpu();
1778  }
1779 
1780  CodeGenerator::link_udf_module(udf_gpu_module, *rt_module_copy, cgen_state_.get());
1781  }
1782  if (is_rt_udf_module_present()) {
1783  CodeGenerator::link_udf_module(
1784  rt_udf_gpu_module, *rt_module_copy, cgen_state_.get());
1785  }
1786  }
1787 
1788  cgen_state_->module_ = rt_module_copy.release();
1789 
1790  auto agg_fnames =
1791  get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
1792 
1793  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();
1794 
1795  const bool is_group_by{query_mem_desc->isGroupBy()};
1796  auto query_func = is_group_by ? query_group_by_template(cgen_state_->module_,
1797  co.hoist_literals,
1798  *query_mem_desc,
1799  co.device_type,
1800  ra_exe_unit.scan_limit,
1801  gpu_smem_context)
1802  : query_template(cgen_state_->module_,
1803  agg_slot_count,
1804  co.hoist_literals,
1805  !!ra_exe_unit.estimator,
1806  gpu_smem_context);
1807  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
1808  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
1809  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);
1810 
1811  cgen_state_->query_func_ = query_func;
1812  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
1813  &query_func->getEntryBlock().front());
1814 
1815  std::vector<llvm::Value*> col_heads;
1816  std::tie(cgen_state_->row_func_, col_heads) =
1817  create_row_function(ra_exe_unit.input_col_descs.size(),
1818  is_group_by ? 0 : agg_slot_count,
1819  co.hoist_literals,
1820  query_func,
1821  cgen_state_->module_,
1822  cgen_state_->context_);
1823  CHECK(cgen_state_->row_func_);
1824  // make sure it's inlined; we don't want register spills in the inner loop
1825  mark_function_always_inline(cgen_state_->row_func_);
1826  auto bb =
1827  llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
1828  cgen_state_->ir_builder_.SetInsertPoint(bb);
1829  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
1830  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
1831  const auto join_loops =
1832  buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
1833  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
1834  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
1835  if (is_not_deleted_bb) {
1836  bb = is_not_deleted_bb;
1837  }
1838  if (!join_loops.empty()) {
1839  codegenJoinLoops(join_loops,
1840  body_execution_unit,
1841  group_by_and_aggregate,
1842  query_func,
1843  bb,
1844  *(query_mem_desc.get()),
1845  co,
1846  eo);
1847  } else {
1848  const bool can_return_error = compileBody(
1849  ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co, gpu_smem_context);
1850  if (can_return_error || cgen_state_->needs_error_check_ || eo.with_dynamic_watchdog ||
1851  eo.allow_runtime_query_interrupt) {
1852  createErrorCheckControlFlow(query_func,
1853  eo.with_dynamic_watchdog,
1854  eo.allow_runtime_query_interrupt,
1855  co.device_type);
1856  }
1857  }
1858  std::vector<llvm::Value*> hoisted_literals;
1859 
1860  if (co.hoist_literals) {
1861  VLOG(1) << "number of hoisted literals: "
1862  << cgen_state_->query_func_literal_loads_.size()
1863  << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
1864  << " bytes";
1865  }
1866 
1867  if (co.hoist_literals && !cgen_state_->query_func_literal_loads_.empty()) {
1868  // we have some hoisted literals...
1869  hoisted_literals = inlineHoistedLiterals();
1870  }
1871  // iterate through all the instructions in the query template function and
1872  // replace the call to the filter placeholder with a call to the actual filter
1873  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
1874  ++it) {
1875  if (!llvm::isa<llvm::CallInst>(*it)) {
1876  continue;
1877  }
1878  auto& filter_call = llvm::cast<llvm::CallInst>(*it);
1879  if (std::string(filter_call.getCalledFunction()->getName()) == "row_process") {
1880  std::vector<llvm::Value*> args;
1881  for (size_t i = 0; i < filter_call.getNumArgOperands(); ++i) {
1882  args.push_back(filter_call.getArgOperand(i));
1883  }
1884  args.insert(args.end(), col_heads.begin(), col_heads.end());
1885  args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
1886  // push hoisted literals arguments, if any
1887  args.insert(args.end(), hoisted_literals.begin(), hoisted_literals.end());
1888 
1889  llvm::ReplaceInstWithInst(&filter_call,
1890  llvm::CallInst::Create(cgen_state_->row_func_, args, ""));
1891  break;
1892  }
1893  }
1894 
1895  plan_state_->init_agg_vals_ =
1896  init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);
1897 
1898  /*
1899  * If we have decided to use GPU shared memory (the decision is not made here), then
1900  * we generate the extra components it needs (buffer initialization and GPU reduction
1901  * from shared memory to global memory) and inject those functions into the already
1902  * generated query_func, replacing the two placeholders write_back_nop and
1903  * init_smem_nop. The rest of the code remains as before (row_func, etc.).
1904  */
1905  if (gpu_smem_context.isSharedMemoryUsed()) {
1906  if (query_mem_desc->getQueryDescriptionType() ==
1907  QueryDescriptionType::GroupByPerfectHash) {
1908  GpuSharedMemCodeBuilder gpu_smem_code(
1909  cgen_state_->module_,
1910  cgen_state_->context_,
1911  *query_mem_desc,
1912  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc),
1913  plan_state_->init_agg_vals_);
1914  gpu_smem_code.codegen();
1915  gpu_smem_code.injectFunctionsInto(query_func);
1916 
1917  // helper functions are used for caching purposes later
1918  cgen_state_->helper_functions_.push_back(gpu_smem_code.getReductionFunction());
1919  cgen_state_->helper_functions_.push_back(gpu_smem_code.getInitFunction());
1920  LOG(IR) << gpu_smem_code.toString();
1921  }
1922  }
1923 
1924  auto multifrag_query_func = cgen_state_->module_->getFunction(
1925  "multifrag_query" + std::string(co.hoist_literals ? "_hoisted_literals" : ""));
1926  CHECK(multifrag_query_func);
1927 
1928  bind_query(query_func,
1929  "query_stub" + std::string(co.hoist_literals ? "_hoisted_literals" : ""),
1930  multifrag_query_func,
1931  cgen_state_->module_);
1932 
1933  auto live_funcs =
1934  CodeGenerator::markDeadRuntimeFuncs(*cgen_state_->module_,
1935  {query_func, cgen_state_->row_func_},
1936  {multifrag_query_func});
1937 
1938  std::string llvm_ir;
1939  if (eo.just_explain) {
1940  if (co.explain_type == ExecutorExplainType::Optimized) {
1941 #ifdef WITH_JIT_DEBUG
1942  throw std::runtime_error(
1943  "Explain optimized not available when JIT runtime debug symbols are enabled");
1944 #else
1945  // Note that we don't run the NVVM reflect pass here. Use LOG(IR) to get the
1946  // optimized IR after NVVM reflect
1947  llvm::legacy::PassManager pass_manager;
1948  optimize_ir(query_func, cgen_state_->module_, pass_manager, live_funcs, co);
1949 #endif // WITH_JIT_DEBUG
1950  }
1951  llvm_ir =
1952  serialize_llvm_object(query_func) + serialize_llvm_object(cgen_state_->row_func_);
1953  }
1954  verify_function_ir(cgen_state_->row_func_);
1955 
1956  LOG(IR) << query_mem_desc->toString() << "\nGenerated IR\n"
1957  << serialize_llvm_object(query_func)
1958  << serialize_llvm_object(cgen_state_->row_func_) << "\nEnd of IR";
1959 
1960  return std::make_tuple(
1961  Executor::CompilationResult{
1962  co.device_type == ExecutorDeviceType::CPU
1963  ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
1964  : optimizeAndCodegenGPU(query_func,
1965  multifrag_query_func,
1966  live_funcs,
1967  is_group_by || ra_exe_unit.estimator,
1968  cuda_mgr,
1969  co),
1970  cgen_state_->getLiterals(),
1971  output_columnar,
1972  llvm_ir,
1973  std::move(gpu_smem_context)},
1974  std::move(query_mem_desc));
1975 }
1976 
1977 llvm::BasicBlock* Executor::codegenSkipDeletedOuterTableRow(
1978  const RelAlgExecutionUnit& ra_exe_unit,
1979  const CompilationOptions& co) {
1980  if (!co.add_delete_column) {
1981  return nullptr;
1982  }
1983  CHECK(!ra_exe_unit.input_descs.empty());
1984  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
1985  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
1986  return nullptr;
1987  }
1988  const auto td = catalog_->getMetadataForTable(outer_input_desc.getTableId());
1989  CHECK(td);
1990  const auto deleted_cd = catalog_->getDeletedColumnIfRowsDeleted(td);
1991  if (!deleted_cd) {
1992  return nullptr;
1993  }
1994  CHECK(deleted_cd->columnType.is_boolean());
1995  const auto deleted_expr =
1996  makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
1997  outer_input_desc.getTableId(),
1998  deleted_cd->columnId,
1999  outer_input_desc.getNestLevel());
2000  CodeGenerator code_generator(this);
2001  const auto is_deleted =
2002  code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
2003  const auto is_deleted_bb = llvm::BasicBlock::Create(
2004  cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
2005  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
2006  cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
2007  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
2008  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
2009  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
2010  cgen_state_->ir_builder_.SetInsertPoint(bb);
2011  return bb;
2012 }
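// The emitted IR is effectively:
//   entry:          is_deleted = <codegen of the deleted column>
//                   br is_deleted, is_deleted_bb, is_not_deleted
//   is_deleted:     ret 0        ; the row contributes nothing
//   is_not_deleted: <the rest of row_func_ is generated from here>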
2013 
2014 bool Executor::compileBody(const RelAlgExecutionUnit& ra_exe_unit,
2015  GroupByAndAggregate& group_by_and_aggregate,
2016  const QueryMemoryDescriptor& query_mem_desc,
2017  const CompilationOptions& co,
2018  const GpuSharedMemoryContext& gpu_smem_context) {
2019  // generate the code for the filter
2020  std::vector<Analyzer::Expr*> primary_quals;
2021  std::vector<Analyzer::Expr*> deferred_quals;
2022  bool short_circuited =
2023  CodeGenerator::prioritizeQuals(ra_exe_unit, primary_quals, deferred_quals);
2024  if (short_circuited) {
2025  VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
2026  << "short-circuited and deferred " << std::to_string(deferred_quals.size())
2027  << " quals";
2028  }
2029  llvm::Value* filter_lv = cgen_state_->llBool(true);
2030  CodeGenerator code_generator(this);
2031  for (auto expr : primary_quals) {
2032  // Generate the filter for primary quals
2033  auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
2034  filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
2035  }
2036  CHECK(filter_lv->getType()->isIntegerTy(1));
2037  llvm::BasicBlock* sc_false{nullptr};
2038  if (!deferred_quals.empty()) {
2039  auto sc_true = llvm::BasicBlock::Create(
2040  cgen_state_->context_, "sc_true", cgen_state_->row_func_);
2041  sc_false = llvm::BasicBlock::Create(
2042  cgen_state_->context_, "sc_false", cgen_state_->row_func_);
2043  cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
2044  cgen_state_->ir_builder_.SetInsertPoint(sc_false);
2045  if (ra_exe_unit.join_quals.empty()) {
2046  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
2047  }
2048  cgen_state_->ir_builder_.SetInsertPoint(sc_true);
2049  filter_lv = cgen_state_->llBool(true);
2050  }
2051  for (auto expr : deferred_quals) {
2052  filter_lv = cgen_state_->ir_builder_.CreateAnd(
2053  filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
2054  }
2055 
2056  CHECK(filter_lv->getType()->isIntegerTy(1));
2057  return group_by_and_aggregate.codegen(
2058  filter_lv, sc_false, query_mem_desc, co, gpu_smem_context);
2059 }
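// For illustration: given WHERE x > 10 AND str LIKE '%foo%', prioritizeQuals() may
// classify the cheap comparison as a primary qual and defer the expensive LIKE, so
// rows failing x > 10 short-circuit through sc_false (returning 0 when there are no
// join quals) and only surviving rows evaluate the deferred qual. Which quals are
// considered cheap is a heuristic of CodeGenerator::prioritizeQuals().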
2060 
2061 std::unique_ptr<llvm::Module> runtime_module_shallow_copy(CgenState* cgen_state) {
2062  return llvm::CloneModule(
2063 #if LLVM_VERSION_MAJOR >= 7
2064  *g_rt_module.get(),
2065 #else
2066  g_rt_module.get(),
2067 #endif
2068  cgen_state->vmap_,
2069  [](const llvm::GlobalValue* gv) {
2070  auto func = llvm::dyn_cast<llvm::Function>(gv);
2071  if (!func) {
2072  return true;
2073  }
2074  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
2075  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage);
2076  });
2077 }
2078 
2079 std::vector<llvm::Value*> generate_column_heads_load(const int num_columns,
2080  llvm::Value* byte_stream_arg,
2081  llvm::IRBuilder<>& ir_builder,
2082  llvm::LLVMContext& ctx) {
2083  CHECK(byte_stream_arg);
2084  const auto max_col_local_id = num_columns - 1;
2085 
2086  std::vector<llvm::Value*> col_heads;
2087  for (int col_id = 0; col_id <= max_col_local_id; ++col_id) {
2088  col_heads.emplace_back(ir_builder.CreateLoad(ir_builder.CreateGEP(
2089  byte_stream_arg, llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), col_id))));
2090  }
2091  return col_heads;
2092 }
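// Each element of col_heads is the base pointer of one input column, loaded from the
// byte_stream argument; e.g. for three columns the result is
// { byte_stream[0], byte_stream[1], byte_stream[2] }.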
2093 