OmniSciDB  eee9fa949c
NativeCodegen.cpp
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CodeGenerator.h"
18 #include "Execute.h"
22 #include "QueryTemplateGenerator.h"
23 
24 #include "Shared/mapdpath.h"
25 
26 #if LLVM_VERSION_MAJOR < 4
27 static_assert(false, "LLVM Version >= 4 is required.");
28 #endif
29 
30 #include <llvm/Bitcode/BitcodeReader.h>
31 #include <llvm/Bitcode/BitcodeWriter.h>
32 #include <llvm/ExecutionEngine/MCJIT.h>
33 #include <llvm/IR/Attributes.h>
34 #include <llvm/IR/GlobalValue.h>
35 #include <llvm/IR/InstIterator.h>
36 #include <llvm/IR/LegacyPassManager.h>
37 #include <llvm/IR/Verifier.h>
38 #include <llvm/IRReader/IRReader.h>
39 #include <llvm/Support/Casting.h>
40 #include <llvm/Support/FileSystem.h>
41 #include <llvm/Support/FormattedStream.h>
42 #include <llvm/Support/MemoryBuffer.h>
43 #include <llvm/Support/SourceMgr.h>
44 #include <llvm/Support/TargetRegistry.h>
45 #include <llvm/Support/TargetSelect.h>
46 #include <llvm/Support/raw_os_ostream.h>
47 #include <llvm/Transforms/IPO.h>
48 #include <llvm/Transforms/IPO/AlwaysInliner.h>
49 #include <llvm/Transforms/InstCombine/InstCombine.h>
50 #include <llvm/Transforms/Instrumentation.h>
51 #include <llvm/Transforms/Scalar.h>
52 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
53 #include <llvm/Transforms/Utils/Cloning.h>
54 #include "llvm/IR/IntrinsicInst.h"
55 #include "llvm/IR/Intrinsics.h"
56 
57 #if LLVM_VERSION_MAJOR >= 7
58 #include <llvm/Transforms/Scalar/InstSimplifyPass.h>
59 #include <llvm/Transforms/Utils.h>
60 #endif
61 #include <llvm/IRReader/IRReader.h>
62 #include <llvm/Linker/Linker.h>
63 #include <llvm/Support/SourceMgr.h>
64 #include <llvm/Support/raw_ostream.h>
65 
66 std::unique_ptr<llvm::Module> udf_gpu_module;
67 std::unique_ptr<llvm::Module> udf_cpu_module;
68 std::unique_ptr<llvm::Module> rt_udf_gpu_module;
69 std::unique_ptr<llvm::Module> rt_udf_cpu_module;
70 
71 extern std::unique_ptr<llvm::Module> g_rt_module;
72 namespace {
73 
74 #if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
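// Erases functions that are effectively dead: a function is removed when every
// use of it (if any) is a call from its own body. Functions listed in
// live_funcs are always kept.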
75 void eliminate_dead_self_recursive_funcs(
76  llvm::Module& M,
77  const std::unordered_set<llvm::Function*>& live_funcs) {
78  std::vector<llvm::Function*> dead_funcs;
79  for (auto& F : M) {
80  bool bAlive = false;
81  if (live_funcs.count(&F)) {
82  continue;
83  }
84  for (auto U : F.users()) {
85  auto* C = llvm::dyn_cast<const llvm::CallInst>(U);
86  if (!C || C->getParent()->getParent() != &F) {
87  bAlive = true;
88  break;
89  }
90  }
91  if (!bAlive) {
92  dead_funcs.push_back(&F);
93  }
94  }
95  for (auto pFn : dead_funcs) {
96  pFn->eraseFromParent();
97  }
98 }
99 
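// Runs a small fixed optimization pipeline over the generated module
// (always-inline, mem2reg, instruction simplification and combining, global
// optimization, LICM and optional loop strength reduction), then removes
// runtime functions that are no longer referenced.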
100 void optimize_ir(llvm::Function* query_func,
101  llvm::Module* module,
102  const std::unordered_set<llvm::Function*>& live_funcs,
103  const CompilationOptions& co) {
104  llvm::legacy::PassManager pass_manager;
105 
106  pass_manager.add(llvm::createAlwaysInlinerLegacyPass());
107  pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
108 #if LLVM_VERSION_MAJOR >= 7
109  pass_manager.add(llvm::createInstSimplifyLegacyPass());
110 #else
111  pass_manager.add(llvm::createInstructionSimplifierPass());
112 #endif
113  pass_manager.add(llvm::createInstructionCombiningPass());
114  pass_manager.add(llvm::createGlobalOptimizerPass());
115 
116  pass_manager.add(llvm::createLICMPass());
117  if (co.opt_level_ == ExecutorOptLevel::LoopStrengthReduction) {
118  pass_manager.add(llvm::createLoopStrengthReducePass());
119  }
120  pass_manager.run(*module);
121 
122  eliminate_dead_self_recursive_funcs(*module, live_funcs);
123 }
124 #endif
125 
126 } // namespace
127 
128 ExecutionEngineWrapper::ExecutionEngineWrapper() {}
129 
130 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine)
131  : execution_engine_(execution_engine) {}
132 
133 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine,
134  const CompilationOptions& co)
135  : execution_engine_(execution_engine) {
136  if (execution_engine_) {
137  if (co.register_intel_jit_listener_) {
138  intel_jit_listener_.reset(llvm::JITEventListener::createIntelJITEventListener());
139  CHECK(intel_jit_listener_);
140  execution_engine_->RegisterJITEventListener(intel_jit_listener_.get());
141  LOG(INFO) << "Registered IntelJITEventListener";
142  }
143  }
144 }
145 
146 ExecutionEngineWrapper& ExecutionEngineWrapper::operator=(
147  llvm::ExecutionEngine* execution_engine) {
148  execution_engine_.reset(execution_engine);
149  intel_jit_listener_ = nullptr;
150  return *this;
151 }
152 
153 void verify_function_ir(const llvm::Function* func) {
154  std::stringstream err_ss;
155  llvm::raw_os_ostream err_os(err_ss);
156  if (llvm::verifyFunction(*func, &err_os)) {
157  func->print(llvm::outs());
158  LOG(FATAL) << err_ss.str();
159  }
160 }
161 
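// Code cache lookup: on a hit the cached module replaces the current
// cgen_state_ module and the cached (function pointer, GPU context module)
// pairs are returned, skipping recompilation entirely.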
162 std::vector<std::pair<void*, void*>> Executor::getCodeFromCache(const CodeCacheKey& key,
163  const CodeCache& cache) {
164  auto it = cache.find(key);
165  if (it != cache.cend()) {
166  delete cgen_state_->module_;
167  cgen_state_->module_ = it->second.second;
168  std::vector<std::pair<void*, void*>> native_functions;
169  for (auto& native_code : it->second.first) {
170  GpuCompilationContext* gpu_context = std::get<2>(native_code).get();
171  native_functions.emplace_back(std::get<0>(native_code),
172  gpu_context ? gpu_context->module() : nullptr);
173  }
174  return native_functions;
175  }
176  return {};
177 }
178 
179 void Executor::addCodeToCache(
180  const CodeCacheKey& key,
181  std::vector<std::tuple<void*, ExecutionEngineWrapper>> native_code,
182  llvm::Module* module,
183  CodeCache& cache) {
184  CHECK(!native_code.empty());
185  CodeCacheVal cache_val;
186  for (auto& native_func : native_code) {
187  cache_val.emplace_back(
188  std::get<0>(native_func), std::move(std::get<1>(native_func)), nullptr);
189  }
190  cache.put(key,
191  std::make_pair<decltype(cache_val), decltype(module)>(std::move(cache_val),
192  std::move(module)));
193 }
194 
195 void Executor::addCodeToCache(
196  const CodeCacheKey& key,
197  const std::vector<std::tuple<void*, GpuCompilationContext*>>& native_code,
198  llvm::Module* module,
199  CodeCache& cache) {
200  CHECK(!native_code.empty());
201  CodeCacheVal cache_val;
202  for (const auto& native_func : native_code) {
203  cache_val.emplace_back(
204  std::get<0>(native_func),
205  ExecutionEngineWrapper(),
206  std::unique_ptr<GpuCompilationContext>(std::get<1>(native_func)));
207  }
208  cache.put(key,
209  std::make_pair<decltype(cache_val), decltype(module)>(std::move(cache_val),
210  std::move(module)));
211 }
212 
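// JIT-compiles the CPU code path: optimize the module (unless built
// WITH_JIT_DEBUG), build an MCJIT ExecutionEngine with fast instruction
// selection enabled and finalize the object so function pointers can be taken.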
213 ExecutionEngineWrapper CodeGenerator::generateNativeCPUCode(
214  llvm::Function* func,
215  const std::unordered_set<llvm::Function*>& live_funcs,
216  const CompilationOptions& co) {
217  auto module = func->getParent();
218  // run optimizations
219 #ifndef WITH_JIT_DEBUG
220  optimize_ir(func, module, live_funcs, co);
221 #endif // WITH_JIT_DEBUG
222 
223  auto init_err = llvm::InitializeNativeTarget();
224  CHECK(!init_err);
225 
226  llvm::InitializeAllTargetMCs();
227  llvm::InitializeNativeTargetAsmPrinter();
228  llvm::InitializeNativeTargetAsmParser();
229 
230  std::string err_str;
231  std::unique_ptr<llvm::Module> owner(module);
232  llvm::EngineBuilder eb(std::move(owner));
233  eb.setErrorStr(&err_str);
234  eb.setEngineKind(llvm::EngineKind::JIT);
235  llvm::TargetOptions to;
236  to.EnableFastISel = true;
237  eb.setTargetOptions(to);
239  eb.setOptLevel(llvm::CodeGenOpt::None);
240  }
241 
242  ExecutionEngineWrapper execution_engine(eb.create(), co);
243  CHECK(execution_engine.get());
244 
245  execution_engine->finalizeObject();
246 
247  return execution_engine;
248 }
249 
250 std::vector<std::pair<void*, void*>> Executor::optimizeAndCodegenCPU(
251  llvm::Function* query_func,
252  llvm::Function* multifrag_query_func,
253  const std::unordered_set<llvm::Function*>& live_funcs,
254  const CompilationOptions& co) {
255  auto module = multifrag_query_func->getParent();
256  CodeCacheKey key{serialize_llvm_object(query_func),
257  serialize_llvm_object(cgen_state_->row_func_)};
258  for (const auto helper : cgen_state_->helper_functions_) {
259  key.push_back(serialize_llvm_object(helper));
260  }
261  auto cached_code = getCodeFromCache(key, cpu_code_cache_);
262  if (!cached_code.empty()) {
263  return cached_code;
264  }
265 
266  auto execution_engine =
267  CodeGenerator::generateNativeCPUCode(query_func, live_funcs, co);
268  auto native_code = execution_engine->getPointerToFunction(multifrag_query_func);
269  CHECK(native_code);
270 
271  std::vector<std::tuple<void*, ExecutionEngineWrapper>> cache;
272  cache.emplace_back(native_code, std::move(execution_engine));
273  addCodeToCache(key, std::move(cache), module, cpu_code_cache_);
274 
275  return {std::make_pair(native_code, nullptr)};
276 }
277 
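// Links a clone of the given UDF module into `module`. With
// llvm::Linker::Flags::None a UDF that redefines an existing function is
// rejected with a runtime error; passing llvm::Linker::Flags::OverrideFromSrc
// lets the UDF definition win instead. A minimal usage sketch (assuming a
// previously loaded udf_cpu_module and a valid CgenState pointer):
//   CodeGenerator::link_udf_module(udf_cpu_module, *module, cgen_state,
//                                  llvm::Linker::Flags::OverrideFromSrc);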
278 void CodeGenerator::link_udf_module(const std::unique_ptr<llvm::Module>& udf_module,
279  llvm::Module& module,
280  CgenState* cgen_state,
281  llvm::Linker::Flags flags) {
282  // Throw a runtime error if the target module contains functions with the
283  // same name as functions in the UDF module.
284  for (auto& f : *udf_module.get()) {
285  auto func = module.getFunction(f.getName());
286  if (!(func == nullptr) && !f.isDeclaration() && flags == llvm::Linker::Flags::None) {
287  LOG(ERROR) << " Attempt to overwrite " << f.getName().str() << " in "
288  << module.getModuleIdentifier() << " from `"
289  << udf_module->getModuleIdentifier() << "`" << std::endl;
290  throw std::runtime_error(
291  "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
292  "function ***");
293  } else {
294  LOG(INFO) << " Adding " << f.getName().str() << " to "
295  << module.getModuleIdentifier() << " from `"
296  << udf_module->getModuleIdentifier() << "`" << std::endl;
297  }
298  }
299 
300  std::unique_ptr<llvm::Module> udf_module_copy;
301 
302  udf_module_copy = llvm::CloneModule(
303 #if LLVM_VERSION_MAJOR >= 7
304  *udf_module.get(),
305 #else
306  udf_module.get(),
307 #endif
308  cgen_state->vmap_);
309 
310  udf_module_copy->setDataLayout(module.getDataLayout());
311  udf_module_copy->setTargetTriple(module.getTargetTriple());
312 
313  // Initialize linker with module for RuntimeFunctions.bc
314  llvm::Linker ld(module);
315  bool link_error = false;
316 
317  link_error = ld.linkInModule(std::move(udf_module_copy), flags);
318 
319  if (link_error) {
320  throw std::runtime_error("link_udf_module: *** error linking module ***");
321  }
322 }
323 
324 namespace {
325 
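// Maps the C++ scalar type names used by the runtime signatures to LLVM IR
// type names, e.g. "int32_t" -> "i32"; "float" and "double" map to themselves.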
326 std::string cpp_to_llvm_name(const std::string& s) {
327  if (s == "int8_t") {
328  return "i8";
329  }
330  if (s == "int16_t") {
331  return "i16";
332  }
333  if (s == "int32_t") {
334  return "i32";
335  }
336  if (s == "int64_t") {
337  return "i64";
338  }
339  CHECK(s == "float" || s == "double");
340  return s;
341 }
342 
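// Generates declarations for the array ANY/ALL comparison helpers for every
// combination of element type, needle type and comparison operator, e.g.
//   declare i1 @array_any_eq_int32_t_double(i8*, i64, double, i32);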
343 std::string gen_array_any_all_sigs() {
344  std::string result;
345  for (const std::string any_or_all : {"any", "all"}) {
346  for (const std::string elem_type :
347  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
348  for (const std::string needle_type :
349  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
350  for (const std::string op_name : {"eq", "ne", "lt", "le", "gt", "ge"}) {
351  result += ("declare i1 @array_" + any_or_all + "_" + op_name + "_" + elem_type +
352  "_" + needle_type + "(i8*, i64, " + cpp_to_llvm_name(needle_type) +
353  ", " + cpp_to_llvm_name(elem_type) + ");\n");
354  }
355  }
356  }
357  }
358  return result;
359 }
360 
361 std::string gen_translate_null_key_sigs() {
362  std::string result;
363  for (const std::string key_type : {"int8_t", "int16_t", "int32_t", "int64_t"}) {
364  const auto key_llvm_type = cpp_to_llvm_name(key_type);
365  result += "declare i64 @translate_null_key_" + key_type + "(" + key_llvm_type + ", " +
366  key_llvm_type + ", i64);\n";
367  }
368  return result;
369 }
370 
371 const std::string cuda_rt_decls =
372  R"( declare void @llvm.dbg.declare(metadata, metadata, metadata) declare void @llvm.dbg.value(metadata, metadata, metadata) declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind declare i32 @pos_start_impl(i32*); declare i32 @group_buff_idx_impl(); declare i32 @pos_step_impl(); declare i8 @thread_warp_idx(i8); declare i64* @init_shared_mem(i64*, i32); declare i64* @init_shared_mem_nop(i64*, i32); declare i64* @init_shared_mem_dynamic(i64*, i32); declare i64* @alloc_shared_mem_dynamic(); declare void @set_shared_mem_to_identity(i64*, i32, i64); declare void @write_back(i64*, i64*, i32); declare void @write_back_smem_nop(i64*, i64*, i32); declare void @write_back_nop(i64*, i64*, i32); declare void @agg_from_smem_to_gmem_nop(i64*, i64*, i32); declare void @agg_from_smem_to_gmem_binId_count(i64*, i64*, i32); declare void @agg_from_smem_to_gmem_count_binId(i64*, i64*, i32); declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8); declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32, i64*); declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32, i64*); declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32); declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32); declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32); declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32); declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64); declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64); declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64); declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64); declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64); declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double); declare i64 @get_bucket_key_for_range_double(i8*, i64, double); declare i64 @agg_count_shared(i64*, i64); declare i64 @agg_count_skip_val_shared(i64*, i64, i64); declare i32 @agg_count_int32_shared(i32*, i32); declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32); declare i64 @agg_count_double_shared(i64*, double); declare i64 @agg_count_double_skip_val_shared(i64*, double, double); declare i32 @agg_count_float_shared(i32*, float); declare i32 @agg_count_float_skip_val_shared(i32*, float, float); declare i64 @agg_sum_shared(i64*, i64); declare i64 @agg_sum_skip_val_shared(i64*, i64, i64); declare i32 @agg_sum_int32_shared(i32*, i32); declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32); declare void @agg_sum_double_shared(i64*, double); declare void @agg_sum_double_skip_val_shared(i64*, double, double); declare void @agg_sum_float_shared(i32*, float); declare void @agg_sum_float_skip_val_shared(i32*, float, float); declare void @agg_max_shared(i64*, i64); declare void @agg_max_skip_val_shared(i64*, i64, i64); declare void @agg_max_int32_shared(i32*, i32); declare void @agg_max_int32_skip_val_shared(i32*, i32, i32); declare void @agg_max_int16_shared(i16*, i16); declare void @agg_max_int16_skip_val_shared(i16*, i16, i16); declare void @agg_max_int8_shared(i8*, i8); declare void @agg_max_int8_skip_val_shared(i8*, i8, i8); declare void @agg_max_double_shared(i64*, double); declare void @agg_max_double_skip_val_shared(i64*, double, double); declare void @agg_max_float_shared(i32*, float); declare void 
@agg_max_float_skip_val_shared(i32*, float, float); declare void @agg_min_shared(i64*, i64); declare void @agg_min_skip_val_shared(i64*, i64, i64); declare void @agg_min_int32_shared(i32*, i32); declare void @agg_min_int32_skip_val_shared(i32*, i32, i32); declare void @agg_min_int16_shared(i16*, i16); declare void @agg_min_int16_skip_val_shared(i16*, i16, i16); declare void @agg_min_int8_shared(i8*, i8); declare void @agg_min_int8_skip_val_shared(i8*, i8, i8); declare void @agg_min_double_shared(i64*, double); declare void @agg_min_double_skip_val_shared(i64*, double, double); declare void @agg_min_float_shared(i32*, float); declare void @agg_min_float_skip_val_shared(i32*, float, float); declare void @agg_id_shared(i64*, i64); declare void @agg_id_int32_shared(i32*, i32); declare void @agg_id_int16_shared(i16*, i16); declare void @agg_id_int8_shared(i8*, i8); declare void @agg_id_double_shared(i64*, double); declare void @agg_id_double_shared_slow(i64*, double*); declare void @agg_id_float_shared(i32*, float); declare i32 @checked_single_agg_id_shared(i64*, i64, i64); declare i32 @checked_single_agg_id_double_shared(i64*, double, double); declare i32 @checked_single_agg_id_double_shared_slow(i64*, double*, double); declare i32 @checked_single_agg_id_float_shared(i32*, float, float); declare i1 @slotEmptyKeyCAS(i64*, i64, i64); declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32); declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16); declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8); declare i64 @ExtractFromTime(i32, i64); declare i64 @ExtractFromTimeNullable(i32, i64, i64); declare i64 @DateTruncate(i32, i64); declare i64 @DateTruncateNullable(i32, i64, i64); declare i64 @DateTruncateHighPrecisionToDate(i64, i64); declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64); declare i64 @DateDiff(i32, i64, i64); declare i64 @DateDiffNullable(i32, i64, i64, i64); declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i64, i64, i64); declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i64, i64, i64, i64); declare i64 @DateAdd(i32, i64, i64); declare i64 @DateAddNullable(i32, i64, i64, i64); declare i64 @DateAddHighPrecision(i32, i64, i64, i64); declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i64, i64); declare i64 @string_decode(i8*, i64); declare i32 @array_size(i8*, i64, i32); declare i32 @array_size_nullable(i8*, i64, i32, i32); declare i32 @fast_fixlen_array_size(i8*, i32); declare i1 @array_is_null(i8*, i64); declare i8* @array_buff(i8*, i64); declare i8* @fast_fixlen_array_buff(i8*, i64); declare i8 @array_at_int8_t(i8*, i64, i32); declare i16 @array_at_int16_t(i8*, i64, i32); declare i32 @array_at_int32_t(i8*, i64, i32); declare i64 @array_at_int64_t(i8*, i64, i32); declare float @array_at_float(i8*, i64, i32); declare double @array_at_double(i8*, i64, i32); declare i8 @varlen_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_array_at_int64_t(i8*, i64, i32); declare float @varlen_array_at_float(i8*, i64, i32); declare double @varlen_array_at_double(i8*, i64, i32); declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32); declare float @varlen_notnull_array_at_float(i8*, i64, i32); declare double @varlen_notnull_array_at_double(i8*, i64, i32); declare i8 
@array_at_int8_t_checked(i8*, i64, i64, i8); declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16); declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32); declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64); declare float @array_at_float_checked(i8*, i64, i64, float); declare double @array_at_double_checked(i8*, i64, i64, double); declare i32 @char_length(i8*, i32); declare i32 @char_length_nullable(i8*, i32, i32); declare i32 @char_length_encoded(i8*, i32); declare i32 @char_length_encoded_nullable(i8*, i32, i32); declare i32 @key_for_string_encoded(i32); declare i1 @string_like(i8*, i32, i8*, i32, i8); declare i1 @string_ilike(i8*, i32, i8*, i32, i8); declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8); declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8); declare i1 @string_like_simple(i8*, i32, i8*, i32); declare i1 @string_ilike_simple(i8*, i32, i8*, i32); declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8); declare i1 @string_lt(i8*, i32, i8*, i32); declare i1 @string_le(i8*, i32, i8*, i32); declare i1 @string_gt(i8*, i32, i8*, i32); declare i1 @string_ge(i8*, i32, i8*, i32); declare i1 @string_eq(i8*, i32, i8*, i32); declare i1 @string_ne(i8*, i32, i8*, i32); declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8); declare i1 @regexp_like(i8*, i32, i8*, i32, i8); declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8); declare void @linear_probabilistic_count(i8*, i32, i8*, i32); declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, i64); declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64); declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64); declare i32 @record_error_code(i32, i32*); declare i1 @dynamic_watchdog(); declare void @force_sync(); declare void @sync_warp(); declare void @sync_warp_protected(i64, i64); declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32); declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64); declare i64* @get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, float, float); declare i64* @get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double); )" + gen_array_any_all_sigs() +
373  gen_translate_null_key_sigs();
374 
375 #ifdef HAVE_CUDA
376 std::string extension_function_decls(const std::unordered_set<std::string>& udf_decls) {
377  const auto decls = ExtensionFunctionsWhitelist::getLLVMDeclarations(udf_decls);
378  return boost::algorithm::join(decls, "\n");
379 }
380 
381 void legalize_nvvm_ir(llvm::Function* query_func) {
382  // optimizations might add attributes to the function
383  // and NVPTX doesn't understand all of them; play it
384  // safe and clear all attributes
385  clear_function_attributes(query_func);
386  verify_function_ir(query_func);
387 
388  std::vector<llvm::Instruction*> stackrestore_intrinsics;
389  std::vector<llvm::Instruction*> stacksave_intrinsics;
390  for (auto& BB : *query_func) {
391  for (llvm::Instruction& I : BB) {
392  if (const llvm::IntrinsicInst* II = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
393  if (II->getIntrinsicID() == llvm::Intrinsic::stacksave) {
394  stacksave_intrinsics.push_back(&I);
395  } else if (II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
396  stackrestore_intrinsics.push_back(&I);
397  }
398  }
399  }
400  }
401 
402  // stacksave and stackrestore intrinsics appear together, and stackrestore
403  // uses the stacksave result as its argument, so the stackrestore calls
404  // must be removed first.
405  for (auto& II : stackrestore_intrinsics) {
406  II->eraseFromParent();
407  }
408  for (auto& II : stacksave_intrinsics) {
409  II->eraseFromParent();
410  }
411 }
412 #endif // HAVE_CUDA
413 
414 } // namespace
415 
416 llvm::StringRef get_gpu_target_triple_string() {
417  return llvm::StringRef("nvptx64-nvidia-cuda");
418 }
419 
420 llvm::StringRef get_gpu_data_layout() {
421  return llvm::StringRef(
422  "e-p:64:64:64-i1:8:8-i8:8:8-"
423  "i16:16:16-i32:32:32-i64:64:64-"
424  "f32:32:32-f64:64:64-v16:16:16-"
425  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
426 }
427 
428 std::map<std::string, std::string> get_device_parameters() {
429  std::map<std::string, std::string> result;
430 
431  result.insert(std::make_pair("cpu_name", llvm::sys::getHostCPUName()));
432  result.insert(std::make_pair("cpu_triple", llvm::sys::getProcessTriple()));
433  result.insert(
434  std::make_pair("cpu_cores", std::to_string(llvm::sys::getHostNumPhysicalCores())));
435  result.insert(std::make_pair("cpu_threads", std::to_string(cpu_threads())));
436 
437  llvm::StringMap<bool> cpu_features;
438  if (llvm::sys::getHostCPUFeatures(cpu_features)) {
439  std::string features_str = "";
440  for (auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
441  features_str += (it->getValue() ? " +" : " -");
442  features_str += it->getKey().str();
443  }
444  result.insert(std::make_pair("cpu_features", features_str));
445  }
446 
447 #ifdef HAVE_CUDA
448  int device_count = 0;
449  checkCudaErrors(cuDeviceGetCount(&device_count));
450  if (device_count) {
451  CUdevice device{};
452  char device_name[256];
453  int major = 0, minor = 0;
454  checkCudaErrors(cuDeviceGet(&device, 0)); // assuming homogeneous multi-GPU system
455  checkCudaErrors(cuDeviceGetName(device_name, 256, device));
456  checkCudaErrors(cuDeviceGetAttribute(
457  &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
458  checkCudaErrors(cuDeviceGetAttribute(
459  &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
460 
461  result.insert(std::make_pair("gpu_name", device_name));
462  result.insert(std::make_pair("gpu_count", std::to_string(device_count)));
463  result.insert(std::make_pair("gpu_compute_capability",
464  std::to_string(major) + "." + std::to_string(minor)));
465  result.insert(std::make_pair("gpu_triple", get_gpu_target_triple_string()));
466  result.insert(std::make_pair("gpu_datalayout", get_gpu_data_layout()));
467  }
468 #endif
469 
470  return result;
471 }
472 
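// GPU code generation: optimize and NVVM-legalize the IR, mark the kernel
// wrapper via nvvm.annotations metadata, temporarily detach all non-root
// functions so only the relevant IR is printed, prepend the CUDA runtime and
// extension function declarations, lower the result to PTX and JIT it to a
// cubin wrapped in one GpuCompilationContext per device.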
473 CodeGenerator::GPUCode CodeGenerator::generateNativeGPUCode(
474  llvm::Function* func,
475  llvm::Function* wrapper_func,
476  const std::unordered_set<llvm::Function*>& live_funcs,
477  const CompilationOptions& co,
478  const GPUTarget& gpu_target) {
479 #ifdef HAVE_CUDA
480  auto module = func->getParent();
481  module->setDataLayout(
482  "e-p:64:64:64-i1:8:8-i8:8:8-"
483  "i16:16:16-i32:32:32-i64:64:64-"
484  "f32:32:32-f64:64:64-v16:16:16-"
485  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
486  module->setTargetTriple("nvptx64-nvidia-cuda");
487  // run optimizations
488  optimize_ir(func, module, live_funcs, co);
489  legalize_nvvm_ir(func);
490 
491  std::stringstream ss;
492  llvm::raw_os_ostream os(ss);
493 
494  llvm::LLVMContext& ctx = module->getContext();
495  // Get "nvvm.annotations" metadata node
496  llvm::NamedMDNode* md = module->getOrInsertNamedMetadata("nvvm.annotations");
497 
498  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
499  llvm::MDString::get(ctx, "kernel"),
500  llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
501  llvm::Type::getInt32Ty(ctx), 1))};
502 
503  // Append metadata to nvvm.annotations
504  md->addOperand(llvm::MDNode::get(ctx, md_vals));
505 
506  std::unordered_set<llvm::Function*> roots{wrapper_func, func};
507  if (gpu_target.row_func_not_inlined) {
508  clear_function_attributes(gpu_target.cgen_state->row_func_);
509  roots.insert(gpu_target.cgen_state->row_func_);
510  }
511 
512  // Prevent the udf function(s) from being removed the way the runtime functions are
513 
514  std::unordered_set<std::string> udf_declarations;
515  if (is_udf_module_present()) {
516  for (auto& f : udf_gpu_module->getFunctionList()) {
517  llvm::Function* udf_function = module->getFunction(f.getName());
518 
519  if (udf_function) {
520  legalize_nvvm_ir(udf_function);
521  roots.insert(udf_function);
522 
523  // If we have a UDF that declares an external function,
524  // note it so we can avoid duplicate declarations.
525  if (f.isDeclaration()) {
526  udf_declarations.insert(f.getName().str());
527  }
528  }
529  }
530  }
531 
532  if (is_rt_udf_module_present()) {
533  for (auto& f : rt_udf_gpu_module->getFunctionList()) {
534  llvm::Function* udf_function = module->getFunction(f.getName());
535  if (udf_function) {
536  legalize_nvvm_ir(udf_function);
537  roots.insert(udf_function);
538 
539  // If we have a UDF that declares an external function,
540  // note it so we can avoid duplicate declarations.
541  if (f.isDeclaration()) {
542  udf_declarations.insert(f.getName().str());
543  }
544  }
545  }
546  }
547 
548  std::vector<llvm::Function*> rt_funcs;
549  for (auto& Fn : *module) {
550  if (roots.count(&Fn)) {
551  continue;
552  }
553  rt_funcs.push_back(&Fn);
554  }
555  for (auto& pFn : rt_funcs) {
556  pFn->removeFromParent();
557  }
558  module->print(os, nullptr);
559  os.flush();
560 
561  for (auto& pFn : rt_funcs) {
562  module->getFunctionList().push_back(pFn);
563  }
564  module->eraseNamedMetadata(md);
565 
566  auto cuda_llir = cuda_rt_decls + extension_function_decls(udf_declarations) + ss.str();
567 
568  std::vector<std::pair<void*, void*>> native_functions;
569  std::vector<std::tuple<void*, GpuCompilationContext*>> cached_functions;
570 
571  const auto ptx =
572  generatePTX(cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state);
573 
574  LOG(PTX) << "PTX for the GPU:\n" << ptx << "\nEnd of PTX";
575 
576  auto cubin_result = ptx_to_cubin(ptx, gpu_target.block_size, gpu_target.cuda_mgr);
577  auto& option_keys = cubin_result.option_keys;
578  auto& option_values = cubin_result.option_values;
579  auto cubin = cubin_result.cubin;
580  auto link_state = cubin_result.link_state;
581  const auto num_options = option_keys.size();
582 
583  auto func_name = wrapper_func->getName().str();
584  for (int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
585  ++device_id) {
586  auto gpu_context = new GpuCompilationContext(cubin,
587  func_name,
588  device_id,
589  gpu_target.cuda_mgr,
590  num_options,
591  &option_keys[0],
592  &option_values[0]);
593  auto native_code = gpu_context->kernel();
594  auto native_module = gpu_context->module();
595  CHECK(native_code);
596  CHECK(native_module);
597  native_functions.emplace_back(native_code, native_module);
598  cached_functions.emplace_back(native_code, gpu_context);
599  }
600 
601  checkCudaErrors(cuLinkDestroy(link_state));
602 
603  return {native_functions, cached_functions};
604 #else
605  return {};
606 #endif
607 }
608 
609 std::vector<std::pair<void*, void*>> Executor::optimizeAndCodegenGPU(
610  llvm::Function* query_func,
611  llvm::Function* multifrag_query_func,
612  std::unordered_set<llvm::Function*>& live_funcs,
613  const bool no_inline,
614  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
615  const CompilationOptions& co) {
616 #ifdef HAVE_CUDA
617  auto module = multifrag_query_func->getParent();
618  CHECK(cuda_mgr);
619  CodeCacheKey key{serialize_llvm_object(query_func),
620  serialize_llvm_object(cgen_state_->row_func_)};
621  for (const auto helper : cgen_state_->helper_functions_) {
622  key.push_back(serialize_llvm_object(helper));
623  }
624  auto cached_code = getCodeFromCache(key, gpu_code_cache_);
625  if (!cached_code.empty()) {
626  return cached_code;
627  }
628 
629  bool row_func_not_inlined = false;
630  if (no_inline) {
631  for (auto it = llvm::inst_begin(cgen_state_->row_func_),
632  e = llvm::inst_end(cgen_state_->row_func_);
633  it != e;
634  ++it) {
635  if (llvm::isa<llvm::CallInst>(*it)) {
636  auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
637  if (get_gv_call.getCalledFunction()->getName() == "array_size" ||
638  get_gv_call.getCalledFunction()->getName() == "linear_probabilistic_count") {
639  mark_function_never_inline(cgen_state_->row_func_);
640  row_func_not_inlined = true;
641  break;
642  }
643  }
644  }
645  }
646 
647  initializeNVPTXBackend();
648  CodeGenerator::GPUTarget gpu_target{nvptx_target_machine_.get(),
649  cuda_mgr,
650  blockSize(),
651  cgen_state_.get(),
652  row_func_not_inlined};
653  const auto gpu_code = CodeGenerator::generateNativeGPUCode(
654  query_func, multifrag_query_func, live_funcs, co, gpu_target);
655 
656  addCodeToCache(key, gpu_code.cached_functions, module, gpu_code_cache_);
657 
658  return gpu_code.native_functions;
659 #else
660  return {};
661 #endif
662 }
663 
664 std::string CodeGenerator::generatePTX(const std::string& cuda_llir,
665  llvm::TargetMachine* nvptx_target_machine,
666  CgenState* cgen_state) {
667  auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir, "", false);
668 
669  llvm::SMDiagnostic err;
670 
671  auto module = llvm::parseIR(mem_buff->getMemBufferRef(), err, cgen_state->context_);
672  if (!module) {
673  LOG(FATAL) << err.getMessage().str();
674  }
675 
676  llvm::SmallString<256> code_str;
677  llvm::raw_svector_ostream formatted_os(code_str);
678  CHECK(nvptx_target_machine);
679  {
680  llvm::legacy::PassManager ptxgen_pm;
681  module->setDataLayout(nvptx_target_machine->createDataLayout());
682 
683 #if LLVM_VERSION_MAJOR >= 7
684  nvptx_target_machine->addPassesToEmitFile(
685  ptxgen_pm, formatted_os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
686 #else
687  nvptx_target_machine->addPassesToEmitFile(
688  ptxgen_pm, formatted_os, llvm::TargetMachine::CGFT_AssemblyFile);
689 #endif
690  ptxgen_pm.run(*module);
691  }
692 
693  return code_str.str();
694 }
695 
696 std::unique_ptr<llvm::TargetMachine> CodeGenerator::initializeNVPTXBackend() {
697  llvm::InitializeAllTargets();
698  llvm::InitializeAllTargetMCs();
699  llvm::InitializeAllAsmPrinters();
700  std::string err;
701  auto target = llvm::TargetRegistry::lookupTarget("nvptx64", err);
702  if (!target) {
703  LOG(FATAL) << err;
704  }
705  return std::unique_ptr<llvm::TargetMachine>(target->createTargetMachine(
706  "nvptx64-nvidia-cuda", "sm_30", "", llvm::TargetOptions(), llvm::Reloc::Static));
707 }
708 
709 std::string Executor::generatePTX(const std::string& cuda_llir) const {
710  return CodeGenerator::generatePTX(
711  cuda_llir, nvptx_target_machine_.get(), cgen_state_.get());
712 }
713 
714 void Executor::initializeNVPTXBackend() const {
715  if (nvptx_target_machine_) {
716  return;
717  }
718  nvptx_target_machine_ = CodeGenerator::initializeNVPTXBackend();
719 }
720 
721 // A small number of runtime functions don't get through CgenState::emitCall. List them
722 // explicitly here and always clone their implementation from the runtime module.
723 bool CodeGenerator::alwaysCloneRuntimeFunction(const llvm::Function* func) {
724  return func->getName() == "query_stub_hoisted_literals" ||
725  func->getName() == "multifrag_query_hoisted_literals" ||
726  func->getName() == "query_stub" || func->getName() == "multifrag_query" ||
727  func->getName() == "fixed_width_int_decode" ||
728  func->getName() == "fixed_width_unsigned_decode" ||
729  func->getName() == "diff_fixed_width_int_decode" ||
730  func->getName() == "fixed_width_double_decode" ||
731  func->getName() == "fixed_width_float_decode" ||
732  func->getName() == "fixed_width_small_date_decode" ||
733  func->getName() == "record_error_code";
734 }
735 
736 llvm::Module* read_template_module(llvm::LLVMContext& context) {
737  llvm::SMDiagnostic err;
738 
739  auto buffer_or_error = llvm::MemoryBuffer::getFile(mapd_root_abs_path() +
740  "/QueryEngine/RuntimeFunctions.bc");
741  CHECK(!buffer_or_error.getError());
742  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
743 
744  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
745  CHECK(!owner.takeError());
746  auto module = owner.get().release();
747  CHECK(module);
748 
749  return module;
750 }
751 
752 namespace {
753 
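// Replaces the placeholder call to pos_fn_name (e.g. pos_start) inside the
// query function with a call to its real implementation (pos_fn_name + "_impl"),
// forwarding the error_code argument when resuming is requested.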
754 void bind_pos_placeholders(const std::string& pos_fn_name,
755  const bool use_resume_param,
756  llvm::Function* query_func,
757  llvm::Module* module) {
758  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
759  ++it) {
760  if (!llvm::isa<llvm::CallInst>(*it)) {
761  continue;
762  }
763  auto& pos_call = llvm::cast<llvm::CallInst>(*it);
764  if (std::string(pos_call.getCalledFunction()->getName()) == pos_fn_name) {
765  if (use_resume_param) {
766  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
767  llvm::ReplaceInstWithInst(
768  &pos_call,
769  llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl"),
770  error_code_arg));
771  } else {
772  llvm::ReplaceInstWithInst(
773  &pos_call,
774  llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl")));
775  }
776  break;
777  }
778  }
779 }
780 
781 void set_row_func_argnames(llvm::Function* row_func,
782  const size_t in_col_count,
783  const size_t agg_col_count,
784  const bool hoist_literals) {
785  auto arg_it = row_func->arg_begin();
786 
787  if (agg_col_count) {
788  for (size_t i = 0; i < agg_col_count; ++i) {
789  arg_it->setName("out");
790  ++arg_it;
791  }
792  } else {
793  arg_it->setName("group_by_buff");
794  ++arg_it;
795  arg_it->setName("crt_matched");
796  ++arg_it;
797  arg_it->setName("total_matched");
798  ++arg_it;
799  arg_it->setName("old_total_matched");
800  ++arg_it;
801  arg_it->setName("max_matched");
802  ++arg_it;
803  }
804 
805  arg_it->setName("agg_init_val");
806  ++arg_it;
807 
808  arg_it->setName("pos");
809  ++arg_it;
810 
811  arg_it->setName("frag_row_off");
812  ++arg_it;
813 
814  arg_it->setName("num_rows_per_scan");
815  ++arg_it;
816 
817  if (hoist_literals) {
818  arg_it->setName("literals");
819  ++arg_it;
820  }
821 
822  for (size_t i = 0; i < in_col_count; ++i) {
823  arg_it->setName("col_buf" + std::to_string(i));
824  ++arg_it;
825  }
826 
827  arg_it->setName("join_hash_tables");
828 }
829 
830 std::pair<llvm::Function*, std::vector<llvm::Value*>> create_row_function(
831  const size_t in_col_count,
832  const size_t agg_col_count,
833  const bool hoist_literals,
834  llvm::Function* query_func,
835  llvm::Module* module,
836  llvm::LLVMContext& context) {
837  std::vector<llvm::Type*> row_process_arg_types;
838 
839  if (agg_col_count) {
840  // output (aggregate) arguments
841  for (size_t i = 0; i < agg_col_count; ++i) {
842  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
843  }
844  } else {
845  // group by buffer
846  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
847  // current match count
848  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
849  // total match count passed from the caller
850  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
851  // old total match count returned to the caller
852  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
853  // max matched (total number of slots in the output buffer)
854  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
855  }
856 
857  // aggregate init values
858  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
859 
860  // position argument
861  row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));
862 
863  // fragment row offset argument
864  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
865 
866  // number of rows for each scan
867  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
868 
869  // literals buffer argument
870  if (hoist_literals) {
871  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
872  }
873 
874  // Generate the function signature and column head fetches s.t.
875  // double indirection isn't needed in the inner loop
876  auto& fetch_bb = query_func->front();
877  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
878  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
879  auto col_heads = generate_column_heads_load(
880  in_col_count, query_func->args().begin(), fetch_ir_builder, context);
881  CHECK_EQ(in_col_count, col_heads.size());
882 
883  // column buffer arguments
884  for (size_t i = 0; i < in_col_count; ++i) {
885  row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
886  }
887 
888  // join hash table argument
889  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
891  // generate the function
892  auto ft =
893  llvm::FunctionType::get(get_int_type(32, context), row_process_arg_types, false);
894 
895  auto row_func =
896  llvm::Function::Create(ft, llvm::Function::ExternalLinkage, "row_func", module);
897 
898  // set the row function argument names; for debugging purposes only
899  set_row_func_argnames(row_func, in_col_count, agg_col_count, hoist_literals);
900 
901  return std::make_pair(row_func, col_heads);
902 }
903 
904 void bind_query(llvm::Function* query_func,
905  const std::string& query_fname,
906  llvm::Function* multifrag_query_func,
907  llvm::Module* module) {
908  std::vector<llvm::CallInst*> query_stubs;
909  for (auto it = llvm::inst_begin(multifrag_query_func),
910  e = llvm::inst_end(multifrag_query_func);
911  it != e;
912  ++it) {
913  if (!llvm::isa<llvm::CallInst>(*it)) {
914  continue;
915  }
916  auto& query_call = llvm::cast<llvm::CallInst>(*it);
917  if (std::string(query_call.getCalledFunction()->getName()) == query_fname) {
918  query_stubs.push_back(&query_call);
919  }
920  }
921  for (auto& S : query_stubs) {
922  std::vector<llvm::Value*> args;
923  for (size_t i = 0; i < S->getNumArgOperands(); ++i) {
924  args.push_back(S->getArgOperand(i));
925  }
926  llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args, ""));
927  }
928 }
929 
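// Maps each target expression to the runtime aggregate function name(s) it
// needs, e.g. AVG over an integer column expands to {"agg_sum", "agg_count"}
// while AVG over a floating point column expands to {"agg_sum_double",
// "agg_count_double"}; COUNT(DISTINCT ...) maps to "agg_count_distinct".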
930 std::vector<std::string> get_agg_fnames(const std::vector<Analyzer::Expr*>& target_exprs,
931  const bool is_group_by) {
932  std::vector<std::string> result;
933  for (size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
934  ++target_idx, ++agg_col_idx) {
935  const auto target_expr = target_exprs[target_idx];
936  CHECK(target_expr);
937  const auto target_type_info = target_expr->get_type_info();
938  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
939  const bool is_varlen =
940  (target_type_info.is_string() &&
941  target_type_info.get_compression() == kENCODING_NONE) ||
942  target_type_info.is_array(); // TODO: should it use is_varlen_array() ?
943  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
944  result.emplace_back(target_type_info.is_fp() ? "agg_id_double" : "agg_id");
945  if (is_varlen) {
946  result.emplace_back("agg_id");
947  }
948  if (target_type_info.is_geometry()) {
949  result.emplace_back("agg_id");
950  for (auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
951  result.emplace_back("agg_id");
952  }
953  }
954  continue;
955  }
956  const auto agg_type = agg_expr->get_aggtype();
957  const auto& agg_type_info =
958  agg_type != kCOUNT ? agg_expr->get_arg()->get_type_info() : target_type_info;
959  switch (agg_type) {
960  case kAVG: {
961  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
962  !agg_type_info.is_fp()) {
963  throw std::runtime_error("AVG is only valid on integer and floating point");
964  }
965  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
966  ? "agg_sum"
967  : "agg_sum_double");
968  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
969  ? "agg_count"
970  : "agg_count_double");
971  break;
972  }
973  case kMIN: {
974  if (agg_type_info.is_string() || agg_type_info.is_array() ||
975  agg_type_info.is_geometry()) {
976  throw std::runtime_error(
977  "MIN on strings, arrays or geospatial types not supported yet");
978  }
979  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
980  ? "agg_min"
981  : "agg_min_double");
982  break;
983  }
984  case kMAX: {
985  if (agg_type_info.is_string() || agg_type_info.is_array() ||
986  agg_type_info.is_geometry()) {
987  throw std::runtime_error(
988  "MAX on strings, arrays or geospatial types not supported yet");
989  }
990  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
991  ? "agg_max"
992  : "agg_max_double");
993  break;
994  }
995  case kSUM: {
996  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
997  !agg_type_info.is_fp()) {
998  throw std::runtime_error("SUM is only valid on integer and floating point");
999  }
1000  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1001  ? "agg_sum"
1002  : "agg_sum_double");
1003  break;
1004  }
1005  case kCOUNT:
1006  result.emplace_back(agg_expr->get_is_distinct() ? "agg_count_distinct"
1007  : "agg_count");
1008  break;
1009  case kSINGLE_VALUE: {
1010  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1011  break;
1012  }
1013  case kSAMPLE: {
1014  // Note that varlen SAMPLE arguments are handled separately above
1015  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1016  break;
1017  }
1018  case kAPPROX_COUNT_DISTINCT:
1019  result.emplace_back("agg_approximate_count_distinct");
1020  break;
1021  default:
1022  CHECK(false);
1023  }
1024  }
1025  return result;
1026 }
1027 
1028 } // namespace
1029 
1030 std::unique_ptr<llvm::Module> g_rt_module(read_template_module(getGlobalLLVMContext()));
1031 
1032 bool is_udf_module_present(bool cpu_only) {
1033  return (cpu_only || udf_gpu_module != nullptr) && (udf_cpu_module != nullptr);
1034 }
1035 
1036 bool is_rt_udf_module_present(bool cpu_only) {
1037  return (cpu_only || rt_udf_gpu_module != nullptr) && (rt_udf_cpu_module != nullptr);
1038 }
1039 
1040 void throw_parseIR_error(const llvm::SMDiagnostic& parse_error, std::string src = "") {
1041  std::string excname = "LLVM IR ParseError: ";
1042  llvm::raw_string_ostream ss(excname);
1043  parse_error.print(src.c_str(), ss, false, false);
1044  throw std::runtime_error(ss.str());
1045 }
1046 
1047 void read_udf_gpu_module(const std::string& udf_ir_filename) {
1048  llvm::SMDiagnostic parse_error;
1049 
1050  llvm::StringRef file_name_arg(udf_ir_filename);
1051 
1052  udf_gpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
1053  if (!udf_gpu_module) {
1054  throw_parseIR_error(parse_error, udf_ir_filename);
1055  }
1056 }
1057 
1058 void read_udf_cpu_module(const std::string& udf_ir_filename) {
1059  llvm::SMDiagnostic parse_error;
1060 
1061  llvm::StringRef file_name_arg(udf_ir_filename);
1062 
1063  udf_cpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
1064  if (!udf_cpu_module) {
1065  throw_parseIR_error(parse_error, udf_ir_filename);
1066  }
1067 }
1068 
1069 void read_rt_udf_gpu_module(const std::string& udf_ir_string) {
1070  llvm::SMDiagnostic parse_error;
1071 
1072  auto buf =
1073  std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for GPU");
1074 
1075  rt_udf_gpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
1076  if (!rt_udf_gpu_module) {
1077  throw_parseIR_error(parse_error);
1078  }
1079 }
1081 void read_rt_udf_cpu_module(const std::string& udf_ir_string) {
1082  llvm::SMDiagnostic parse_error;
1083 
1084  auto buf =
1085  std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for CPU");
1086 
1087  rt_udf_cpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
1088  if (!rt_udf_cpu_module) {
1089  throw_parseIR_error(parse_error);
1090  }
1091 }
1092 
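// Collects the set of functions that must stay alive (the roots, the leaves and
// every function directly called from a root) and gives all other defined
// functions internal linkage so the optimizer is free to drop them.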
1093 std::unordered_set<llvm::Function*> CodeGenerator::markDeadRuntimeFuncs(
1094  llvm::Module& module,
1095  const std::vector<llvm::Function*>& roots,
1096  const std::vector<llvm::Function*>& leaves) {
1097  std::unordered_set<llvm::Function*> live_funcs;
1098  live_funcs.insert(roots.begin(), roots.end());
1099  live_funcs.insert(leaves.begin(), leaves.end());
1100 
1101  if (auto F = module.getFunction("init_shared_mem_nop")) {
1102  live_funcs.insert(F);
1103  }
1104  if (auto F = module.getFunction("write_back_nop")) {
1105  live_funcs.insert(F);
1106  }
1107 
1108  for (const llvm::Function* F : roots) {
1109  for (const llvm::BasicBlock& BB : *F) {
1110  for (const llvm::Instruction& I : BB) {
1111  if (const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
1112  live_funcs.insert(CI->getCalledFunction());
1113  }
1114  }
1115  }
1116  }
1117 
1118  for (llvm::Function& F : module) {
1119  if (!live_funcs.count(&F) && !F.isDeclaration()) {
1120  F.setLinkage(llvm::GlobalValue::InternalLinkage);
1121  }
1122  }
1123 
1124  return live_funcs;
1125 }
1126 
1127 namespace {
1128 // searches for a particular variable within a specific basic block (or all if bb_name is
1129 // empty)
1130 template <typename InstType>
1131 llvm::Value* find_variable_in_basic_block(llvm::Function* func,
1132  std::string bb_name,
1133  std::string variable_name) {
1134  llvm::Value* result = nullptr;
1135  if (func == nullptr || variable_name.empty()) {
1136  return result;
1137  }
1138  bool is_found = false;
1139  for (auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
1140  if (!bb_name.empty() && bb_it->getName() != bb_name) {
1141  continue;
1142  }
1143  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
1144  if (llvm::isa<InstType>(*inst_it)) {
1145  if (inst_it->getName() == variable_name) {
1146  result = &*inst_it;
1147  is_found = true;
1148  break;
1149  }
1150  }
1151  }
1152  }
1153  return result;
1154 }
1155 }; // namespace
1156 
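// Splits the basic block right after the row_process call, records the returned
// error code via record_error_code and branches to an .error_exit block when an
// error is detected. With the dynamic watchdog enabled an extra .watchdog_check
// block periodically calls dynamic_watchdog and folds a timeout into the error
// code; on GPU only the watchdog timeout aborts the kernel early.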
1157 void Executor::createErrorCheckControlFlow(llvm::Function* query_func,
1158  bool run_with_dynamic_watchdog,
1159  ExecutorDeviceType device_type) {
1160  // check whether the row processing was successful; currently, it can
1161  // fail by running out of group by buffer slots
1162 
1163  llvm::Value* row_count = nullptr;
1164  if (run_with_dynamic_watchdog && device_type == ExecutorDeviceType::GPU) {
1165  row_count =
1166  find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
1167  }
1168 
1169  bool done_splitting = false;
1170  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1171  ++bb_it) {
1172  llvm::Value* pos = nullptr;
1173  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1174  if (run_with_dynamic_watchdog && llvm::isa<llvm::PHINode>(*inst_it)) {
1175  if (inst_it->getName() == "pos") {
1176  pos = &*inst_it;
1177  }
1178  continue;
1179  }
1180  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1181  continue;
1182  }
1183  auto& filter_call = llvm::cast<llvm::CallInst>(*inst_it);
1184  if (std::string(filter_call.getCalledFunction()->getName()) == "row_process") {
1185  auto next_inst_it = inst_it;
1186  ++next_inst_it;
1187  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1188  auto& br_instr = bb_it->back();
1189  llvm::IRBuilder<> ir_builder(&br_instr);
1190  llvm::Value* err_lv = &*inst_it;
1191  if (run_with_dynamic_watchdog) {
1192  CHECK(pos);
1193  llvm::Value* call_watchdog_lv = nullptr;
1194  if (device_type == ExecutorDeviceType::GPU) {
1195  // In order to make sure all threads within a block see the same barrier,
1196  // only those blocks in which none of the threads have crossed the critical
1197  // edge will go through the dynamic watchdog computation
1198  CHECK(row_count);
1199  auto crit_edge_rem =
1200  (blockSize() & (blockSize() - 1))
1201  ? ir_builder.CreateSRem(
1202  row_count,
1203  cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1204  : ir_builder.CreateAnd(
1205  row_count,
1206  cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1207  auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1208  crit_edge_threshold->setName("crit_edge_threshold");
1209 
1210  // only those threads where pos < crit_edge_threshold go through the
1211  // dynamic watchdog call
1212  call_watchdog_lv =
1213  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
1214  } else {
1215  // CPU path: run watchdog for every 64th row
1216  auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1217  call_watchdog_lv = ir_builder.CreateICmp(
1218  llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1219  }
1220  CHECK(call_watchdog_lv);
1221  auto error_check_bb = bb_it->splitBasicBlock(
1222  llvm::BasicBlock::iterator(br_instr), ".error_check");
1223  auto& watchdog_br_instr = bb_it->back();
1224 
1225  auto watchdog_check_bb = llvm::BasicBlock::Create(
1226  cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
1227  llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1228  auto detected_timeout = watchdog_ir_builder.CreateCall(
1229  cgen_state_->module_->getFunction("dynamic_watchdog"), {});
1230  auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1231  detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
1232  watchdog_ir_builder.CreateBr(error_check_bb);
1233 
1234  llvm::ReplaceInstWithInst(
1235  &watchdog_br_instr,
1236  llvm::BranchInst::Create(
1237  watchdog_check_bb, error_check_bb, call_watchdog_lv));
1238  ir_builder.SetInsertPoint(&br_instr);
1239  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1240 
1241  unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1242  unified_err_lv->addIncoming(err_lv, &*bb_it);
1243  err_lv = unified_err_lv;
1244  }
1245  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1246  err_lv =
1247  ir_builder.CreateCall(cgen_state_->module_->getFunction("record_error_code"),
1248  std::vector<llvm::Value*>{err_lv, error_code_arg});
1249  if (device_type == ExecutorDeviceType::GPU) {
1250  // let kernel execution finish as expected, regardless of the observed error,
1251  // unless it is from the dynamic watchdog where all threads within that block
1252  // return together.
1253  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1254  err_lv,
1255  cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
1256  } else {
1257  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
1258  err_lv,
1259  cgen_state_->llInt(static_cast<int32_t>(0)));
1260  }
1261  auto error_bb = llvm::BasicBlock::Create(
1262  cgen_state_->context_, ".error_exit", query_func, new_bb);
1263  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
1264  llvm::ReplaceInstWithInst(&br_instr,
1265  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
1266  done_splitting = true;
1267  break;
1268  }
1269  }
1270  }
1271  CHECK(done_splitting);
1272 }
1273 
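// Rewrites row_func_ into row_func_hoisted_literals: the literal loads hoisted
// into the query function become extra arguments, the original body is spliced
// over, and every __placeholder__literal_* instruction is replaced with the
// matching argument.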
1274 std::vector<llvm::Value*> Executor::inlineHoistedLiterals() {
1275  std::vector<llvm::Value*> hoisted_literals;
1276 
1277  // row_func_ uses literals whose definitions have been hoisted up to query_func_;
1278  // extend the row_func_ signature with extra arguments to pass these literal values.
1279  std::vector<llvm::Type*> row_process_arg_types;
1280 
1281  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1282  E = cgen_state_->row_func_->arg_end();
1283  I != E;
1284  ++I) {
1285  row_process_arg_types.push_back(I->getType());
1286  }
1287 
1288  for (auto& element : cgen_state_->query_func_literal_loads_) {
1289  for (auto value : element.second) {
1290  row_process_arg_types.push_back(value->getType());
1291  }
1292  }
1293 
1294  auto ft = llvm::FunctionType::get(
1295  get_int_type(32, cgen_state_->context_), row_process_arg_types, false);
1296  auto row_func_with_hoisted_literals =
1297  llvm::Function::Create(ft,
1298  llvm::Function::ExternalLinkage,
1299  "row_func_hoisted_literals",
1300  cgen_state_->row_func_->getParent());
1301 
1302  // make sure it's in-lined, we don't want register spills in the inner loop
1303  mark_function_always_inline(row_func_with_hoisted_literals);
1304 
1305  auto arg_it = row_func_with_hoisted_literals->arg_begin();
1306  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1307  E = cgen_state_->row_func_->arg_end();
1308  I != E;
1309  ++I) {
1310  if (I->hasName()) {
1311  arg_it->setName(I->getName());
1312  }
1313  ++arg_it;
1314  }
1315 
1316  std::unordered_map<int, std::vector<llvm::Value*>>
1317  query_func_literal_loads_function_arguments;
1318 
1319  for (auto& element : cgen_state_->query_func_literal_loads_) {
1320  std::vector<llvm::Value*> argument_values;
1321 
1322  for (auto value : element.second) {
1323  hoisted_literals.push_back(value);
1324  argument_values.push_back(&*arg_it);
1325  if (value->hasName()) {
1326  arg_it->setName("arg_" + value->getName());
1327  }
1328  ++arg_it;
1329  }
1330 
1331  query_func_literal_loads_function_arguments[element.first] = argument_values;
1332  }
1334  // copy the row_func function body over
1335  // see
1336  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
1337  row_func_with_hoisted_literals->getBasicBlockList().splice(
1338  row_func_with_hoisted_literals->begin(),
1339  cgen_state_->row_func_->getBasicBlockList());
1340 
1341  // also replace row_func arguments with the arguments from row_func_hoisted_literals
1342  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1343  E = cgen_state_->row_func_->arg_end(),
1344  I2 = row_func_with_hoisted_literals->arg_begin();
1345  I != E;
1346  ++I) {
1347  I->replaceAllUsesWith(&*I2);
1348  I2->takeName(&*I);
1349  ++I2;
1350  }
1351 
1352  cgen_state_->row_func_ = row_func_with_hoisted_literals;
1353 
1354  // and finally replace literal placeholders
1355  std::vector<llvm::Instruction*> placeholders;
1356  std::string prefix("__placeholder__literal_");
1357  for (auto it = llvm::inst_begin(row_func_with_hoisted_literals),
1358  e = llvm::inst_end(row_func_with_hoisted_literals);
1359  it != e;
1360  ++it) {
1361  if (it->hasName() && it->getName().startswith(prefix)) {
1362  auto offset_and_index_entry =
1363  cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
1364  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
1365 
1366  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
1367  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
1368 
1369  it->replaceAllUsesWith(
1370  query_func_literal_loads_function_arguments[lit_off][lit_idx]);
1371  placeholders.push_back(&*it);
1372  }
1373  }
1374  for (auto placeholder : placeholders) {
1375  placeholder->removeFromParent();
1376  }
1377 
1378  return hoisted_literals;
1379 }
1380 
1381 std::tuple<Executor::CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
1382 Executor::compileWorkUnit(const std::vector<InputTableInfo>& query_infos,
1383  const RelAlgExecutionUnit& ra_exe_unit,
1384  const CompilationOptions& co,
1385  const ExecutionOptions& eo,
1386  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
1387  const bool allow_lazy_fetch,
1388  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
1389  const size_t max_groups_buffer_entry_guess,
1390  const int8_t crt_min_byte_width,
1391  const bool has_cardinality_estimation,
1392  ColumnCacheMap& column_cache,
1393  RenderInfo* render_info) {
1394  auto timer = DEBUG_TIMER(__func__);
1395  nukeOldState(allow_lazy_fetch, query_infos, &ra_exe_unit);
1396 
1397  GroupByAndAggregate group_by_and_aggregate(
1398  this, co.device_type_, ra_exe_unit, query_infos, row_set_mem_owner);
1399  auto query_mem_desc =
1400  group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
1401  max_groups_buffer_entry_guess,
1402  crt_min_byte_width,
1403  render_info,
1404  eo.output_columnar_hint);
1405 
1406  if (query_mem_desc->getQueryDescriptionType() ==
1407  QueryDescriptionType::GroupByBaselineHash &&
1408  !has_cardinality_estimation &&
1409  (!render_info || !render_info->isPotentialInSituRender()) && !eo.just_explain) {
1410  throw CardinalityEstimationRequired();
1411  }
1412 
1413  const bool output_columnar = query_mem_desc->didOutputColumnar();
1414 
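  // std::set-backed count distinct buffers (and any count distinct descriptor when
  // literals are not hoisted) can only be updated on the host, hence the CPU fallback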
1415  if (co.device_type_ == ExecutorDeviceType::GPU) {
1416  const size_t num_count_distinct_descs =
1417  query_mem_desc->getCountDistinctDescriptorsSize();
1418  for (size_t i = 0; i < num_count_distinct_descs; i++) {
1419  const auto& count_distinct_descriptor =
1420  query_mem_desc->getCountDistinctDescriptor(i);
1421  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::StdSet ||
1422  (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
1423  !co.hoist_literals_)) {
1424  throw QueryMustRunOnCpu();
1425  }
1426  }
1427  }
1428 
1429  // Read the module template and target either CPU or GPU
1430  // by binding the stream position functions to the right implementation:
1431  // stride access for GPU, contiguous for CPU
1432  auto rt_module_copy = llvm::CloneModule(
1433 #if LLVM_VERSION_MAJOR >= 7
1434  *g_rt_module.get(),
1435 #else
1436  g_rt_module.get(),
1437 #endif
1438  cgen_state_->vmap_,
1439  [](const llvm::GlobalValue* gv) {
1440  auto func = llvm::dyn_cast<llvm::Function>(gv);
1441  if (!func) {
1442  return true;
1443  }
1444  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
1445  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage ||
1446  CodeGenerator::alwaysCloneRuntimeFunction(func));
1447  });
1448 
1449  if (co.device_type_ == ExecutorDeviceType::CPU) {
1450  if (is_udf_module_present(true)) {
1451  CodeGenerator::link_udf_module(udf_cpu_module, *rt_module_copy, cgen_state_.get());
1452  }
1453  if (is_rt_udf_module_present(true)) {
1454  CodeGenerator::link_udf_module(
1455  rt_udf_cpu_module, *rt_module_copy, cgen_state_.get());
1456  }
1456  }
1457  } else {
1458  rt_module_copy->setDataLayout(get_gpu_data_layout());
1459  rt_module_copy->setTargetTriple(get_gpu_target_triple_string());
1460 
1461  if (is_udf_module_present()) {
1462  llvm::Triple gpu_triple(udf_gpu_module->getTargetTriple());
1463 
1464  if (!gpu_triple.isNVPTX()) {
1465  throw QueryMustRunOnCpu();
1466  }
1467 
1468  CodeGenerator::link_udf_module(udf_gpu_module, *rt_module_copy, cgen_state_.get());
1469  }
1470  if (is_rt_udf_module_present()) {
1471  CodeGenerator::link_udf_module(
1472  rt_udf_gpu_module, *rt_module_copy, cgen_state_.get());
1473  }
1474  }
1475 
1476  cgen_state_->module_ = rt_module_copy.release();
1477 
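  // from here on all generated functions are emitted into the cloned runtime module,
  // now owned by cgen_state_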
1478  auto agg_fnames =
1479  get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
1480 
1481  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();
1482 
1483  const bool is_group_by{query_mem_desc->isGroupBy()};
1484  auto query_func = is_group_by ? query_group_by_template(cgen_state_->module_,
1485  co.hoist_literals_,
1486  *query_mem_desc,
1487  co.device_type_,
1488  ra_exe_unit.scan_limit)
1489  : query_template(cgen_state_->module_,
1490  agg_slot_count,
1491  co.hoist_literals_,
1492  !!ra_exe_unit.estimator);
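  // pos_start / group_buff_idx / pos_step are placeholder calls in the query template;
  // bind each of them to the corresponding implementation in the runtime module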
1493  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
1494  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
1495  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);
1496 
1497  cgen_state_->query_func_ = query_func;
1498  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
1499  &query_func->getEntryBlock().front());
1500 
1501  std::vector<llvm::Value*> col_heads;
1502  std::tie(cgen_state_->row_func_, col_heads) =
1503  create_row_function(ra_exe_unit.input_col_descs.size(),
1504  is_group_by ? 0 : agg_slot_count,
1505  co.hoist_literals_,
1506  query_func,
1507  cgen_state_->module_,
1508  cgen_state_->context_);
1509  CHECK(cgen_state_->row_func_);
1510  // make sure it's inlined; we don't want register spills in the inner loop
1511  mark_function_always_inline(cgen_state_->row_func_);
1512  auto bb =
1513  llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
1514  cgen_state_->ir_builder_.SetInsertPoint(bb);
1515  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
1516  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
1517  const auto join_loops =
1518  buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
1519  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
1520  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
1521  if (is_not_deleted_bb) {
1522  bb = is_not_deleted_bb;
1523  }
1524  if (!join_loops.empty()) {
1525  codegenJoinLoops(join_loops,
1526  body_execution_unit,
1527  group_by_and_aggregate,
1528  query_func,
1529  bb,
1530  *(query_mem_desc.get()),
1531  co,
1532  eo);
1533  } else {
1534  const bool can_return_error =
1535  compileBody(ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co);
1536  if (can_return_error || cgen_state_->needs_error_check_ || eo.with_dynamic_watchdog) {
1537  createErrorCheckControlFlow(query_func, eo.with_dynamic_watchdog, co.device_type_);
1538  }
1539  }
1540  std::vector<llvm::Value*> hoisted_literals;
1541 
1542  if (co.hoist_literals_) {
1543  VLOG(1) << "number of hoisted literals: "
1544  << cgen_state_->query_func_literal_loads_.size()
1545  << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
1546  << " bytes";
1547  }
1548 
1549  if (co.hoist_literals_ && !cgen_state_->query_func_literal_loads_.empty()) {
1550  // we have some hoisted literals...
1551  hoisted_literals = inlineHoistedLiterals();
1552  }
1553  // iterate through all the instructions in the query template function and
1554  // replace the call to the filter placeholder with a call to the actual filter
1555  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
1556  ++it) {
1557  if (!llvm::isa<llvm::CallInst>(*it)) {
1558  continue;
1559  }
1560  auto& filter_call = llvm::cast<llvm::CallInst>(*it);
1561  if (std::string(filter_call.getCalledFunction()->getName()) == "row_process") {
1562  std::vector<llvm::Value*> args;
1563  for (size_t i = 0; i < filter_call.getNumArgOperands(); ++i) {
1564  args.push_back(filter_call.getArgOperand(i));
1565  }
1566  args.insert(args.end(), col_heads.begin(), col_heads.end());
1567  args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
1568  // push the hoisted literal arguments, if any
1569  args.insert(args.end(), hoisted_literals.begin(), hoisted_literals.end());
1570 
1571  llvm::ReplaceInstWithInst(&filter_call,
1572  llvm::CallInst::Create(cgen_state_->row_func_, args, ""));
1573  break;
1574  }
1575  }
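  // the loop above stops at the first match: the query template contains a single
  // row_process placeholder call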
1576  plan_state_->init_agg_vals_ =
1577  init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);
1578 
1579  auto multifrag_query_func = cgen_state_->module_->getFunction(
1580  "multifrag_query" + std::string(co.hoist_literals_ ? "_hoisted_literals" : ""));
1581  CHECK(multifrag_query_func);
1582 
1583  bind_query(query_func,
1584  "query_stub" + std::string(co.hoist_literals_ ? "_hoisted_literals" : ""),
1585  multifrag_query_func,
1586  cgen_state_->module_);
1587 
1588  auto live_funcs =
1589  CodeGenerator::markDeadRuntimeFuncs(*cgen_state_->module_,
1590  {query_func, cgen_state_->row_func_},
1591  {multifrag_query_func});
1592 
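  // live_funcs holds the functions reachable from the query entry points; everything
  // else cloned from the runtime module may be dropped during optimization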
1593  std::string llvm_ir;
1594  if (eo.just_explain) {
1595  if (co.explain_type_ == ExecutorExplainType::Optimized) {
1596 #ifdef WITH_JIT_DEBUG
1597  throw std::runtime_error(
1598  "Explain optimized not available when JIT runtime debug symbols are enabled");
1599 #else
1600  optimize_ir(query_func, cgen_state_->module_, live_funcs, co);
1601 #endif // WITH_JIT_DEBUG
1602  }
1603  llvm_ir =
1604  serialize_llvm_object(query_func) + serialize_llvm_object(cgen_state_->row_func_);
1605  }
1606  verify_function_ir(cgen_state_->row_func_);
1607 
1608  LOG(IR) << query_mem_desc->toString() << "\nGenerated IR\n"
1609  << serialize_llvm_object(query_func)
1610  << serialize_llvm_object(cgen_state_->row_func_) << "\nEnd of IR";
1611 
1612  return std::make_tuple(
1613  Executor::CompilationResult{
1614  co.device_type_ == ExecutorDeviceType::CPU
1615  ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
1616  : optimizeAndCodegenGPU(query_func,
1617  multifrag_query_func,
1618  live_funcs,
1619  is_group_by || ra_exe_unit.estimator,
1620  cuda_mgr,
1621  co),
1622  cgen_state_->getLiterals(),
1623  output_columnar,
1624  llvm_ir},
1625  std::move(query_mem_desc));
1626 }
1627 
1628 llvm::BasicBlock* Executor::codegenSkipDeletedOuterTableRow(
1629  const RelAlgExecutionUnit& ra_exe_unit,
1630  const CompilationOptions& co) {
1631  CHECK(!ra_exe_unit.input_descs.empty());
1632  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
1633  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
1634  return nullptr;
1635  }
1636  const auto td = catalog_->getMetadataForTable(outer_input_desc.getTableId());
1637  CHECK(td);
1638  const auto deleted_cd = catalog_->getDeletedColumnIfRowsDeleted(td);
1639  if (!deleted_cd) {
1640  return nullptr;
1641  }
1642  CHECK(deleted_cd->columnType.is_boolean());
1643  const auto deleted_expr =
1644  makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
1645  outer_input_desc.getTableId(),
1646  deleted_cd->columnId,
1647  outer_input_desc.getNestLevel());
1648  CodeGenerator code_generator(this);
1649  const auto is_deleted =
1650  code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
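  // rows whose delete column evaluates to true return 0 immediately; surviving rows
  // continue in the is_not_deleted block created below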
1651  const auto is_deleted_bb = llvm::BasicBlock::Create(
1652  cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
1653  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
1654  cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
1655  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
1656  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
1657  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
1658  cgen_state_->ir_builder_.SetInsertPoint(bb);
1659  return bb;
1660 }
1661 
1662 bool Executor::compileBody(const RelAlgExecutionUnit& ra_exe_unit,
1663  GroupByAndAggregate& group_by_and_aggregate,
1664  const QueryMemoryDescriptor& query_mem_desc,
1665  const CompilationOptions& co) {
1666  // generate the code for the filter
1667  std::vector<Analyzer::Expr*> primary_quals;
1668  std::vector<Analyzer::Expr*> deferred_quals;
1669  bool short_circuited =
1670  CodeGenerator::prioritizeQuals(ra_exe_unit, primary_quals, deferred_quals);
1671  if (short_circuited) {
1672  VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
1673  << "short-circuited and deferred " << std::to_string(deferred_quals.size())
1674  << " quals";
1675  }
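  // cheap (primary) quals are ANDed into filter_lv up front; expensive (deferred)
  // quals run only when that filter passes, via the sc_true / sc_false blocks below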
1676  llvm::Value* filter_lv = cgen_state_->llBool(true);
1677  CodeGenerator code_generator(this);
1678  for (auto expr : primary_quals) {
1679  // Generate the filter for primary quals
1680  auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
1681  filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
1682  }
1683  CHECK(filter_lv->getType()->isIntegerTy(1));
1684  llvm::BasicBlock* sc_false{nullptr};
1685  if (!deferred_quals.empty()) {
1686  auto sc_true = llvm::BasicBlock::Create(
1687  cgen_state_->context_, "sc_true", cgen_state_->row_func_);
1688  sc_false = llvm::BasicBlock::Create(
1689  cgen_state_->context_, "sc_false", cgen_state_->row_func_);
1690  cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
1691  cgen_state_->ir_builder_.SetInsertPoint(sc_false);
1692  if (ra_exe_unit.join_quals.empty()) {
1693  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
1694  }
1695  cgen_state_->ir_builder_.SetInsertPoint(sc_true);
1696  filter_lv = cgen_state_->llBool(true);
1697  }
1698  for (auto expr : deferred_quals) {
1699  filter_lv = cgen_state_->ir_builder_.CreateAnd(
1700  filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
1701  }
1702 
1703  CHECK(filter_lv->getType()->isIntegerTy(1));
1704  return group_by_and_aggregate.codegen(filter_lv, sc_false, query_mem_desc, co);
1705 }
1706 
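 // "shallow" copy: private/internal runtime helpers keep their bodies, while
 // externally visible functions are cloned as declarations only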
1707 std::unique_ptr<llvm::Module> runtime_module_shallow_copy(CgenState* cgen_state) {
1708  return llvm::CloneModule(
1709 #if LLVM_VERSION_MAJOR >= 7
1710  *g_rt_module.get(),
1711 #else
1712  g_rt_module.get(),
1713 #endif
1714  cgen_state->vmap_,
1715  [](const llvm::GlobalValue* gv) {
1716  auto func = llvm::dyn_cast<llvm::Function>(gv);
1717  if (!func) {
1718  return true;
1719  }
1720  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
1721  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage);
1722  });
1723 }
1724 
1725 std::vector<llvm::Value*> generate_column_heads_load(const int num_columns,
1726  llvm::Value* byte_stream_arg,
1727  llvm::IRBuilder<>& ir_builder,
1728  llvm::LLVMContext& ctx) {
1729  CHECK(byte_stream_arg);
1730  const auto max_col_local_id = num_columns - 1;
1731 
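  // byte_stream_arg is an array of per-column buffer pointers; load one head pointer
  // per input column so row-level codegen can index into the column data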
1732  std::vector<llvm::Value*> col_heads;
1733  for (int col_id = 0; col_id <= max_col_local_id; ++col_id) {
1734  col_heads.emplace_back(ir_builder.CreateLoad(ir_builder.CreateGEP(
1735  byte_stream_arg, llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), col_id))));
1736  }
1737  return col_heads;
1738 }
1739 