NativeCodegen.cpp
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CodeGenerator.h"
18 #include "Execute.h"
22 #include "QueryTemplateGenerator.h"
23 
24 #include "Shared/mapdpath.h"
25 
26 #if LLVM_VERSION_MAJOR < 4
27 static_assert(false, "LLVM Version >= 4 is required.");
28 #endif
29 
30 #include <llvm/Bitcode/BitcodeReader.h>
31 #include <llvm/Bitcode/BitcodeWriter.h>
32 #include <llvm/ExecutionEngine/MCJIT.h>
33 #include <llvm/IR/Attributes.h>
34 #include <llvm/IR/GlobalValue.h>
35 #include <llvm/IR/InstIterator.h>
36 #include <llvm/IR/LegacyPassManager.h>
37 #include <llvm/IR/Verifier.h>
38 #include <llvm/IRReader/IRReader.h>
39 #include <llvm/Support/Casting.h>
40 #include <llvm/Support/FileSystem.h>
41 #include <llvm/Support/FormattedStream.h>
42 #include <llvm/Support/MemoryBuffer.h>
43 #include <llvm/Support/SourceMgr.h>
44 #include <llvm/Support/TargetRegistry.h>
45 #include <llvm/Support/TargetSelect.h>
46 #include <llvm/Support/raw_os_ostream.h>
47 #include <llvm/Transforms/IPO.h>
48 #include <llvm/Transforms/IPO/AlwaysInliner.h>
49 #include <llvm/Transforms/InstCombine/InstCombine.h>
50 #include <llvm/Transforms/Instrumentation.h>
51 #include <llvm/Transforms/Scalar.h>
52 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
53 #include <llvm/Transforms/Utils/Cloning.h>
54 #include "llvm/IR/IntrinsicInst.h"
55 #include "llvm/IR/Intrinsics.h"
56 
57 #if LLVM_VERSION_MAJOR >= 7
58 #include <llvm/Transforms/Scalar/InstSimplifyPass.h>
59 #include <llvm/Transforms/Utils.h>
60 #endif
61 #include <llvm/IRReader/IRReader.h>
62 #include <llvm/Linker/Linker.h>
63 #include <llvm/Support/SourceMgr.h>
64 #include <llvm/Support/raw_ostream.h>
65 
66 std::unique_ptr<llvm::Module> udf_gpu_module;
67 std::unique_ptr<llvm::Module> udf_cpu_module;
68 std::unique_ptr<llvm::Module> rt_udf_gpu_module;
69 std::unique_ptr<llvm::Module> rt_udf_cpu_module;
70 
71 extern std::unique_ptr<llvm::Module> g_rt_module;
72 namespace {
73 
74 #if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
75 void eliminate_dead_self_recursive_funcs(
76  llvm::Module& M,
77  const std::unordered_set<llvm::Function*>& live_funcs) {
78  std::vector<llvm::Function*> dead_funcs;
79  for (auto& F : M) {
80  bool bAlive = false;
81  if (live_funcs.count(&F)) {
82  continue;
83  }
84  for (auto U : F.users()) {
85  auto* C = llvm::dyn_cast<const llvm::CallInst>(U);
86  if (!C || C->getParent()->getParent() != &F) {
87  bAlive = true;
88  break;
89  }
90  }
91  if (!bAlive) {
92  dead_funcs.push_back(&F);
93  }
94  }
95  for (auto pFn : dead_funcs) {
96  pFn->eraseFromParent();
97  }
98 }
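
A function survives this pass if it is in live_funcs or if any of its users is something other than a call from its own body; a function with no users at all, or whose only uses are self-recursive calls, gets erased. A hypothetical C++ analogue of a function this pass would drop, assuming nothing in the module calls it:

    // Illustration only (not from this file): countdown()'s sole user is the
    // recursive call inside its own body, so if no live function references it,
    // eliminate_dead_self_recursive_funcs() erases it from the module.
    extern "C" int countdown(int n) {
      return n > 0 ? countdown(n - 1) : 0;
    }
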
99 
100 void optimize_ir(llvm::Function* query_func,
101  llvm::Module* module,
102  const std::unordered_set<llvm::Function*>& live_funcs,
103  const CompilationOptions& co) {
104  llvm::legacy::PassManager pass_manager;
105 
106  pass_manager.add(llvm::createAlwaysInlinerLegacyPass());
107  pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
108 #if LLVM_VERSION_MAJOR >= 7
109  pass_manager.add(llvm::createInstSimplifyLegacyPass());
110 #else
111  pass_manager.add(llvm::createInstructionSimplifierPass());
112 #endif
113  pass_manager.add(llvm::createInstructionCombiningPass());
114  pass_manager.add(llvm::createGlobalOptimizerPass());
115 
116  pass_manager.add(llvm::createLICMPass());
117  if (co.opt_level_ == ExecutorOptLevel::LoopStrengthReduction) {
118  pass_manager.add(llvm::createLoopStrengthReducePass());
119  }
120  pass_manager.run(*module);
121 
122  eliminate_dead_self_recursive_funcs(*module, live_funcs);
123 }
124 #endif
125 
126 } // namespace
127 
128 template <class T>
129 std::string serialize_llvm_object(const T* llvm_obj) {
130  std::stringstream ss;
131  llvm::raw_os_ostream os(ss);
132  llvm_obj->print(os);
133  os.flush();
134  return ss.str();
135 }
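
serialize_llvm_object works for any LLVM entity with a print(raw_ostream&) method, and the textual IR it returns is what the code caches below use as lookup keys. A minimal usage sketch (the "row_func" lookup is hypothetical):

    llvm::Function* func = module->getFunction("row_func");  // hypothetical name
    if (func) {
      const std::string ir_text = serialize_llvm_object(func);
      CodeCacheKey key{ir_text};  // the textual IR doubles as a cache key
    }
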
136 
138 ExecutionEngineWrapper::ExecutionEngineWrapper() {}
139 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine)
140  : execution_engine_(execution_engine) {}
141 
142 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine,
143  const CompilationOptions& co)
144  : execution_engine_(execution_engine) {
145  if (execution_engine_) {
146  if (co.register_intel_jit_listener_) {
147  intel_jit_listener_.reset(llvm::JITEventListener::createIntelJITEventListener());
148  CHECK(intel_jit_listener_);
149  execution_engine_->RegisterJITEventListener(intel_jit_listener_.get());
150  LOG(INFO) << "Registered IntelJITEventListener";
151  }
152  }
153 }
154 
155 ExecutionEngineWrapper& ExecutionEngineWrapper::operator=(
156  llvm::ExecutionEngine* execution_engine) {
157  execution_engine_.reset(execution_engine);
158  intel_jit_listener_ = nullptr;
159  return *this;
160 }
161 
162 void verify_function_ir(const llvm::Function* func) {
163  std::stringstream err_ss;
164  llvm::raw_os_ostream err_os(err_ss);
165  if (llvm::verifyFunction(*func, &err_os)) {
166  func->print(llvm::outs());
167  LOG(FATAL) << err_ss.str();
168  }
169 }
170 
171 std::vector<std::pair<void*, void*>> Executor::getCodeFromCache(const CodeCacheKey& key,
172  const CodeCache& cache) {
173  auto it = cache.find(key);
174  if (it != cache.cend()) {
175  delete cgen_state_->module_;
176  cgen_state_->module_ = it->second.second;
177  std::vector<std::pair<void*, void*>> native_functions;
178  for (auto& native_code : it->second.first) {
179  GpuCompilationContext* gpu_context = std::get<2>(native_code).get();
180  native_functions.emplace_back(std::get<0>(native_code),
181  gpu_context ? gpu_context->module() : nullptr);
182  }
183  return native_functions;
184  }
185  return {};
186 }
187 
188 void Executor::addCodeToCache(
189  const CodeCacheKey& key,
190  std::vector<std::tuple<void*, ExecutionEngineWrapper>> native_code,
191  llvm::Module* module,
192  CodeCache& cache) {
193  CHECK(!native_code.empty());
194  CodeCacheVal cache_val;
195  for (auto& native_func : native_code) {
196  cache_val.emplace_back(
197  std::get<0>(native_func), std::move(std::get<1>(native_func)), nullptr);
198  }
199  cache.put(key,
200  std::make_pair<decltype(cache_val), decltype(module)>(std::move(cache_val),
201  std::move(module)));
202 }
203 
204 void Executor::addCodeToCache(
205  const CodeCacheKey& key,
206  const std::vector<std::tuple<void*, GpuCompilationContext*>>& native_code,
207  llvm::Module* module,
208  CodeCache& cache) {
209  CHECK(!native_code.empty());
210  CodeCacheVal cache_val;
211  for (const auto& native_func : native_code) {
212  cache_val.emplace_back(
213  std::get<0>(native_func),
214  ExecutionEngineWrapper(),
215  std::unique_ptr<GpuCompilationContext>(std::get<1>(native_func)));
216  }
217  cache.put(key,
218  std::make_pair<decltype(cache_val), decltype(module)>(std::move(cache_val),
219  std::move(module)));
220 }
221 
222 ExecutionEngineWrapper CodeGenerator::generateNativeCPUCode(
223  llvm::Function* func,
224  const std::unordered_set<llvm::Function*>& live_funcs,
225  const CompilationOptions& co) {
226  auto module = func->getParent();
227  // run optimizations
228 #ifndef WITH_JIT_DEBUG
229  optimize_ir(func, module, live_funcs, co);
230 #endif // WITH_JIT_DEBUG
231 
232  auto init_err = llvm::InitializeNativeTarget();
233  CHECK(!init_err);
234 
235  llvm::InitializeAllTargetMCs();
236  llvm::InitializeNativeTargetAsmPrinter();
237  llvm::InitializeNativeTargetAsmParser();
238 
239  std::string err_str;
240  std::unique_ptr<llvm::Module> owner(module);
241  llvm::EngineBuilder eb(std::move(owner));
242  eb.setErrorStr(&err_str);
243  eb.setEngineKind(llvm::EngineKind::JIT);
244  llvm::TargetOptions to;
245  to.EnableFastISel = true;
246  eb.setTargetOptions(to);
247  if (co.opt_level_ == ExecutorOptLevel::ReductionJIT) {
248  eb.setOptLevel(llvm::CodeGenOpt::None);
249  }
250 
251  ExecutionEngineWrapper execution_engine(eb.create(), co);
252  CHECK(execution_engine.get());
253 
254  execution_engine->finalizeObject();
255 
256  return execution_engine;
257 }
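
For reference, the MCJIT pattern used above reduces to a few lines: hand module ownership to llvm::EngineBuilder, create the engine, finalize, then ask for a function pointer. A standalone sketch, assuming the native target has already been initialized as in the code above ("my_kernel" is a hypothetical function name):

    std::unique_ptr<llvm::Module> owned(module);  // EngineBuilder takes ownership
    llvm::EngineBuilder builder(std::move(owned));
    std::string error;
    builder.setErrorStr(&error).setEngineKind(llvm::EngineKind::JIT);
    std::unique_ptr<llvm::ExecutionEngine> engine(builder.create());
    CHECK(engine) << error;
    engine->finalizeObject();  // materialize machine code before taking pointers
    void* fn_ptr = engine->getPointerToFunction(module->getFunction("my_kernel"));
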
258 
259 std::vector<std::pair<void*, void*>> Executor::optimizeAndCodegenCPU(
260  llvm::Function* query_func,
261  llvm::Function* multifrag_query_func,
262  const std::unordered_set<llvm::Function*>& live_funcs,
263  const CompilationOptions& co) {
264  auto module = multifrag_query_func->getParent();
265  CodeCacheKey key{serialize_llvm_object(query_func),
266  serialize_llvm_object(cgen_state_->row_func_)};
267  for (const auto helper : cgen_state_->helper_functions_) {
268  key.push_back(serialize_llvm_object(helper));
269  }
270  auto cached_code = getCodeFromCache(key, cpu_code_cache_);
271  if (!cached_code.empty()) {
272  return cached_code;
273  }
274 
275  auto execution_engine =
276  CodeGenerator::generateNativeCPUCode(query_func, live_funcs, co);
277  auto native_code = execution_engine->getPointerToFunction(multifrag_query_func);
278  CHECK(native_code);
279 
280  std::vector<std::tuple<void*, ExecutionEngineWrapper>> cache;
281  cache.emplace_back(native_code, std::move(execution_engine));
282  addCodeToCache(key, std::move(cache), module, cpu_code_cache_);
283 
284  return {std::make_pair(native_code, nullptr)};
285 }
286 
287 void CodeGenerator::link_udf_module(const std::unique_ptr<llvm::Module>& udf_module,
288  llvm::Module& module,
289  CgenState* cgen_state,
290  llvm::Linker::Flags flags) {
291  // Throw a runtime error if the target module contains functions
292  // with the same name as functions in the UDF module.
293  for (auto& f : *udf_module.get()) {
294  auto func = module.getFunction(f.getName());
295  if (func != nullptr && !f.isDeclaration() && flags == llvm::Linker::Flags::None) {
296  LOG(ERROR) << " Attempt to overwrite " << f.getName().str() << " in "
297  << module.getModuleIdentifier() << " from `"
298  << udf_module->getModuleIdentifier() << "`" << std::endl;
299  throw std::runtime_error(
300  "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
301  "function ***");
302  } else {
303  LOG(INFO) << " Adding " << f.getName().str() << " to "
304  << module.getModuleIdentifier() << " from `"
305  << udf_module->getModuleIdentifier() << "`" << std::endl;
306  }
307  }
308 
309  std::unique_ptr<llvm::Module> udf_module_copy;
310 
311  udf_module_copy = llvm::CloneModule(
312 #if LLVM_VERSION_MAJOR >= 7
313  *udf_module.get(),
314 #else
315  udf_module.get(),
316 #endif
317  cgen_state->vmap_);
318 
319  udf_module_copy->setDataLayout(module.getDataLayout());
320  udf_module_copy->setTargetTriple(module.getTargetTriple());
321 
322  // Initialize linker with module for RuntimeFunctions.bc
323  llvm::Linker ld(module);
324  bool link_error = false;
325 
326  link_error = ld.linkInModule(std::move(udf_module_copy), flags);
327 
328  if (link_error) {
329  throw std::runtime_error("link_udf_module: *** error linking module ***");
330  }
331 }
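
The core step above is llvm::Linker::linkInModule, which destructively merges the source module into the destination and returns true on error. A minimal sketch, assuming both modules live in the same LLVMContext and using the LLVM 7+ CloneModule signature:

    void link_copy(llvm::Module& dst, const llvm::Module& src) {
      llvm::Linker linker(dst);  // dst is the module linked *into*
      // CloneModule keeps the caller's copy intact; linkInModule consumes its argument.
      if (linker.linkInModule(llvm::CloneModule(src), llvm::Linker::Flags::None)) {
        throw std::runtime_error("linking failed");
      }
    }
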
332 
333 namespace {
334 
335 std::string cpp_to_llvm_name(const std::string& s) {
336  if (s == "int8_t") {
337  return "i8";
338  }
339  if (s == "int16_t") {
340  return "i16";
341  }
342  if (s == "int32_t") {
343  return "i32";
344  }
345  if (s == "int64_t") {
346  return "i64";
347  }
348  CHECK(s == "float" || s == "double");
349  return s;
350 }
351 
352 std::string gen_array_any_all_sigs() {
353  std::string result;
354  for (const std::string any_or_all : {"any", "all"}) {
355  for (const std::string elem_type :
356  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
357  for (const std::string needle_type :
358  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
359  for (const std::string op_name : {"eq", "ne", "lt", "le", "gt", "ge"}) {
360  result += ("declare i1 @array_" + any_or_all + "_" + op_name + "_" + elem_type +
361  "_" + needle_type + "(i8*, i64, " + cpp_to_llvm_name(needle_type) +
362  ", " + cpp_to_llvm_name(elem_type) + ");\n");
363  }
364  }
365  }
366  }
367  return result;
368 }
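
Each innermost iteration emits one declaration. For example, the any/eq combination with int32_t elements and an int64_t needle produces the line below; note that the needle's LLVM type precedes the element's in the parameter list:

    declare i1 @array_any_eq_int32_t_int64_t(i8*, i64, i64, i32);
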
369 
370 std::string gen_translate_null_key_sigs() {
371  std::string result;
372  for (const std::string key_type : {"int8_t", "int16_t", "int32_t", "int64_t"}) {
373  const auto key_llvm_type = cpp_to_llvm_name(key_type);
374  result += "declare i64 @translate_null_key_" + key_type + "(" + key_llvm_type + ", " +
375  key_llvm_type + ", i64);\n";
376  }
377  return result;
378 }
379 
380 const std::string cuda_rt_decls =
381  R"( declare void @llvm.dbg.declare(metadata, metadata, metadata) declare void @llvm.dbg.value(metadata, metadata, metadata) declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind declare i32 @pos_start_impl(i32*); declare i32 @group_buff_idx_impl(); declare i32 @pos_step_impl(); declare i8 @thread_warp_idx(i8); declare i64* @init_shared_mem(i64*, i32); declare i64* @init_shared_mem_nop(i64*, i32); declare i64* @init_shared_mem_dynamic(i64*, i32); declare i64* @alloc_shared_mem_dynamic(); declare void @set_shared_mem_to_identity(i64*, i32, i64); declare void @write_back(i64*, i64*, i32); declare void @write_back_smem_nop(i64*, i64*, i32); declare void @write_back_nop(i64*, i64*, i32); declare void @agg_from_smem_to_gmem_nop(i64*, i64*, i32); declare void @agg_from_smem_to_gmem_binId_count(i64*, i64*, i32); declare void @agg_from_smem_to_gmem_count_binId(i64*, i64*, i32); declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8); declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32, i64*); declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32, i64*); declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32); declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32); declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32); declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32); declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64); declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64); declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64); declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64); declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64); declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double); declare i64 @get_bucket_key_for_range_double(i8*, i64, double); declare i64 @agg_count_shared(i64*, i64); declare i64 @agg_count_skip_val_shared(i64*, i64, i64); declare i32 @agg_count_int32_shared(i32*, i32); declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32); declare i64 @agg_count_double_shared(i64*, double); declare i64 @agg_count_double_skip_val_shared(i64*, double, double); declare i32 @agg_count_float_shared(i32*, float); declare i32 @agg_count_float_skip_val_shared(i32*, float, float); declare i64 @agg_sum_shared(i64*, i64); declare i64 @agg_sum_skip_val_shared(i64*, i64, i64); declare i32 @agg_sum_int32_shared(i32*, i32); declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32); declare void @agg_sum_double_shared(i64*, double); declare void @agg_sum_double_skip_val_shared(i64*, double, double); declare void @agg_sum_float_shared(i32*, float); declare void @agg_sum_float_skip_val_shared(i32*, float, float); declare void @agg_max_shared(i64*, i64); declare void @agg_max_skip_val_shared(i64*, i64, i64); declare void @agg_max_int32_shared(i32*, i32); declare void @agg_max_int32_skip_val_shared(i32*, i32, i32); declare void @agg_max_int16_shared(i16*, i16); declare void @agg_max_int16_skip_val_shared(i16*, i16, i16); declare void @agg_max_int8_shared(i8*, i8); declare void @agg_max_int8_skip_val_shared(i8*, i8, i8); declare void @agg_max_double_shared(i64*, double); declare void @agg_max_double_skip_val_shared(i64*, double, double); declare void @agg_max_float_shared(i32*, float); declare void 
@agg_max_float_skip_val_shared(i32*, float, float); declare void @agg_min_shared(i64*, i64); declare void @agg_min_skip_val_shared(i64*, i64, i64); declare void @agg_min_int32_shared(i32*, i32); declare void @agg_min_int32_skip_val_shared(i32*, i32, i32); declare void @agg_min_int16_shared(i16*, i16); declare void @agg_min_int16_skip_val_shared(i16*, i16, i16); declare void @agg_min_int8_shared(i8*, i8); declare void @agg_min_int8_skip_val_shared(i8*, i8, i8); declare void @agg_min_double_shared(i64*, double); declare void @agg_min_double_skip_val_shared(i64*, double, double); declare void @agg_min_float_shared(i32*, float); declare void @agg_min_float_skip_val_shared(i32*, float, float); declare void @agg_id_shared(i64*, i64); declare void @agg_id_int32_shared(i32*, i32); declare void @agg_id_int16_shared(i16*, i16); declare void @agg_id_int8_shared(i8*, i8); declare void @agg_id_double_shared(i64*, double); declare void @agg_id_double_shared_slow(i64*, double*); declare void @agg_id_float_shared(i32*, float); declare i1 @slotEmptyKeyCAS(i64*, i64, i64); declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32); declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16); declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8); declare i64 @ExtractFromTime(i32, i64); declare i64 @ExtractFromTimeNullable(i32, i64, i64); declare i64 @DateTruncate(i32, i64); declare i64 @DateTruncateNullable(i32, i64, i64); declare i64 @DateTruncateHighPrecisionToDate(i64, i64); declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64); declare i64 @DateTruncateAlterPrecisionScaleUp(i64, i64); declare i64 @DateTruncateAlterPrecisionScaleDown(i64, i64); declare i64 @DateTruncateAlterPrecisionScaleUpNullable(i64, i64, i64); declare i64 @DateTruncateAlterPrecisionScaleDownNullable(i64, i64, i64); declare i64 @DateDiff(i32, i64, i64); declare i64 @DateDiffNullable(i32, i64, i64, i64); declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i64, i64, i64); declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i64, i64, i64, i64); declare i64 @DateAdd(i32, i64, i64); declare i64 @DateAddNullable(i32, i64, i64, i64); declare i64 @DateAddHighPrecision(i32, i64, i64, i64); declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i64, i64); declare i64 @string_decode(i8*, i64); declare i32 @array_size(i8*, i64, i32); declare i32 @array_size_nullable(i8*, i64, i32, i32); declare i32 @fast_fixlen_array_size(i8*, i32); declare i1 @array_is_null(i8*, i64); declare i8* @array_buff(i8*, i64); declare i8* @fast_fixlen_array_buff(i8*, i64); declare i8 @array_at_int8_t(i8*, i64, i32); declare i16 @array_at_int16_t(i8*, i64, i32); declare i32 @array_at_int32_t(i8*, i64, i32); declare i64 @array_at_int64_t(i8*, i64, i32); declare float @array_at_float(i8*, i64, i32); declare double @array_at_double(i8*, i64, i32); declare i8 @varlen_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_array_at_int64_t(i8*, i64, i32); declare float @varlen_array_at_float(i8*, i64, i32); declare double @varlen_array_at_double(i8*, i64, i32); declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32); declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32); declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32); declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32); declare float @varlen_notnull_array_at_float(i8*, i64, i32); declare double @varlen_notnull_array_at_double(i8*, i64, i32); declare i8 @array_at_int8_t_checked(i8*, i64, i64, 
i8); declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16); declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32); declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64); declare float @array_at_float_checked(i8*, i64, i64, float); declare double @array_at_double_checked(i8*, i64, i64, double); declare i32 @char_length(i8*, i32); declare i32 @char_length_nullable(i8*, i32, i32); declare i32 @char_length_encoded(i8*, i32); declare i32 @char_length_encoded_nullable(i8*, i32, i32); declare i32 @key_for_string_encoded(i32); declare i1 @string_like(i8*, i32, i8*, i32, i8); declare i1 @string_ilike(i8*, i32, i8*, i32, i8); declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8); declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8); declare i1 @string_like_simple(i8*, i32, i8*, i32); declare i1 @string_ilike_simple(i8*, i32, i8*, i32); declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8); declare i1 @string_lt(i8*, i32, i8*, i32); declare i1 @string_le(i8*, i32, i8*, i32); declare i1 @string_gt(i8*, i32, i8*, i32); declare i1 @string_ge(i8*, i32, i8*, i32); declare i1 @string_eq(i8*, i32, i8*, i32); declare i1 @string_ne(i8*, i32, i8*, i32); declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8); declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8); declare i1 @regexp_like(i8*, i32, i8*, i32, i8); declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8); declare void @linear_probabilistic_count(i8*, i32, i8*, i32); declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, i64); declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64); declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64); declare i32 @record_error_code(i32, i32*); declare i1 @dynamic_watchdog(); declare void @force_sync(); declare void @sync_warp(); declare void @sync_warp_protected(i64, i64); declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32); declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64); declare i64* @get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, float, float); declare i64* @get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double); )" + gen_array_any_all_sigs() +
382  gen_translate_null_key_sigs();
383 
384 #ifdef HAVE_CUDA
385 std::string extension_function_decls(const std::unordered_set<std::string>& udf_decls) {
386  const auto decls = ExtensionFunctionsWhitelist::getLLVMDeclarations(udf_decls);
387  return boost::algorithm::join(decls, "\n");
388 }
389 
390 void legalize_nvvm_ir(llvm::Function* query_func) {
391  // optimizations might add attributes to the function
392  // and NVPTX doesn't understand all of them; play it
393  // safe and clear all attributes
394  clear_function_attributes(query_func);
395  verify_function_ir(query_func);
396 
397  std::vector<llvm::Instruction*> stackrestore_intrinsics;
398  std::vector<llvm::Instruction*> stacksave_intrinsics;
399  for (auto& BB : *query_func) {
400  for (llvm::Instruction& I : BB) {
401  if (const llvm::IntrinsicInst* II = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
402  if (II->getIntrinsicID() == llvm::Intrinsic::stacksave) {
403  stacksave_intrinsics.push_back(&I);
404  } else if (II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
405  stackrestore_intrinsics.push_back(&I);
406  }
407  }
408  }
409  }
410 
411  // stacksave and stackrestore intrinsics appear in pairs, and
412  // stackrestore uses the result of stacksave as its argument,
413  // so stackrestore must be removed first.
414  for (auto& II : stackrestore_intrinsics) {
415  II->eraseFromParent();
416  }
417  for (auto& II : stacksave_intrinsics) {
418  II->eraseFromParent();
419  }
420 }
421 #endif // HAVE_CUDA
422 
423 } // namespace
424 
425 llvm::StringRef get_gpu_target_triple_string() {
426  return llvm::StringRef("nvptx64-nvidia-cuda");
427 }
428 
429 llvm::StringRef get_gpu_data_layout() {
430  return llvm::StringRef(
431  "e-p:64:64:64-i1:8:8-i8:8:8-"
432  "i16:16:16-i32:32:32-i64:64:64-"
433  "f32:32:32-f64:64:64-v16:16:16-"
434  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
435 }
436 
437 std::map<std::string, std::string> get_device_parameters() {
438  std::map<std::string, std::string> result;
439 
440  result.insert(std::make_pair("cpu_name", llvm::sys::getHostCPUName()));
441  result.insert(std::make_pair("cpu_triple", llvm::sys::getProcessTriple()));
442  result.insert(
443  std::make_pair("cpu_cores", std::to_string(llvm::sys::getHostNumPhysicalCores())));
444  result.insert(std::make_pair("cpu_threads", std::to_string(cpu_threads())));
445 
446  llvm::StringMap<bool> cpu_features;
447  if (llvm::sys::getHostCPUFeatures(cpu_features)) {
448  std::string features_str = "";
449  for (auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
450  features_str += (it->getValue() ? " +" : " -");
451  features_str += it->getKey().str();
452  }
453  result.insert(std::make_pair("cpu_features", features_str));
454  }
455 
456 #ifdef HAVE_CUDA
457  int device_count = 0;
458  checkCudaErrors(cuDeviceGetCount(&device_count));
459  if (device_count) {
460  CUdevice device{};
461  char device_name[256];
462  int major = 0, minor = 0;
463  checkCudaErrors(cuDeviceGet(&device, 0)); // assuming homogeneous multi-GPU system
464  checkCudaErrors(cuDeviceGetName(device_name, 256, device));
465  checkCudaErrors(cuDeviceGetAttribute(
466  &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
467  checkCudaErrors(cuDeviceGetAttribute(
468  &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
469 
470  result.insert(std::make_pair("gpu_name", device_name));
471  result.insert(std::make_pair("gpu_count", std::to_string(device_count)));
472  result.insert(std::make_pair("gpu_compute_capability",
473  std::to_string(major) + "." + std::to_string(minor)));
474  result.insert(std::make_pair("gpu_triple", get_gpu_target_triple_string()));
475  result.insert(std::make_pair("gpu_datalayout", get_gpu_data_layout()));
476  }
477 #endif
478 
479  return result;
480 }
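
A caller might dump the returned map for diagnostics; a small usage sketch:

    for (const auto& kv : get_device_parameters()) {
      LOG(INFO) << kv.first << ": " << kv.second;  // e.g. "cpu_cores: 16"
    }
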
481 
482 CodeGenerator::GPUCode CodeGenerator::generateNativeGPUCode(
483  llvm::Function* func,
484  llvm::Function* wrapper_func,
485  const std::unordered_set<llvm::Function*>& live_funcs,
486  const CompilationOptions& co,
487  const GPUTarget& gpu_target) {
488 #ifdef HAVE_CUDA
489  auto module = func->getParent();
490  module->setDataLayout(
491  "e-p:64:64:64-i1:8:8-i8:8:8-"
492  "i16:16:16-i32:32:32-i64:64:64-"
493  "f32:32:32-f64:64:64-v16:16:16-"
494  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
495  module->setTargetTriple("nvptx64-nvidia-cuda");
496  // run optimizations
497  optimize_ir(func, module, live_funcs, co);
498  legalize_nvvm_ir(func);
499 
500  std::stringstream ss;
501  llvm::raw_os_ostream os(ss);
502 
503  llvm::LLVMContext& ctx = module->getContext();
504  // Get "nvvm.annotations" metadata node
505  llvm::NamedMDNode* md = module->getOrInsertNamedMetadata("nvvm.annotations");
506 
507  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
508  llvm::MDString::get(ctx, "kernel"),
509  llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
510  llvm::Type::getInt32Ty(ctx), 1))};
511 
512  // Append metadata to nvvm.annotations
513  md->addOperand(llvm::MDNode::get(ctx, md_vals));
514 
515  std::unordered_set<llvm::Function*> roots{wrapper_func, func};
516  if (gpu_target.row_func_not_inlined) {
517  clear_function_attributes(gpu_target.cgen_state->row_func_);
518  roots.insert(gpu_target.cgen_state->row_func_);
519  }
520 
521  // Prevent the udf function(s) from being removed the way the runtime functions are
522 
523  std::unordered_set<std::string> udf_declarations;
524  if (is_udf_module_present()) {
525  for (auto& f : udf_gpu_module->getFunctionList()) {
526  llvm::Function* udf_function = module->getFunction(f.getName());
527 
528  if (udf_function) {
529  legalize_nvvm_ir(udf_function);
530  roots.insert(udf_function);
531 
532  // If we have a UDF that declares an external function,
533  // note it so we can avoid duplicate declarations.
534  if (f.isDeclaration()) {
535  udf_declarations.insert(f.getName().str());
536  }
537  }
538  }
539  }
540 
541  if (is_rt_udf_module_present()) {
542  for (auto& f : rt_udf_gpu_module->getFunctionList()) {
543  llvm::Function* udf_function = module->getFunction(f.getName());
544  if (udf_function) {
545  legalize_nvvm_ir(udf_function);
546  roots.insert(udf_function);
547 
548  // If we have a UDF that declares an external function,
549  // note it so we can avoid duplicate declarations.
550  if (f.isDeclaration()) {
551  udf_declarations.insert(f.getName().str());
552  }
553  }
554  }
555  }
556 
557  std::vector<llvm::Function*> rt_funcs;
558  for (auto& Fn : *module) {
559  if (roots.count(&Fn)) {
560  continue;
561  }
562  rt_funcs.push_back(&Fn);
563  }
564  for (auto& pFn : rt_funcs) {
565  pFn->removeFromParent();
566  }
567  module->print(os, nullptr);
568  os.flush();
569 
570  for (auto& pFn : rt_funcs) {
571  module->getFunctionList().push_back(pFn);
572  }
573  module->eraseNamedMetadata(md);
574 
575  auto cuda_llir = cuda_rt_decls + extension_function_decls(udf_declarations) + ss.str();
576 
577  std::vector<std::pair<void*, void*>> native_functions;
578  std::vector<std::tuple<void*, GpuCompilationContext*>> cached_functions;
579 
580  const auto ptx =
581  generatePTX(cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state);
582 
583  LOG(PTX) << "PTX for the GPU:\n" << ptx << "\nEnd of PTX";
584 
585  auto cubin_result = ptx_to_cubin(ptx, gpu_target.block_size, gpu_target.cuda_mgr);
586  auto& option_keys = cubin_result.option_keys;
587  auto& option_values = cubin_result.option_values;
588  auto cubin = cubin_result.cubin;
589  auto link_state = cubin_result.link_state;
590  const auto num_options = option_keys.size();
591 
592  auto func_name = wrapper_func->getName().str();
593  for (int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
594  ++device_id) {
595  auto gpu_context = new GpuCompilationContext(cubin,
596  func_name,
597  device_id,
598  gpu_target.cuda_mgr,
599  num_options,
600  &option_keys[0],
601  &option_values[0]);
602  auto native_code = gpu_context->kernel();
603  auto native_module = gpu_context->module();
604  CHECK(native_code);
605  CHECK(native_module);
606  native_functions.emplace_back(native_code, native_module);
607  cached_functions.emplace_back(native_code, gpu_context);
608  }
609 
610  checkCudaErrors(cuLinkDestroy(link_state));
611 
612  return {native_functions, cached_functions};
613 #else
614  return {};
615 #endif
616 }
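
The nvvm.annotations metadata built in this function is how NVVM distinguishes kernels (GPU entry points) from ordinary device functions. In the printed IR it comes out roughly as follows (an illustration, with the function type abbreviated):

    !nvvm.annotations = !{!0}
    !0 = !{void (...)* @multifrag_query, !"kernel", i32 1}
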
617 
618 std::vector<std::pair<void*, void*>> Executor::optimizeAndCodegenGPU(
619  llvm::Function* query_func,
620  llvm::Function* multifrag_query_func,
621  std::unordered_set<llvm::Function*>& live_funcs,
622  const bool no_inline,
623  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
624  const CompilationOptions& co) {
625 #ifdef HAVE_CUDA
626  auto module = multifrag_query_func->getParent();
627  CHECK(cuda_mgr);
628  CodeCacheKey key{serialize_llvm_object(query_func),
629  serialize_llvm_object(cgen_state_->row_func_)};
630  for (const auto helper : cgen_state_->helper_functions_) {
631  key.push_back(serialize_llvm_object(helper));
632  }
633  auto cached_code = getCodeFromCache(key, gpu_code_cache_);
634  if (!cached_code.empty()) {
635  return cached_code;
636  }
637 
638  bool row_func_not_inlined = false;
639  if (no_inline) {
640  for (auto it = llvm::inst_begin(cgen_state_->row_func_),
641  e = llvm::inst_end(cgen_state_->row_func_);
642  it != e;
643  ++it) {
644  if (llvm::isa<llvm::CallInst>(*it)) {
645  auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
646  if (get_gv_call.getCalledFunction()->getName() == "get_group_value" ||
647  get_gv_call.getCalledFunction()->getName() ==
648  "get_group_value_with_watchdog" ||
649  get_gv_call.getCalledFunction()->getName() ==
650  "get_matching_group_value_perfect_hash" ||
651  get_gv_call.getCalledFunction()->getName() == "array_size" ||
652  get_gv_call.getCalledFunction()->getName() == "linear_probabilistic_count") {
653  mark_function_never_inline(cgen_state_->row_func_);
654  row_func_not_inlined = true;
655  break;
656  }
657  }
658  }
659  }
660 
661  initializeNVPTXBackend();
662  CodeGenerator::GPUTarget gpu_target{nvptx_target_machine_.get(),
663  cuda_mgr,
664  blockSize(),
665  cgen_state_.get(),
666  row_func_not_inlined};
667  const auto gpu_code = CodeGenerator::generateNativeGPUCode(
668  query_func, multifrag_query_func, live_funcs, co, gpu_target);
669 
670  addCodeToCache(key, gpu_code.cached_functions, module, gpu_code_cache_);
671 
672  return gpu_code.native_functions;
673 #else
674  return {};
675 #endif
676 }
677 
678 std::string CodeGenerator::generatePTX(const std::string& cuda_llir,
679  llvm::TargetMachine* nvptx_target_machine,
680  CgenState* cgen_state) {
681  auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir, "", false);
682 
683  llvm::SMDiagnostic err;
684 
685  auto module = llvm::parseIR(mem_buff->getMemBufferRef(), err, cgen_state->context_);
686  if (!module) {
687  LOG(FATAL) << err.getMessage().str();
688  }
689 
690  llvm::SmallString<256> code_str;
691  llvm::raw_svector_ostream formatted_os(code_str);
692  CHECK(nvptx_target_machine);
693  {
694  llvm::legacy::PassManager ptxgen_pm;
695  module->setDataLayout(nvptx_target_machine->createDataLayout());
696 
697 #if LLVM_VERSION_MAJOR >= 7
698  nvptx_target_machine->addPassesToEmitFile(
699  ptxgen_pm, formatted_os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
700 #else
701  nvptx_target_machine->addPassesToEmitFile(
702  ptxgen_pm, formatted_os, llvm::TargetMachine::CGFT_AssemblyFile);
703 #endif
704  ptxgen_pm.run(*module);
705  }
706 
707  return code_str.str();
708 }
709 
710 std::unique_ptr<llvm::TargetMachine> CodeGenerator::initializeNVPTXBackend() {
711  llvm::InitializeAllTargets();
712  llvm::InitializeAllTargetMCs();
713  llvm::InitializeAllAsmPrinters();
714  std::string err;
715  auto target = llvm::TargetRegistry::lookupTarget("nvptx64", err);
716  if (!target) {
717  LOG(FATAL) << err;
718  }
719  return std::unique_ptr<llvm::TargetMachine>(target->createTargetMachine(
720  "nvptx64-nvidia-cuda", "sm_30", "", llvm::TargetOptions(), llvm::Reloc::Static));
721 }
722 
723 std::string Executor::generatePTX(const std::string& cuda_llir) const {
724  return CodeGenerator::generatePTX(
725  cuda_llir, nvptx_target_machine_.get(), cgen_state_.get());
726 }
727 
728 void Executor::initializeNVPTXBackend() const {
729  if (nvptx_target_machine_) {
730  return;
731  }
732  nvptx_target_machine_ = CodeGenerator::initializeNVPTXBackend();
733 }
734 
735 // A small number of runtime functions don't get through CgenState::emitCall. List them
736 // explicitly here and always clone their implementation from the runtime module.
737 bool CodeGenerator::alwaysCloneRuntimeFunction(const llvm::Function* func) {
738  return func->getName() == "query_stub_hoisted_literals" ||
739  func->getName() == "multifrag_query_hoisted_literals" ||
740  func->getName() == "query_stub" || func->getName() == "multifrag_query" ||
741  func->getName() == "fixed_width_int_decode" ||
742  func->getName() == "fixed_width_unsigned_decode" ||
743  func->getName() == "diff_fixed_width_int_decode" ||
744  func->getName() == "fixed_width_double_decode" ||
745  func->getName() == "fixed_width_float_decode" ||
746  func->getName() == "fixed_width_small_date_decode" ||
747  func->getName() == "record_error_code";
748 }
749 
750 llvm::Module* read_template_module(llvm::LLVMContext& context) {
751  llvm::SMDiagnostic err;
752 
753  auto buffer_or_error = llvm::MemoryBuffer::getFile(mapd_root_abs_path() +
754  "/QueryEngine/RuntimeFunctions.bc");
755  CHECK(!buffer_or_error.getError());
756  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
757 
758  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
759  CHECK(!owner.takeError());
760  auto module = owner.get().release();
761  CHECK(module);
762 
763  return module;
764 }
765 
766 namespace {
767 
768 void bind_pos_placeholders(const std::string& pos_fn_name,
769  const bool use_resume_param,
770  llvm::Function* query_func,
771  llvm::Module* module) {
772  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
773  ++it) {
774  if (!llvm::isa<llvm::CallInst>(*it)) {
775  continue;
776  }
777  auto& pos_call = llvm::cast<llvm::CallInst>(*it);
778  if (std::string(pos_call.getCalledFunction()->getName()) == pos_fn_name) {
779  if (use_resume_param) {
780  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
781  llvm::ReplaceInstWithInst(
782  &pos_call,
783  llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl"),
784  error_code_arg));
785  } else {
786  llvm::ReplaceInstWithInst(
787  &pos_call,
788  llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl")));
789  }
790  break;
791  }
792  }
793 }
795 void set_row_func_argnames(llvm::Function* row_func,
796  const size_t in_col_count,
797  const size_t agg_col_count,
798  const bool hoist_literals) {
799  auto arg_it = row_func->arg_begin();
800 
801  if (agg_col_count) {
802  for (size_t i = 0; i < agg_col_count; ++i) {
803  arg_it->setName("out");
804  ++arg_it;
805  }
806  } else {
807  arg_it->setName("group_by_buff");
808  ++arg_it;
809  arg_it->setName("crt_matched");
810  ++arg_it;
811  arg_it->setName("total_matched");
812  ++arg_it;
813  arg_it->setName("old_total_matched");
814  ++arg_it;
815  arg_it->setName("max_matched");
816  ++arg_it;
817  }
818 
819  arg_it->setName("agg_init_val");
820  ++arg_it;
821 
822  arg_it->setName("pos");
823  ++arg_it;
824 
825  arg_it->setName("frag_row_off");
826  ++arg_it;
827 
828  arg_it->setName("num_rows_per_scan");
829  ++arg_it;
830 
831  if (hoist_literals) {
832  arg_it->setName("literals");
833  ++arg_it;
834  }
835 
836  for (size_t i = 0; i < in_col_count; ++i) {
837  arg_it->setName("col_buf" + std::to_string(i));
838  ++arg_it;
839  }
840 
841  arg_it->setName("join_hash_tables");
842 }
843 
844 std::pair<llvm::Function*, std::vector<llvm::Value*>> create_row_function(
845  const size_t in_col_count,
846  const size_t agg_col_count,
847  const bool hoist_literals,
848  llvm::Function* query_func,
849  llvm::Module* module,
850  llvm::LLVMContext& context) {
851  std::vector<llvm::Type*> row_process_arg_types;
852 
853  if (agg_col_count) {
854  // output (aggregate) arguments
855  for (size_t i = 0; i < agg_col_count; ++i) {
856  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
857  }
858  } else {
859  // group by buffer
860  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
861  // current match count
862  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
863  // total match count passed from the caller
864  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
865  // old total match count returned to the caller
866  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
867  // max matched (total number of slots in the output buffer)
868  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
869  }
870 
871  // aggregate init values
872  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
873 
874  // position argument
875  row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));
876 
877  // fragment row offset argument
878  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
879 
880  // number of rows for each scan
881  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
882 
883  // literals buffer argument
884  if (hoist_literals) {
885  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
886  }
887 
888  // Generate the function signature and column head fetches s.t.
889  // double indirection isn't needed in the inner loop
890  auto& fetch_bb = query_func->front();
891  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
892  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
893  auto col_heads = generate_column_heads_load(
894  in_col_count, query_func->args().begin(), fetch_ir_builder, context);
895  CHECK_EQ(in_col_count, col_heads.size());
896 
897  // column buffer arguments
898  for (size_t i = 0; i < in_col_count; ++i) {
899  row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
900  }
901 
902  // join hash table argument
903  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
905  // generate the function
906  auto ft =
907  llvm::FunctionType::get(get_int_type(32, context), row_process_arg_types, false);
908 
909  auto row_func =
910  llvm::Function::Create(ft, llvm::Function::ExternalLinkage, "row_func", module);
911 
912  // set the row function argument names; for debugging purposes only
913  set_row_func_argnames(row_func, in_col_count, agg_col_count, hoist_literals);
914 
915  return std::make_pair(row_func, col_heads);
916 }
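
Putting set_row_func_argnames and the type list together: for one aggregate output, one input column, and hoisted literals, the generated row function header would look roughly like this in IR (an illustration, not emitted verbatim by this file):

    define i32 @row_func(i64* %out, i64* %agg_init_val, i64 %pos,
                         i64* %frag_row_off, i64* %num_rows_per_scan,
                         i8* %literals, i8* %col_buf0, i64* %join_hash_tables)
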
917 
918 void bind_query(llvm::Function* query_func,
919  const std::string& query_fname,
920  llvm::Function* multifrag_query_func,
921  llvm::Module* module) {
922  std::vector<llvm::CallInst*> query_stubs;
923  for (auto it = llvm::inst_begin(multifrag_query_func),
924  e = llvm::inst_end(multifrag_query_func);
925  it != e;
926  ++it) {
927  if (!llvm::isa<llvm::CallInst>(*it)) {
928  continue;
929  }
930  auto& query_call = llvm::cast<llvm::CallInst>(*it);
931  if (std::string(query_call.getCalledFunction()->getName()) == query_fname) {
932  query_stubs.push_back(&query_call);
933  }
934  }
935  for (auto& S : query_stubs) {
936  std::vector<llvm::Value*> args;
937  for (size_t i = 0; i < S->getNumArgOperands(); ++i) {
938  args.push_back(S->getArgOperand(i));
939  }
940  llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args, ""));
941  }
942 }
943 
944 std::vector<std::string> get_agg_fnames(const std::vector<Analyzer::Expr*>& target_exprs,
945  const bool is_group_by) {
946  std::vector<std::string> result;
947  for (size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
948  ++target_idx, ++agg_col_idx) {
949  const auto target_expr = target_exprs[target_idx];
950  CHECK(target_expr);
951  const auto target_type_info = target_expr->get_type_info();
952  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
953  const bool is_varlen =
954  (target_type_info.is_string() &&
955  target_type_info.get_compression() == kENCODING_NONE) ||
956  target_type_info.is_array(); // TODO: should it use is_varlen_array() ?
957  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
958  result.emplace_back(target_type_info.is_fp() ? "agg_id_double" : "agg_id");
959  if (is_varlen) {
960  result.emplace_back("agg_id");
961  }
962  if (target_type_info.is_geometry()) {
963  result.emplace_back("agg_id");
964  for (auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
965  result.emplace_back("agg_id");
966  }
967  }
968  continue;
969  }
970  const auto agg_type = agg_expr->get_aggtype();
971  const auto& agg_type_info =
972  agg_type != kCOUNT ? agg_expr->get_arg()->get_type_info() : target_type_info;
973  switch (agg_type) {
974  case kAVG: {
975  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
976  !agg_type_info.is_fp()) {
977  throw std::runtime_error("AVG is only valid on integer and floating point");
978  }
979  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
980  ? "agg_sum"
981  : "agg_sum_double");
982  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
983  ? "agg_count"
984  : "agg_count_double");
985  break;
986  }
987  case kMIN: {
988  if (agg_type_info.is_string() || agg_type_info.is_array() ||
989  agg_type_info.is_geometry()) {
990  throw std::runtime_error(
991  "MIN on strings, arrays or geospatial types not supported yet");
992  }
993  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
994  ? "agg_min"
995  : "agg_min_double");
996  break;
997  }
998  case kMAX: {
999  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1000  agg_type_info.is_geometry()) {
1001  throw std::runtime_error(
1002  "MAX on strings, arrays or geospatial types not supported yet");
1003  }
1004  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1005  ? "agg_max"
1006  : "agg_max_double");
1007  break;
1008  }
1009  case kSUM: {
1010  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1011  !agg_type_info.is_fp()) {
1012  throw std::runtime_error("SUM is only valid on integer and floating point");
1013  }
1014  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1015  ? "agg_sum"
1016  : "agg_sum_double");
1017  break;
1018  }
1019  case kCOUNT:
1020  result.emplace_back(agg_expr->get_is_distinct() ? "agg_count_distinct"
1021  : "agg_count");
1022  break;
1023  case kSAMPLE: {
1024  // Note that varlen SAMPLE arguments are handled separately above
1025  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1026  break;
1027  }
1028  case kAPPROX_COUNT_DISTINCT:
1029  result.emplace_back("agg_approximate_count_distinct");
1030  break;
1031  default:
1032  CHECK(false);
1033  }
1034  }
1035  return result;
1036 }
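
As a concrete example of the mapping (derived from the switch above): AVG expands to two runtime helpers, a sum and a count, with both switching to the floating-point variants for fp arguments:

    AVG(int_col)          -> {"agg_sum", "agg_count"}
    AVG(double_col)       -> {"agg_sum_double", "agg_count_double"}
    COUNT(DISTINCT col)   -> {"agg_count_distinct"}
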
1037 
1038 } // namespace
1039 
1040 std::unique_ptr<llvm::Module> g_rt_module(read_template_module(getGlobalLLVMContext()));
1041 
1042 bool is_udf_module_present(bool cpu_only) {
1043  return (cpu_only || udf_gpu_module != nullptr) && (udf_cpu_module != nullptr);
1044 }
1045 
1046 bool is_rt_udf_module_present(bool cpu_only) {
1047  return (cpu_only || rt_udf_gpu_module != nullptr) && (rt_udf_cpu_module != nullptr);
1048 }
1049 
1050 void throw_parseIR_error(const llvm::SMDiagnostic& parse_error, std::string src = "") {
1051  std::string excname = "LLVM IR ParseError: ";
1052  llvm::raw_string_ostream ss(excname);
1053  parse_error.print(src.c_str(), ss, false, false);
1054  throw std::runtime_error(ss.str());
1055 }
1056 
1057 void read_udf_gpu_module(const std::string& udf_ir_filename) {
1058  llvm::SMDiagnostic parse_error;
1059 
1060  llvm::StringRef file_name_arg(udf_ir_filename);
1061 
1062  udf_gpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
1063  if (!udf_gpu_module) {
1064  throw_parseIR_error(parse_error, udf_ir_filename);
1065  }
1066 }
1067 
1068 void read_udf_cpu_module(const std::string& udf_ir_filename) {
1069  llvm::SMDiagnostic parse_error;
1070 
1071  llvm::StringRef file_name_arg(udf_ir_filename);
1072 
1073  udf_cpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
1074  if (!udf_cpu_module) {
1075  throw_parseIR_error(parse_error, udf_ir_filename);
1076  }
1077 }
1078 
1079 void read_rt_udf_gpu_module(const std::string& udf_ir_string) {
1080  llvm::SMDiagnostic parse_error;
1081 
1082  auto buf =
1083  std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for GPU");
1084 
1085  rt_udf_gpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
1086  if (!rt_udf_gpu_module) {
1087  throw_parseIR_error(parse_error);
1088  }
1089 }
1090 
1091 void read_rt_udf_cpu_module(const std::string& udf_ir_string) {
1092  llvm::SMDiagnostic parse_error;
1093 
1094  auto buf =
1095  std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for CPU");
1096 
1097  rt_udf_cpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
1098  if (!rt_udf_cpu_module) {
1099  throw_parseIR_error(parse_error);
1100  }
1101 }
1102 
1103 std::unordered_set<llvm::Function*> CodeGenerator::markDeadRuntimeFuncs(
1104  llvm::Module& module,
1105  const std::vector<llvm::Function*>& roots,
1106  const std::vector<llvm::Function*>& leaves) {
1107  std::unordered_set<llvm::Function*> live_funcs;
1108  live_funcs.insert(roots.begin(), roots.end());
1109  live_funcs.insert(leaves.begin(), leaves.end());
1110 
1111  if (auto F = module.getFunction("init_shared_mem_nop")) {
1112  live_funcs.insert(F);
1113  }
1114  if (auto F = module.getFunction("write_back_nop")) {
1115  live_funcs.insert(F);
1116  }
1117 
1118  for (const llvm::Function* F : roots) {
1119  for (const llvm::BasicBlock& BB : *F) {
1120  for (const llvm::Instruction& I : BB) {
1121  if (const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
1122  live_funcs.insert(CI->getCalledFunction());
1123  }
1124  }
1125  }
1126  }
1127 
1128  for (llvm::Function& F : module) {
1129  if (!live_funcs.count(&F) && !F.isDeclaration()) {
1130  F.setLinkage(llvm::GlobalValue::InternalLinkage);
1131  }
1132  }
1133 
1134  return live_funcs;
1135 }
1136 
1137 namespace {
1138 // searches for a particular variable within a specific basic block (or all if bb_name is
1139 // empty)
1140 template <typename InstType>
1141 llvm::Value* find_variable_in_basic_block(llvm::Function* func,
1142  std::string bb_name,
1143  std::string variable_name) {
1144  llvm::Value* result = nullptr;
1145  if (func == nullptr || variable_name.empty()) {
1146  return result;
1147  }
1148  bool is_found = false;
1149  for (auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
1150  if (!bb_name.empty() && bb_it->getName() != bb_name) {
1151  continue;
1152  }
1153  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
1154  if (llvm::isa<InstType>(*inst_it)) {
1155  if (inst_it->getName() == variable_name) {
1156  result = &*inst_it;
1157  is_found = true;
1158  break;
1159  }
1160  }
1161  }
1162  }
1163  return result;
1164 }
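
This helper is used below to pick out named values planted earlier in codegen; for instance, a hypothetical lookup of the "pos" induction variable across all basic blocks (an empty bb_name searches every block):

    llvm::Value* pos =
        find_variable_in_basic_block<llvm::PHINode>(query_func, "", "pos");
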
1165 }  // namespace
1166 
1167 void Executor::createErrorCheckControlFlow(llvm::Function* query_func,
1168  bool run_with_dynamic_watchdog,
1169  ExecutorDeviceType device_type) {
1170  // check whether the row processing was successful; currently, it can
1171  // fail by running out of group by buffer slots
1172 
1173  llvm::Value* row_count = nullptr;
1174  if (run_with_dynamic_watchdog && device_type == ExecutorDeviceType::GPU) {
1175  row_count =
1176  find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
1177  }
1178 
1179  bool done_splitting = false;
1180  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1181  ++bb_it) {
1182  llvm::Value* pos = nullptr;
1183  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1184  if (run_with_dynamic_watchdog && llvm::isa<llvm::PHINode>(*inst_it)) {
1185  if (inst_it->getName() == "pos") {
1186  pos = &*inst_it;
1187  }
1188  continue;
1189  }
1190  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1191  continue;
1192  }
1193  auto& filter_call = llvm::cast<llvm::CallInst>(*inst_it);
1194  if (std::string(filter_call.getCalledFunction()->getName()) == "row_process") {
1195  auto next_inst_it = inst_it;
1196  ++next_inst_it;
1197  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1198  auto& br_instr = bb_it->back();
1199  llvm::IRBuilder<> ir_builder(&br_instr);
1200  llvm::Value* err_lv = &*inst_it;
1201  if (run_with_dynamic_watchdog) {
1202  CHECK(pos);
1203  llvm::Value* call_watchdog_lv = nullptr;
1204  if (device_type == ExecutorDeviceType::GPU) {
1205  // In order to make sure all threads within a block see the same barrier,
1206  // only blocks in which no thread has crossed the critical edge will
1207  // go through the dynamic watchdog computation
1208  CHECK(row_count);
1209  auto crit_edge_rem =
1210  (blockSize() & (blockSize() - 1))
1211  ? ir_builder.CreateSRem(
1212  row_count,
1213  cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1214  : ir_builder.CreateAnd(
1215  row_count,
1216  cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1217  auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1218  crit_edge_threshold->setName("crit_edge_threshold");
1219 
1220  // only those threads where pos < crit_edge_threshold go through dynamic
1221  // watchdog call
1222  call_watchdog_lv =
1223  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
1224  } else {
1225  // CPU path: run watchdog for every 64th row
1226  auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1227  call_watchdog_lv = ir_builder.CreateICmp(
1228  llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1229  }
1230  CHECK(call_watchdog_lv);
1231  auto error_check_bb = bb_it->splitBasicBlock(
1232  llvm::BasicBlock::iterator(br_instr), ".error_check");
1233  auto& watchdog_br_instr = bb_it->back();
1234 
1235  auto watchdog_check_bb = llvm::BasicBlock::Create(
1236  cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
1237  llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1238  auto detected_timeout = watchdog_ir_builder.CreateCall(
1239  cgen_state_->module_->getFunction("dynamic_watchdog"), {});
1240  auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1241  detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
1242  watchdog_ir_builder.CreateBr(error_check_bb);
1243 
1244  llvm::ReplaceInstWithInst(
1245  &watchdog_br_instr,
1246  llvm::BranchInst::Create(
1247  watchdog_check_bb, error_check_bb, call_watchdog_lv));
1248  ir_builder.SetInsertPoint(&br_instr);
1249  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1250 
1251  unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1252  unified_err_lv->addIncoming(err_lv, &*bb_it);
1253  err_lv = unified_err_lv;
1254  }
1255  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1256  err_lv =
1257  ir_builder.CreateCall(cgen_state_->module_->getFunction("record_error_code"),
1258  std::vector<llvm::Value*>{err_lv, error_code_arg});
1259  if (device_type == ExecutorDeviceType::GPU) {
1260  // let kernel execution finish as expected, regardless of the observed error,
1261  // unless it is from the dynamic watchdog where all threads within that block
1262  // return together.
1263  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1264  err_lv,
1265  cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
1266  } else {
1267  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
1268  err_lv,
1269  cgen_state_->llInt(static_cast<int32_t>(0)));
1270  }
1271  auto error_bb = llvm::BasicBlock::Create(
1272  cgen_state_->context_, ".error_exit", query_func, new_bb);
1273  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
1274  llvm::ReplaceInstWithInst(&br_instr,
1275  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
1276  done_splitting = true;
1277  break;
1278  }
1279  }
1280  }
1281  CHECK(done_splitting);
1282 }
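
To make the sampling predicates above concrete: on CPU, pos & 0x3f == 0 fires the watchdog once every 64 rows. On GPU, with blockSize() = 1024 (a power of two, so the And path is taken) and row_count = 5000, crit_edge_rem = 5000 & 1023 = 904 and crit_edge_threshold = 5000 - 904 = 4096; only threads with pos < 4096, i.e. threads belonging to complete blocks, execute the watchdog check, so every thread in such a block reaches the same barrier.
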
1283 
1284 std::vector<llvm::Value*> Executor::inlineHoistedLiterals() {
1285  std::vector<llvm::Value*> hoisted_literals;
1286 
1287  // row_func_ is using literals whose defs have been hoisted up to the query_func_,
1288  // extend row_func_ signature to include extra args to pass these literal values.
1289  std::vector<llvm::Type*> row_process_arg_types;
1290 
1291  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1292  E = cgen_state_->row_func_->arg_end();
1293  I != E;
1294  ++I) {
1295  row_process_arg_types.push_back(I->getType());
1296  }
1297 
1298  for (auto& element : cgen_state_->query_func_literal_loads_) {
1299  for (auto value : element.second) {
1300  row_process_arg_types.push_back(value->getType());
1301  }
1302  }
1303 
1304  auto ft = llvm::FunctionType::get(
1305  get_int_type(32, cgen_state_->context_), row_process_arg_types, false);
1306  auto row_func_with_hoisted_literals =
1307  llvm::Function::Create(ft,
1308  llvm::Function::ExternalLinkage,
1309  "row_func_hoisted_literals",
1310  cgen_state_->row_func_->getParent());
1311 
1312  // make sure it's in-lined, we don't want register spills in the inner loop
1313  mark_function_always_inline(row_func_with_hoisted_literals);
1314 
1315  auto arg_it = row_func_with_hoisted_literals->arg_begin();
1316  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1317  E = cgen_state_->row_func_->arg_end();
1318  I != E;
1319  ++I) {
1320  if (I->hasName()) {
1321  arg_it->setName(I->getName());
1322  }
1323  ++arg_it;
1324  }
1325 
1326  std::unordered_map<int, std::vector<llvm::Value*>>
1327  query_func_literal_loads_function_arguments;
1328 
1329  for (auto& element : cgen_state_->query_func_literal_loads_) {
1330  std::vector<llvm::Value*> argument_values;
1331 
1332  for (auto value : element.second) {
1333  hoisted_literals.push_back(value);
1334  argument_values.push_back(&*arg_it);
1335  if (value->hasName()) {
1336  arg_it->setName("arg_" + value->getName());
1337  }
1338  ++arg_it;
1339  }
1340 
1341  query_func_literal_loads_function_arguments[element.first] = argument_values;
1342  }
1344  // copy the row_func function body over
1345  // see
1346  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
1347  row_func_with_hoisted_literals->getBasicBlockList().splice(
1348  row_func_with_hoisted_literals->begin(),
1349  cgen_state_->row_func_->getBasicBlockList());
1350 
1351  // also replace row_func arguments with the arguments from row_func_hoisted_literals
1352  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1353  E = cgen_state_->row_func_->arg_end(),
1354  I2 = row_func_with_hoisted_literals->arg_begin();
1355  I != E;
1356  ++I) {
1357  I->replaceAllUsesWith(&*I2);
1358  I2->takeName(&*I);
1359  ++I2;
1360  }
1361 
1362  cgen_state_->row_func_ = row_func_with_hoisted_literals;
1363 
1364  // and finally replace literal placeholders
1365  std::vector<llvm::Instruction*> placeholders;
1366  std::string prefix("__placeholder__literal_");
1367  for (auto it = llvm::inst_begin(row_func_with_hoisted_literals),
1368  e = llvm::inst_end(row_func_with_hoisted_literals);
1369  it != e;
1370  ++it) {
1371  if (it->hasName() && it->getName().startswith(prefix)) {
1372  auto offset_and_index_entry =
1373  cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
1374  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
1375 
1376  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
1377  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
1378 
1379  it->replaceAllUsesWith(
1380  query_func_literal_loads_function_arguments[lit_off][lit_idx]);
1381  placeholders.push_back(&*it);
1382  }
1383  }
1384  for (auto placeholder : placeholders) {
1385  placeholder->removeFromParent();
1386  }
1387 
1388  return hoisted_literals;
1389 }
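
// The splice()-based body move from the StackOverflow link above, in
// isolation: the body of src is moved (not cloned) into dst, then every old
// argument is rewired to its counterpart. A hypothetical helper, assuming
// dst's parameter list begins with src's.
#if 0
#include <llvm/IR/Function.h>

void move_function_body(llvm::Function* src, llvm::Function* dst) {
  // Transfer ownership of all basic blocks; src is left with an empty body.
  dst->getBasicBlockList().splice(dst->begin(), src->getBasicBlockList());
  // Redirect every use of the old arguments to the new arguments.
  auto new_arg_it = dst->arg_begin();
  for (auto old_arg_it = src->arg_begin(); old_arg_it != src->arg_end();
       ++old_arg_it, ++new_arg_it) {
    old_arg_it->replaceAllUsesWith(&*new_arg_it);
    new_arg_it->takeName(&*old_arg_it);
  }
}
#endif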
1390 
1391 std::tuple<Executor::CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
1392 Executor::compileWorkUnit(const std::vector<InputTableInfo>& query_infos,
1393  const RelAlgExecutionUnit& ra_exe_unit,
1394  const CompilationOptions& co,
1395  const ExecutionOptions& eo,
1396  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
1397  const bool allow_lazy_fetch,
1398  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
1399  const size_t max_groups_buffer_entry_guess,
1400  const int8_t crt_min_byte_width,
1401  const bool has_cardinality_estimation,
1402  ColumnCacheMap& column_cache,
1403  RenderInfo* render_info) {
1404  auto timer = DEBUG_TIMER(__func__);
1405  nukeOldState(allow_lazy_fetch, query_infos, &ra_exe_unit);
1406 
1407  GroupByAndAggregate group_by_and_aggregate(
1408  this, co.device_type_, ra_exe_unit, query_infos, row_set_mem_owner);
1409  auto query_mem_desc =
1410  group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
1411  max_groups_buffer_entry_guess,
1412  crt_min_byte_width,
1413  render_info,
1414  eo.output_columnar_hint);
1415 
1416  if (query_mem_desc->getQueryDescriptionType() ==
1417  QueryDescriptionType::GroupByBaselineHash &&
1418  !has_cardinality_estimation &&
1419  (!render_info || !render_info->isPotentialInSituRender()) && !eo.just_explain) {
1420  throw CardinalityEstimationRequired();
1421  }
1422 
1423  const bool output_columnar = query_mem_desc->didOutputColumnar();
1424 
1425  if (co.device_type_ == ExecutorDeviceType::GPU) {
1426  const size_t num_count_distinct_descs =
1427  query_mem_desc->getCountDistinctDescriptorsSize();
1428  for (size_t i = 0; i < num_count_distinct_descs; i++) {
1429  const auto& count_distinct_descriptor =
1430  query_mem_desc->getCountDistinctDescriptor(i);
1431  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::StdSet ||
1432  (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
1433  !co.hoist_literals_)) {
1434  throw QueryMustRunOnCpu();
1435  }
1436  }
1437  }
1438 
1439  // Read the module template and target either CPU or GPU
1440  // by binding the stream position functions to the right implementation:
1441  // stride access for GPU, contiguous for CPU
1442  auto rt_module_copy = llvm::CloneModule(
1443 #if LLVM_VERSION_MAJOR >= 7
1444  *g_rt_module.get(),
1445 #else
1446  g_rt_module.get(),
1447 #endif
1448  cgen_state_->vmap_,
1449  [](const llvm::GlobalValue* gv) {
1450  auto func = llvm::dyn_cast<llvm::Function>(gv);
1451  if (!func) {
1452  return true;
1453  }
1454  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
1455  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage ||
1456  CodeGenerator::alwaysCloneRuntimeFunction(func));
1457  });
1458 
1459  if (co.device_type_ == ExecutorDeviceType::CPU) {
1460  if (is_udf_module_present(true)) {
1461  CodeGenerator::link_udf_module(udf_cpu_module, *rt_module_copy, cgen_state_.get());
1462  }
1463  if (is_rt_udf_module_present(true)) {
1464  CodeGenerator::link_udf_module(
1465  rt_udf_cpu_module, *rt_module_copy, cgen_state_.get());
1466  }
1467  } else {
1468  rt_module_copy->setDataLayout(get_gpu_data_layout());
1469  rt_module_copy->setTargetTriple(get_gpu_target_triple_string());
1470 
1471  if (is_udf_module_present()) {
1472  llvm::Triple gpu_triple(udf_gpu_module->getTargetTriple());
1473 
1474  if (!gpu_triple.isNVPTX()) {
1475  throw QueryMustRunOnCpu();
1476  }
1477 
1478  CodeGenerator::link_udf_module(udf_gpu_module, *rt_module_copy, cgen_state_.get());
1479  }
1480  if (is_rt_udf_module_present()) {
1481  CodeGenerator::link_udf_module(
1482  rt_udf_gpu_module, *rt_module_copy, cgen_state_.get());
1483  }
1484  }
1485 
1486  cgen_state_->module_ = rt_module_copy.release();
1487 
1488  auto agg_fnames =
1489  get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
1490 
1491  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();
1492 
1493  const bool is_group_by{query_mem_desc->isGroupBy()};
1494  auto query_func = is_group_by ? query_group_by_template(cgen_state_->module_,
1495  co.hoist_literals_,
1496  *query_mem_desc,
1497  co.device_type_,
1498  ra_exe_unit.scan_limit)
1499  : query_template(cgen_state_->module_,
1500  agg_slot_count,
1501  co.hoist_literals_,
1502  !!ra_exe_unit.estimator);
1503  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
1504  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
1505  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);
1506 
1507  cgen_state_->query_func_ = query_func;
1508  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
1509  &query_func->getEntryBlock().front());
1510 
1511  std::vector<llvm::Value*> col_heads;
1512  std::tie(cgen_state_->row_func_, col_heads) =
1513  create_row_function(ra_exe_unit.input_col_descs.size(),
1514  is_group_by ? 0 : agg_slot_count,
1515  co.hoist_literals_,
1516  query_func,
1517  cgen_state_->module_,
1518  cgen_state_->context_);
1519  CHECK(cgen_state_->row_func_);
1520  // make sure it's inlined; we don't want register spills in the inner loop
1521  mark_function_always_inline(cgen_state_->row_func_);
1522  auto bb =
1523  llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
1524  cgen_state_->ir_builder_.SetInsertPoint(bb);
1525  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
1526  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
1527  const auto join_loops =
1528  buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
1529  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
1530  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
1531  if (is_not_deleted_bb) {
1532  bb = is_not_deleted_bb;
1533  }
1534  if (!join_loops.empty()) {
1535  codegenJoinLoops(join_loops,
1536  body_execution_unit,
1537  group_by_and_aggregate,
1538  query_func,
1539  bb,
1540  *(query_mem_desc.get()),
1541  co,
1542  eo);
1543  } else {
1544  const bool can_return_error =
1545  compileBody(ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co);
1546  if (can_return_error || cgen_state_->needs_error_check_ || eo.with_dynamic_watchdog) {
1547  createErrorCheckControlFlow(query_func, eo.with_dynamic_watchdog, co.device_type_);
1548  }
1549  }
1550  std::vector<llvm::Value*> hoisted_literals;
1551 
1552  if (co.hoist_literals_) {
1553  VLOG(1) << "number of hoisted literals: "
1554  << cgen_state_->query_func_literal_loads_.size()
1555  << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
1556  << " bytes";
1557  }
1558 
1559  if (co.hoist_literals_ && !cgen_state_->query_func_literal_loads_.empty()) {
1560  // we have some hoisted literals...
1561  hoisted_literals = inlineHoistedLiterals();
1562  }
1563  // iterate through all the instructions in the query template function and
1564  // replace the call to the filter placeholder with a call to the actual filter
1565  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
1566  ++it) {
1567  if (!llvm::isa<llvm::CallInst>(*it)) {
1568  continue;
1569  }
1570  auto& filter_call = llvm::cast<llvm::CallInst>(*it);
1571  if (std::string(filter_call.getCalledFunction()->getName()) == "row_process") {
1572  std::vector<llvm::Value*> args;
1573  for (size_t i = 0; i < filter_call.getNumArgOperands(); ++i) {
1574  args.push_back(filter_call.getArgOperand(i));
1575  }
1576  args.insert(args.end(), col_heads.begin(), col_heads.end());
1577  args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
1578  // push hoisted literal arguments, if any
1579  args.insert(args.end(), hoisted_literals.begin(), hoisted_literals.end());
1580 
1581  llvm::ReplaceInstWithInst(&filter_call,
1582  llvm::CallInst::Create(cgen_state_->row_func_, args, ""));
1583  break;
1584  }
1585  }
1586  plan_state_->init_agg_vals_ =
1587  init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);
1588 
1589  auto multifrag_query_func = cgen_state_->module_->getFunction(
1590  "multifrag_query" + std::string(co.hoist_literals_ ? "_hoisted_literals" : ""));
1591  CHECK(multifrag_query_func);
1592 
1593  bind_query(query_func,
1594  "query_stub" + std::string(co.hoist_literals_ ? "_hoisted_literals" : ""),
1595  multifrag_query_func,
1596  cgen_state_->module_);
1597 
1598  auto live_funcs =
1599  CodeGenerator::markDeadRuntimeFuncs(*cgen_state_->module_,
1600  {query_func, cgen_state_->row_func_},
1601  {multifrag_query_func});
1602 
1603  std::string llvm_ir;
1604  if (eo.just_explain) {
1605  if (co.explain_type_ == ExecutorExplainType::Optimized) {
1606 #ifdef WITH_JIT_DEBUG
1607  throw std::runtime_error(
1608  "Explain optimized not available when JIT runtime debug symbols are enabled");
1609 #else
1610  optimize_ir(query_func, cgen_state_->module_, live_funcs, co);
1611 #endif // WITH_JIT_DEBUG
1612  }
1613  llvm_ir =
1614  serialize_llvm_object(query_func) + serialize_llvm_object(cgen_state_->row_func_);
1615  }
1616  verify_function_ir(cgen_state_->row_func_);
1617 
1618  LOG(IR) << query_mem_desc->toString() << "\nGenerated IR\n"
1619  << serialize_llvm_object(query_func)
1620  << serialize_llvm_object(cgen_state_->row_func_) << "\nEnd of IR";
1621 
1622  return std::make_tuple(
1623  CompilationResult{
1624  co.device_type_ == ExecutorDeviceType::CPU
1625  ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
1626  : optimizeAndCodegenGPU(query_func,
1627  multifrag_query_func,
1628  live_funcs,
1629  is_group_by || ra_exe_unit.estimator,
1630  cuda_mgr,
1631  co),
1632  cgen_state_->getLiterals(),
1633  output_columnar,
1634  llvm_ir},
1635  std::move(query_mem_desc));
1636 }
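
// A reduced sketch of the placeholder-replacement step in compileWorkUnit
// above: scan a function for a call to a stub by name and replace it in
// place with a call to the real implementation, forwarding the original
// operands plus extra trailing arguments (column heads, hoisted literals).
// replace_stub_call is a hypothetical helper, not part of the original file.
#if 0
#include <llvm/IR/InstIterator.h>
#include <llvm/IR/Instructions.h>
#include <llvm/Transforms/Utils/BasicBlockUtils.h>
#include <vector>

void replace_stub_call(llvm::Function* caller,
                       llvm::StringRef stub_name,
                       llvm::Function* real_callee,
                       const std::vector<llvm::Value*>& extra_args) {
  for (auto it = llvm::inst_begin(caller), e = llvm::inst_end(caller); it != e;
       ++it) {
    auto call = llvm::dyn_cast<llvm::CallInst>(&*it);
    if (!call || !call->getCalledFunction() ||
        call->getCalledFunction()->getName() != stub_name) {
      continue;
    }
    std::vector<llvm::Value*> args(call->arg_begin(), call->arg_end());
    args.insert(args.end(), extra_args.begin(), extra_args.end());
    llvm::ReplaceInstWithInst(call, llvm::CallInst::Create(real_callee, args, ""));
    break;  // the iterator is invalidated once the call is replaced
  }
}
#endif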
1637 
1638 llvm::BasicBlock* Executor::codegenSkipDeletedOuterTableRow(
1639  const RelAlgExecutionUnit& ra_exe_unit,
1640  const CompilationOptions& co) {
1641  CHECK(!ra_exe_unit.input_descs.empty());
1642  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
1643  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
1644  return nullptr;
1645  }
1646  const auto td = catalog_->getMetadataForTable(outer_input_desc.getTableId());
1647  CHECK(td);
1648  const auto deleted_cd = catalog_->getDeletedColumnIfRowsDeleted(td);
1649  if (!deleted_cd) {
1650  return nullptr;
1651  }
1652  CHECK(deleted_cd->columnType.is_boolean());
1653  const auto deleted_expr =
1654  makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
1655  outer_input_desc.getTableId(),
1656  deleted_cd->columnId,
1657  outer_input_desc.getNestLevel());
1658  CodeGenerator code_generator(this);
1659  const auto is_deleted =
1660  code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
1661  const auto is_deleted_bb = llvm::BasicBlock::Create(
1662  cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
1663  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
1664  cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
1665  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
1666  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
1667  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
1668  cgen_state_->ir_builder_.SetInsertPoint(bb);
1669  return bb;
1670 }
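
// The guard-block pattern used by codegenSkipDeletedOuterTableRow, shown in
// a minimal form: branch on an i1 predicate, return 0 (no error) on the skip
// path, and continue code generation on the fall-through path. Names are
// hypothetical.
#if 0
#include <llvm/IR/Function.h>
#include <llvm/IR/IRBuilder.h>

llvm::BasicBlock* emit_skip_guard(llvm::IRBuilder<>& ir_builder,
                                  llvm::Function* row_func,
                                  llvm::Value* skip_cond /* i1 */) {
  auto& ctx = row_func->getContext();
  auto skip_bb = llvm::BasicBlock::Create(ctx, "skip_row", row_func);
  auto cont_bb = llvm::BasicBlock::Create(ctx, "process_row", row_func);
  ir_builder.CreateCondBr(skip_cond, skip_bb, cont_bb);
  ir_builder.SetInsertPoint(skip_bb);
  // Returning 0 signals "row skipped", not an error condition.
  ir_builder.CreateRet(llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), 0));
  ir_builder.SetInsertPoint(cont_bb);
  return cont_bb;  // subsequent row codegen is emitted here
}
#endif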
1671 
1672 bool Executor::compileBody(const RelAlgExecutionUnit& ra_exe_unit,
1673  GroupByAndAggregate& group_by_and_aggregate,
1674  const QueryMemoryDescriptor& query_mem_desc,
1675  const CompilationOptions& co) {
1676  // generate the code for the filter
1677  std::vector<Analyzer::Expr*> primary_quals;
1678  std::vector<Analyzer::Expr*> deferred_quals;
1679  bool short_circuited =
1680  CodeGenerator::prioritizeQuals(ra_exe_unit, primary_quals, deferred_quals);
1681  if (short_circuited) {
1682  VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
1683  << "short-circuited and deferred " << std::to_string(deferred_quals.size())
1684  << " quals";
1685  }
1686  llvm::Value* filter_lv = cgen_state_->llBool(true);
1687  CodeGenerator code_generator(this);
1688  for (auto expr : primary_quals) {
1689  // Generate the filter for primary quals
1690  auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
1691  filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
1692  }
1693  CHECK(filter_lv->getType()->isIntegerTy(1));
1694  llvm::BasicBlock* sc_false{nullptr};
1695  if (!deferred_quals.empty()) {
1696  auto sc_true = llvm::BasicBlock::Create(
1697  cgen_state_->context_, "sc_true", cgen_state_->row_func_);
1698  sc_false = llvm::BasicBlock::Create(
1699  cgen_state_->context_, "sc_false", cgen_state_->row_func_);
1700  cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
1701  cgen_state_->ir_builder_.SetInsertPoint(sc_false);
1702  if (ra_exe_unit.join_quals.empty()) {
1703  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
1704  }
1705  cgen_state_->ir_builder_.SetInsertPoint(sc_true);
1706  filter_lv = cgen_state_->llBool(true);
1707  }
1708  for (auto expr : deferred_quals) {
1709  filter_lv = cgen_state_->ir_builder_.CreateAnd(
1710  filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
1711  }
1712 
1713  CHECK(filter_lv->getType()->isIntegerTy(1));
1714  return group_by_and_aggregate.codegen(filter_lv, sc_false, query_mem_desc, co);
1715 }
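
// How compileBody folds qualifiers, reduced to the core idea: start from a
// constant true and AND in each qual's i1 result, so a constant-false qual
// folds the entire filter to false at IR-construction time. fold_quals is a
// hypothetical helper for illustration.
#if 0
#include <llvm/IR/IRBuilder.h>
#include <vector>

llvm::Value* fold_quals(llvm::IRBuilder<>& ir_builder,
                        llvm::LLVMContext& ctx,
                        const std::vector<llvm::Value*>& qual_conds /* i1 each */) {
  llvm::Value* filter_lv = llvm::ConstantInt::getTrue(ctx);
  for (auto cond : qual_conds) {
    filter_lv = ir_builder.CreateAnd(filter_lv, cond);
  }
  return filter_lv;  // i1 result that gates the aggregation code
}
#endif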
1716 
1717 std::unique_ptr<llvm::Module> runtime_module_shallow_copy(CgenState* cgen_state) {
1718  return llvm::CloneModule(
1719 #if LLVM_VERSION_MAJOR >= 7
1720  *g_rt_module.get(),
1721 #else
1722  g_rt_module.get(),
1723 #endif
1724  cgen_state->vmap_,
1725  [](const llvm::GlobalValue* gv) {
1726  auto func = llvm::dyn_cast<llvm::Function>(gv);
1727  if (!func) {
1728  return true;
1729  }
1730  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
1731  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage);
1732  });
1733 }
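
// Usage note: the lambda passed to llvm::CloneModule above is its
// ShouldCloneDefinition predicate. Private/internal functions must have
// their bodies cloned (a declaration cannot carry internal linkage), while
// externally visible runtime functions are left as declarations, which keeps
// the per-query copy cheap. A hypothetical call site:
#if 0
std::unique_ptr<llvm::Module> module_copy = runtime_module_shallow_copy(cgen_state);
module_copy->setDataLayout(get_gpu_data_layout());
module_copy->setTargetTriple(get_gpu_target_triple_string());
#endif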
1734 
1735 std::vector<llvm::Value*> generate_column_heads_load(const int num_columns,
1736  llvm::Value* byte_stream_arg,
1737  llvm::IRBuilder<>& ir_builder,
1738  llvm::LLVMContext& ctx) {
1739  CHECK(byte_stream_arg);
1740  const auto max_col_local_id = num_columns - 1;
1741 
1742  std::vector<llvm::Value*> col_heads;
1743  for (int col_id = 0; col_id <= max_col_local_id; ++col_id) {
1744  col_heads.emplace_back(ir_builder.CreateLoad(ir_builder.CreateGEP(
1745  byte_stream_arg, llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), col_id))));
1746  }
1747  return col_heads;
1748 }
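
// What each col_heads entry holds: byte_stream_arg is assumed to be an i8**
// (an array of per-column buffer pointers), so the GEP addresses slot col_id
// and the load yields that column's base pointer. Equivalent IR for a single
// column, assuming pre-opaque-pointer (typed pointer) LLVM:
//
//   %slot = getelementptr i8*, i8** %byte_stream, i32 <col_id>
//   %col_head = load i8*, i8** %slot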
1749 