OmniSciDB  04ee39c94c
NativeCodegen.cpp
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CodeGenerator.h"
18 #include "Execute.h"
22 #include "QueryTemplateGenerator.h"
23 
24 #include "Shared/mapdpath.h"
25 
26 #if LLVM_VERSION_MAJOR < 4
27 static_assert(false, "LLVM Version >= 4 is required.");
28 #endif
29 
30 #include <llvm/Bitcode/BitcodeReader.h>
31 #include <llvm/Bitcode/BitcodeWriter.h>
32 #include <llvm/ExecutionEngine/MCJIT.h>
33 #include <llvm/IR/Attributes.h>
34 #include <llvm/IR/GlobalValue.h>
35 #include <llvm/IR/InstIterator.h>
36 #include <llvm/IR/LegacyPassManager.h>
37 #include <llvm/IR/Verifier.h>
38 #include <llvm/IRReader/IRReader.h>
39 #include <llvm/Support/Casting.h>
40 #include <llvm/Support/FileSystem.h>
41 #include <llvm/Support/FormattedStream.h>
42 #include <llvm/Support/MemoryBuffer.h>
43 #include <llvm/Support/SourceMgr.h>
44 #include <llvm/Support/TargetRegistry.h>
45 #include <llvm/Support/TargetSelect.h>
46 #include <llvm/Support/raw_os_ostream.h>
47 #include <llvm/Transforms/IPO.h>
48 #include <llvm/Transforms/IPO/AlwaysInliner.h>
49 #include <llvm/Transforms/InstCombine/InstCombine.h>
50 #include <llvm/Transforms/Instrumentation.h>
51 #include <llvm/Transforms/Scalar.h>
52 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
53 #include <llvm/Transforms/Utils/Cloning.h>
54 #include "llvm/IR/IntrinsicInst.h"
55 #include "llvm/IR/Intrinsics.h"
56 
57 #if LLVM_VERSION_MAJOR >= 7
58 #include <llvm/Transforms/Scalar/InstSimplifyPass.h>
59 #include <llvm/Transforms/Utils.h>
60 #endif
61 #include <llvm/IRReader/IRReader.h>
62 #include <llvm/Linker/Linker.h>
63 #include <llvm/Support/SourceMgr.h>
64 #include <llvm/Support/raw_ostream.h>
65 
66 std::unique_ptr<llvm::Module> udf_gpu_module;
67 std::unique_ptr<llvm::Module> udf_cpu_module;
68 std::unique_ptr<llvm::Module> rt_udf_gpu_module;
69 std::unique_ptr<llvm::Module> rt_udf_cpu_module;
70 
71 namespace {
72 
73 #if defined(HAVE_CUDA) || !defined(WITH_JIT_DEBUG)
74 void eliminate_dead_self_recursive_funcs(
75  llvm::Module& M,
76  const std::unordered_set<llvm::Function*>& live_funcs) {
77  std::vector<llvm::Function*> dead_funcs;
78  for (auto& F : M) {
79  bool bAlive = false;
80  if (live_funcs.count(&F)) {
81  continue;
82  }
83  for (auto U : F.users()) {
84  auto* C = llvm::dyn_cast<const llvm::CallInst>(U);
85  if (!C || C->getParent()->getParent() != &F) {
86  bAlive = true;
87  break;
88  }
89  }
90  if (!bAlive) {
91  dead_funcs.push_back(&F);
92  }
93  }
94  for (auto pFn : dead_funcs) {
95  pFn->eraseFromParent();
96  }
97 }
98 
99 void optimize_ir(llvm::Function* query_func,
100  llvm::Module* module,
101  const std::unordered_set<llvm::Function*>& live_funcs,
102  const CompilationOptions& co) {
103  llvm::legacy::PassManager pass_manager;
104 
105  pass_manager.add(llvm::createAlwaysInlinerLegacyPass());
106  pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
107 #if LLVM_VERSION_MAJOR >= 7
108  pass_manager.add(llvm::createInstSimplifyLegacyPass());
109 #else
110  pass_manager.add(llvm::createInstructionSimplifierPass());
111 #endif
112  pass_manager.add(llvm::createInstructionCombiningPass());
113  pass_manager.add(llvm::createGlobalOptimizerPass());
114 
115  pass_manager.add(llvm::createLICMPass());
116  if (co.opt_level_ == ExecutorOptLevel::LoopStrengthReduction) {
117  pass_manager.add(llvm::createLoopStrengthReducePass());
118  }
119  pass_manager.run(*module);
120 
121  eliminate_dead_self_recursive_funcs(*module, live_funcs);
122 }
123 #endif
124 
125 } // namespace
126 
127 template <class T>
128 std::string serialize_llvm_object(const T* llvm_obj) {
129  std::stringstream ss;
130  llvm::raw_os_ostream os(ss);
131  llvm_obj->print(os);
132  os.flush();
133  return ss.str();
134 }
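// A minimal usage sketch (illustrative only; assumes an LLVM context and module are
// at hand): serialize_llvm_object returns the printable LLVM IR of the object, and
// the CodeCacheKey entries built in optimizeAndCodegenCPU/GPU below are exactly such
// strings.
//
//   llvm::LLVMContext ctx;
//   llvm::Module mod("example", ctx);
//   auto* fty = llvm::FunctionType::get(llvm::Type::getInt32Ty(ctx), false);
//   auto* fn = llvm::Function::Create(
//       fty, llvm::Function::ExternalLinkage, "f", &mod);
//   const std::string key_part = serialize_llvm_object(fn);  // e.g. "declare i32 @f()"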
135 
136 ExecutionEngineWrapper::ExecutionEngineWrapper() {}
137 
138 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine)
139  : execution_engine_(execution_engine) {}
140 
141 ExecutionEngineWrapper::ExecutionEngineWrapper(llvm::ExecutionEngine* execution_engine,
142  const CompilationOptions& co)
143  : execution_engine_(execution_engine) {
144  if (execution_engine_) {
145  if (co.register_intel_jit_listener_) {
146  intel_jit_listener_.reset(llvm::JITEventListener::createIntelJITEventListener());
147  CHECK(intel_jit_listener_);
148  execution_engine_->RegisterJITEventListener(intel_jit_listener_.get());
149  LOG(INFO) << "Registered IntelJITEventListener";
150  }
151  }
152 }
153 
154 ExecutionEngineWrapper& ExecutionEngineWrapper::operator=(
155  llvm::ExecutionEngine* execution_engine) {
156  execution_engine_.reset(execution_engine);
157  intel_jit_listener_ = nullptr;
158  return *this;
159 }
160 
161 void verify_function_ir(const llvm::Function* func) {
162  std::stringstream err_ss;
163  llvm::raw_os_ostream err_os(err_ss);
164  if (llvm::verifyFunction(*func, &err_os)) {
165  func->print(llvm::outs());
166  LOG(FATAL) << err_ss.str();
167  }
168 }
169 
170 std::vector<std::pair<void*, void*>> Executor::getCodeFromCache(const CodeCacheKey& key,
171  const CodeCache& cache) {
172  auto it = cache.find(key);
173  if (it != cache.cend()) {
174  delete cgen_state_->module_;
175  cgen_state_->module_ = it->second.second;
176  std::vector<std::pair<void*, void*>> native_functions;
177  for (auto& native_code : it->second.first) {
178  GpuCompilationContext* gpu_context = std::get<2>(native_code).get();
179  native_functions.emplace_back(std::get<0>(native_code),
180  gpu_context ? gpu_context->module() : nullptr);
181  }
182  return native_functions;
183  }
184  return {};
185 }
186 
187 void Executor::addCodeToCache(
188  const CodeCacheKey& key,
189  std::vector<std::tuple<void*, ExecutionEngineWrapper>> native_code,
190  llvm::Module* module,
191  CodeCache& cache) {
192  CHECK(!native_code.empty());
193  CodeCacheVal cache_val;
194  for (auto& native_func : native_code) {
195  cache_val.emplace_back(
196  std::get<0>(native_func), std::move(std::get<1>(native_func)), nullptr);
197  }
198  cache.put(key,
199  std::make_pair<decltype(cache_val), decltype(module)>(std::move(cache_val),
200  std::move(module)));
201 }
202 
203 void Executor::addCodeToCache(
204  const CodeCacheKey& key,
205  const std::vector<std::tuple<void*, GpuCompilationContext*>>& native_code,
206  llvm::Module* module,
207  CodeCache& cache) {
208  CHECK(!native_code.empty());
209  CodeCacheVal cache_val;
210  for (const auto& native_func : native_code) {
211  cache_val.emplace_back(
212  std::get<0>(native_func),
213  ExecutionEngineWrapper(),
214  std::unique_ptr<GpuCompilationContext>(std::get<1>(native_func)));
215  }
216  cache.put(key,
217  std::make_pair<decltype(cache_val), decltype(module)>(std::move(cache_val),
218  std::move(module)));
219 }
220 
221 ExecutionEngineWrapper CodeGenerator::generateNativeCPUCode(
222  llvm::Function* func,
223  const std::unordered_set<llvm::Function*>& live_funcs,
224  const CompilationOptions& co) {
225  auto module = func->getParent();
226  // run optimizations
227 #ifndef WITH_JIT_DEBUG
228  optimize_ir(func, module, live_funcs, co);
229 #endif // WITH_JIT_DEBUG
230 
231  auto init_err = llvm::InitializeNativeTarget();
232  CHECK(!init_err);
233 
234  llvm::InitializeAllTargetMCs();
235  llvm::InitializeNativeTargetAsmPrinter();
236  llvm::InitializeNativeTargetAsmParser();
237 
238  std::string err_str;
239  std::unique_ptr<llvm::Module> owner(module);
240  llvm::EngineBuilder eb(std::move(owner));
241  eb.setErrorStr(&err_str);
242  eb.setEngineKind(llvm::EngineKind::JIT);
243  llvm::TargetOptions to;
244  to.EnableFastISel = true;
245  eb.setTargetOptions(to);
246  if (co.opt_level_ == ExecutorOptLevel::ReductionJIT) {
247  eb.setOptLevel(llvm::CodeGenOpt::None);
248  }
249 
250  ExecutionEngineWrapper execution_engine(eb.create(), co);
251  CHECK(execution_engine.get());
252 
253  execution_engine->finalizeObject();
254 
255  return execution_engine;
256 }
257 
258 std::vector<std::pair<void*, void*>> Executor::optimizeAndCodegenCPU(
259  llvm::Function* query_func,
260  llvm::Function* multifrag_query_func,
261  const std::unordered_set<llvm::Function*>& live_funcs,
262  const CompilationOptions& co) {
263  auto module = multifrag_query_func->getParent();
264  CodeCacheKey key{serialize_llvm_object(query_func),
265  serialize_llvm_object(cgen_state_->row_func_)};
266  for (const auto helper : cgen_state_->helper_functions_) {
267  key.push_back(serialize_llvm_object(helper));
268  }
269  auto cached_code = getCodeFromCache(key, cpu_code_cache_);
270  if (!cached_code.empty()) {
271  return cached_code;
272  }
273 
274  auto execution_engine =
275  CodeGenerator::generateNativeCPUCode(query_func, live_funcs, co);
276  auto native_code = execution_engine->getPointerToFunction(multifrag_query_func);
277  CHECK(native_code);
278 
279  std::vector<std::tuple<void*, ExecutionEngineWrapper>> cache;
280  cache.emplace_back(native_code, std::move(execution_engine));
281  addCodeToCache(key, std::move(cache), module, cpu_code_cache_);
282 
283  return {std::make_pair(native_code, nullptr)};
284 }
285 
286 namespace {
287 
288 std::string cpp_to_llvm_name(const std::string& s) {
289  if (s == "int8_t") {
290  return "i8";
291  }
292  if (s == "int16_t") {
293  return "i16";
294  }
295  if (s == "int32_t") {
296  return "i32";
297  }
298  if (s == "int64_t") {
299  return "i64";
300  }
301  CHECK(s == "float" || s == "double");
302  return s;
303 }
304 
305 std::string gen_array_any_all_sigs() {
306  std::string result;
307  for (const std::string any_or_all : {"any", "all"}) {
308  for (const std::string elem_type :
309  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
310  for (const std::string needle_type :
311  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
312  for (const std::string op_name : {"eq", "ne", "lt", "le", "gt", "ge"}) {
313  result += ("declare i1 @array_" + any_or_all + "_" + op_name + "_" + elem_type +
314  "_" + needle_type + "(i8*, i64, " + cpp_to_llvm_name(needle_type) +
315  ", " + cpp_to_llvm_name(elem_type) + ");\n");
316  }
317  }
318  }
319  }
320  return result;
321 }
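// For example, any_or_all = "any", op_name = "gt", elem_type = "int32_t" and
// needle_type = "double" produce the declaration
//
//   declare i1 @array_any_gt_int32_t_double(i8*, i64, double, i32);
//
// note that the needle type comes first in the parameter list, per cpp_to_llvm_name above.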
322 
323 std::string gen_translate_null_key_sigs() {
324  std::string result;
325  for (const std::string key_type : {"int8_t", "int16_t", "int32_t", "int64_t"}) {
326  const auto key_llvm_type = cpp_to_llvm_name(key_type);
327  result += "declare i64 @translate_null_key_" + key_type + "(" + key_llvm_type + ", " +
328  key_llvm_type + ", i64);\n";
329  }
330  return result;
331 }
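// For example, key_type = "int32_t" produces
//
//   declare i64 @translate_null_key_int32_t(i32, i32, i64);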
332 
333 const std::string cuda_rt_decls =
334  R"(
335 declare void @llvm.dbg.declare(metadata, metadata, metadata)
336 declare void @llvm.dbg.value(metadata, metadata, metadata)
337 declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
338 declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
339 declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
340 declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
341 declare i32 @pos_start_impl(i32*);
342 declare i32 @group_buff_idx_impl();
343 declare i32 @pos_step_impl();
344 declare i8 @thread_warp_idx(i8);
345 declare i64* @init_shared_mem(i64*, i32);
346 declare i64* @init_shared_mem_nop(i64*, i32);
347 declare i64* @init_shared_mem_dynamic(i64*, i32);
348 declare i64* @alloc_shared_mem_dynamic();
349 declare void @set_shared_mem_to_identity(i64*, i32, i64);
350 declare void @write_back(i64*, i64*, i32);
351 declare void @write_back_smem_nop(i64*, i64*, i32);
352 declare void @write_back_nop(i64*, i64*, i32);
353 declare void @agg_from_smem_to_gmem_nop(i64*, i64*, i32);
354 declare void @agg_from_smem_to_gmem_binId_count(i64*, i64*, i32);
355 declare void @agg_from_smem_to_gmem_count_binId(i64*, i64*, i32);
356 declare void @init_group_by_buffer_gpu(i64*, i64*, i32, i32, i32, i1, i8);
357 declare i64* @get_group_value(i64*, i32, i64*, i32, i32, i32, i64*);
358 declare i64* @get_group_value_with_watchdog(i64*, i32, i64*, i32, i32, i32, i64*);
359 declare i32 @get_group_value_columnar_slot(i64*, i32, i64*, i32, i32);
360 declare i32 @get_group_value_columnar_slot_with_watchdog(i64*, i32, i64*, i32, i32);
361 declare i64* @get_group_value_fast(i64*, i64, i64, i64, i32);
362 declare i64* @get_group_value_fast_with_original_key(i64*, i64, i64, i64, i64, i32);
363 declare i32 @get_columnar_group_bin_offset(i64*, i64, i64, i64);
364 declare i64 @baseline_hash_join_idx_32(i8*, i8*, i64, i64);
365 declare i64 @baseline_hash_join_idx_64(i8*, i8*, i64, i64);
366 declare i64 @get_composite_key_index_32(i32*, i64, i32*, i64);
367 declare i64 @get_composite_key_index_64(i64*, i64, i64*, i64);
368 declare i64 @get_bucket_key_for_range_compressed(i8*, i64, double);
369 declare i64 @get_bucket_key_for_range_double(i8*, i64, double);
370 declare i64 @agg_count_shared(i64*, i64);
371 declare i64 @agg_count_skip_val_shared(i64*, i64, i64);
372 declare i32 @agg_count_int32_shared(i32*, i32);
373 declare i32 @agg_count_int32_skip_val_shared(i32*, i32, i32);
374 declare i64 @agg_count_double_shared(i64*, double);
375 declare i64 @agg_count_double_skip_val_shared(i64*, double, double);
376 declare i32 @agg_count_float_shared(i32*, float);
377 declare i32 @agg_count_float_skip_val_shared(i32*, float, float);
378 declare i64 @agg_sum_shared(i64*, i64);
379 declare i64 @agg_sum_skip_val_shared(i64*, i64, i64);
380 declare i32 @agg_sum_int32_shared(i32*, i32);
381 declare i32 @agg_sum_int32_skip_val_shared(i32*, i32, i32);
382 declare void @agg_sum_double_shared(i64*, double);
383 declare void @agg_sum_double_skip_val_shared(i64*, double, double);
384 declare void @agg_sum_float_shared(i32*, float);
385 declare void @agg_sum_float_skip_val_shared(i32*, float, float);
386 declare void @agg_max_shared(i64*, i64);
387 declare void @agg_max_skip_val_shared(i64*, i64, i64);
388 declare void @agg_max_int32_shared(i32*, i32);
389 declare void @agg_max_int32_skip_val_shared(i32*, i32, i32);
390 declare void @agg_max_int16_shared(i16*, i16);
391 declare void @agg_max_int16_skip_val_shared(i16*, i16, i16);
392 declare void @agg_max_int8_shared(i8*, i8);
393 declare void @agg_max_int8_skip_val_shared(i8*, i8, i8);
394 declare void @agg_max_double_shared(i64*, double);
395 declare void @agg_max_double_skip_val_shared(i64*, double, double);
396 declare void @agg_max_float_shared(i32*, float);
397 declare void @agg_max_float_skip_val_shared(i32*, float, float);
398 declare void @agg_min_shared(i64*, i64);
399 declare void @agg_min_skip_val_shared(i64*, i64, i64);
400 declare void @agg_min_int32_shared(i32*, i32);
401 declare void @agg_min_int32_skip_val_shared(i32*, i32, i32);
402 declare void @agg_min_int16_shared(i16*, i16);
403 declare void @agg_min_int16_skip_val_shared(i16*, i16, i16);
404 declare void @agg_min_int8_shared(i8*, i8);
405 declare void @agg_min_int8_skip_val_shared(i8*, i8, i8);
406 declare void @agg_min_double_shared(i64*, double);
407 declare void @agg_min_double_skip_val_shared(i64*, double, double);
408 declare void @agg_min_float_shared(i32*, float);
409 declare void @agg_min_float_skip_val_shared(i32*, float, float);
410 declare void @agg_id_shared(i64*, i64);
411 declare void @agg_id_int32_shared(i32*, i32);
412 declare void @agg_id_int16_shared(i16*, i16);
413 declare void @agg_id_int8_shared(i8*, i8);
414 declare void @agg_id_double_shared(i64*, double);
415 declare void @agg_id_double_shared_slow(i64*, double*);
416 declare void @agg_id_float_shared(i32*, float);
417 declare i1 @slotEmptyKeyCAS(i64*, i64, i64);
418 declare i1 @slotEmptyKeyCAS_int32(i32*, i32, i32);
419 declare i1 @slotEmptyKeyCAS_int16(i16*, i16, i16);
420 declare i1 @slotEmptyKeyCAS_int8(i8*, i8, i8);
421 declare i64 @ExtractFromTime(i32, i64);
422 declare i64 @ExtractFromTimeNullable(i32, i64, i64);
423 declare i64 @DateTruncate(i32, i64);
424 declare i64 @DateTruncateNullable(i32, i64, i64);
425 declare i64 @DateTruncateHighPrecisionToDate(i64, i64);
426 declare i64 @DateTruncateHighPrecisionToDateNullable(i64, i64, i64);
427 declare i64 @DateTruncateAlterPrecisionScaleUp(i64, i64);
428 declare i64 @DateTruncateAlterPrecisionScaleDown(i64, i64);
429 declare i64 @DateTruncateAlterPrecisionScaleUpNullable(i64, i64, i64);
430 declare i64 @DateTruncateAlterPrecisionScaleDownNullable(i64, i64, i64);
431 declare i64 @DateDiff(i32, i64, i64);
432 declare i64 @DateDiffNullable(i32, i64, i64, i64);
433 declare i64 @DateDiffHighPrecision(i32, i64, i64, i32, i64, i64, i64);
434 declare i64 @DateDiffHighPrecisionNullable(i32, i64, i64, i32, i64, i64, i64, i64);
435 declare i64 @DateAdd(i32, i64, i64);
436 declare i64 @DateAddNullable(i32, i64, i64, i64);
437 declare i64 @DateAddHighPrecision(i32, i64, i64, i64);
438 declare i64 @DateAddHighPrecisionNullable(i32, i64, i64, i64, i64);
439 declare i64 @string_decode(i8*, i64);
440 declare i32 @array_size(i8*, i64, i32);
441 declare i32 @array_size_nullable(i8*, i64, i32, i32);
442 declare i32 @fast_fixlen_array_size(i8*, i32);
443 declare i1 @array_is_null(i8*, i64);
444 declare i8* @array_buff(i8*, i64);
445 declare i8* @fast_fixlen_array_buff(i8*, i64);
446 declare i8 @array_at_int8_t(i8*, i64, i32);
447 declare i16 @array_at_int16_t(i8*, i64, i32);
448 declare i32 @array_at_int32_t(i8*, i64, i32);
449 declare i64 @array_at_int64_t(i8*, i64, i32);
450 declare float @array_at_float(i8*, i64, i32);
451 declare double @array_at_double(i8*, i64, i32);
452 declare i8 @varlen_array_at_int8_t(i8*, i64, i32);
453 declare i16 @varlen_array_at_int16_t(i8*, i64, i32);
454 declare i32 @varlen_array_at_int32_t(i8*, i64, i32);
455 declare i64 @varlen_array_at_int64_t(i8*, i64, i32);
456 declare float @varlen_array_at_float(i8*, i64, i32);
457 declare double @varlen_array_at_double(i8*, i64, i32);
458 declare i8 @varlen_notnull_array_at_int8_t(i8*, i64, i32);
459 declare i16 @varlen_notnull_array_at_int16_t(i8*, i64, i32);
460 declare i32 @varlen_notnull_array_at_int32_t(i8*, i64, i32);
461 declare i64 @varlen_notnull_array_at_int64_t(i8*, i64, i32);
462 declare float @varlen_notnull_array_at_float(i8*, i64, i32);
463 declare double @varlen_notnull_array_at_double(i8*, i64, i32);
464 declare i8 @array_at_int8_t_checked(i8*, i64, i64, i8);
465 declare i16 @array_at_int16_t_checked(i8*, i64, i64, i16);
466 declare i32 @array_at_int32_t_checked(i8*, i64, i64, i32);
467 declare i64 @array_at_int64_t_checked(i8*, i64, i64, i64);
468 declare float @array_at_float_checked(i8*, i64, i64, float);
469 declare double @array_at_double_checked(i8*, i64, i64, double);
470 declare i32 @char_length(i8*, i32);
471 declare i32 @char_length_nullable(i8*, i32, i32);
472 declare i32 @char_length_encoded(i8*, i32);
473 declare i32 @char_length_encoded_nullable(i8*, i32, i32);
474 declare i32 @key_for_string_encoded(i32);
475 declare i1 @string_like(i8*, i32, i8*, i32, i8);
476 declare i1 @string_ilike(i8*, i32, i8*, i32, i8);
477 declare i8 @string_like_nullable(i8*, i32, i8*, i32, i8, i8);
478 declare i8 @string_ilike_nullable(i8*, i32, i8*, i32, i8, i8);
479 declare i1 @string_like_simple(i8*, i32, i8*, i32);
480 declare i1 @string_ilike_simple(i8*, i32, i8*, i32);
481 declare i8 @string_like_simple_nullable(i8*, i32, i8*, i32, i8);
482 declare i8 @string_ilike_simple_nullable(i8*, i32, i8*, i32, i8);
483 declare i1 @string_lt(i8*, i32, i8*, i32);
484 declare i1 @string_le(i8*, i32, i8*, i32);
485 declare i1 @string_gt(i8*, i32, i8*, i32);
486 declare i1 @string_ge(i8*, i32, i8*, i32);
487 declare i1 @string_eq(i8*, i32, i8*, i32);
488 declare i1 @string_ne(i8*, i32, i8*, i32);
489 declare i8 @string_lt_nullable(i8*, i32, i8*, i32, i8);
490 declare i8 @string_le_nullable(i8*, i32, i8*, i32, i8);
491 declare i8 @string_gt_nullable(i8*, i32, i8*, i32, i8);
492 declare i8 @string_ge_nullable(i8*, i32, i8*, i32, i8);
493 declare i8 @string_eq_nullable(i8*, i32, i8*, i32, i8);
494 declare i8 @string_ne_nullable(i8*, i32, i8*, i32, i8);
495 declare i1 @regexp_like(i8*, i32, i8*, i32, i8);
496 declare i8 @regexp_like_nullable(i8*, i32, i8*, i32, i8, i8);
497 declare void @linear_probabilistic_count(i8*, i32, i8*, i32);
498 declare void @agg_count_distinct_bitmap_gpu(i64*, i64, i64, i64, i64, i64, i64);
499 declare void @agg_count_distinct_bitmap_skip_val_gpu(i64*, i64, i64, i64, i64, i64, i64, i64);
500 declare void @agg_approximate_count_distinct_gpu(i64*, i64, i32, i64, i64);
501 declare i32 @record_error_code(i32, i32*);
502 declare i1 @dynamic_watchdog();
503 declare void @force_sync();
504 declare void @sync_warp();
505 declare void @sync_warp_protected(i64, i64);
506 declare i64* @get_bin_from_k_heap_int32_t(i64*, i32, i32, i32, i1, i1, i1, i32, i32);
507 declare i64* @get_bin_from_k_heap_int64_t(i64*, i32, i32, i32, i1, i1, i1, i64, i64);
508 declare i64* @get_bin_from_k_heap_float(i64*, i32, i32, i32, i1, i1, i1, float, float);
509 declare i64* @get_bin_from_k_heap_double(i64*, i32, i32, i32, i1, i1, i1, double, double);
510 )" + gen_array_any_all_sigs() +
511  gen_translate_null_key_sigs();
512 
513 #ifdef HAVE_CUDA
514 std::string extension_function_decls() {
515  const auto decls = ExtensionFunctionsWhitelist::getLLVMDeclarations();
516  return boost::algorithm::join(decls, "\n");
517 }
518 
519 void legalize_nvvm_ir(llvm::Function* query_func) {
520  // optimizations might add attributes to the function
521  // and NVPTX doesn't understand all of them; play it
522  // safe and clear all attributes
523  clear_function_attributes(query_func);
524  verify_function_ir(query_func);
525 
526  std::vector<llvm::Instruction*> unsupported_intrinsics;
527  for (auto& BB : *query_func) {
528  for (llvm::Instruction& I : BB) {
529  if (const llvm::IntrinsicInst* II = llvm::dyn_cast<llvm::IntrinsicInst>(&I)) {
530  if (II->getIntrinsicID() == llvm::Intrinsic::stacksave ||
531  II->getIntrinsicID() == llvm::Intrinsic::stackrestore) {
532  unsupported_intrinsics.push_back(&I);
533  }
534  }
535  }
536  }
537 
538  for (auto& II : unsupported_intrinsics) {
539  II->eraseFromParent();
540  }
541 }
542 #endif // HAVE_CUDA
543 
544 void link_udf_module(const std::unique_ptr<llvm::Module>& udf_module,
545  llvm::Module& module,
546  CgenState* cgen_state,
547  llvm::Linker::Flags flags = llvm::Linker::Flags::None) {
548  // Throw a runtime error if the target module already contains functions
549  // with the same names as functions in the UDF module.
550  for (auto& f : *udf_module.get()) {
551  auto func = module.getFunction(f.getName());
552  if (!(func == nullptr)) {
553  LOG(FATAL) << " Attempt to overwrite " << f.getName().str() << " in "
554  << module.getModuleIdentifier() << " from `"
555  << udf_module->getModuleIdentifier() << "`" << std::endl;
556  throw std::runtime_error(
557  "link_udf_module: *** attempt to overwrite a runtime function with a UDF "
558  "function ***");
559  } else {
560  LOG(INFO) << " Adding " << f.getName().str() << " to "
561  << module.getModuleIdentifier() << " from `"
562  << udf_module->getModuleIdentifier() << "`" << std::endl;
563  }
564  }
565 
566  std::unique_ptr<llvm::Module> udf_module_copy;
567 
568  udf_module_copy = llvm::CloneModule(
569 #if LLVM_VERSION_MAJOR >= 7
570  *udf_module.get(),
571 #else
572  udf_module.get(),
573 #endif
574  cgen_state->vmap_);
575 
576  udf_module_copy->setDataLayout(module.getDataLayout());
577  udf_module_copy->setTargetTriple(module.getTargetTriple());
578 
579  // Initialize linker with module for RuntimeFunctions.bc
580  llvm::Linker ld(module);
581  bool link_error = false;
582 
583  link_error = ld.linkInModule(std::move(udf_module_copy), flags);
584 
585  if (link_error) {
586  throw std::runtime_error("link_udf_module: *** error linking module ***");
587  }
588 }
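// A hypothetical call site (sketch only, not from this file): once read_udf_cpu_module()
// below has populated udf_cpu_module, the UDFs can be merged into the generated query
// module with
//
//   link_udf_module(udf_cpu_module, *cgen_state->module_, cgen_state);
//
// keeping the default llvm::Linker::Flags::None so the "no overwrite" check above
// stays strict.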
589 
590 } // namespace
591 
592 llvm::StringRef get_gpu_target_triple_string() {
593  return llvm::StringRef("nvptx64-nvidia-cuda");
594 }
595 
596 llvm::StringRef get_gpu_data_layout() {
597  return llvm::StringRef(
598  "e-p:64:64:64-i1:8:8-i8:8:8-"
599  "i16:16:16-i32:32:32-i64:64:64-"
600  "f32:32:32-f64:64:64-v16:16:16-"
601  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
602 }
603 
604 std::map<std::string, std::string> get_device_parameters() {
605  std::map<std::string, std::string> result;
606 
607  result.insert(std::make_pair("cpu_name", llvm::sys::getHostCPUName()));
608  result.insert(std::make_pair("cpu_triple", llvm::sys::getProcessTriple()));
609  result.insert(
610  std::make_pair("cpu_cores", std::to_string(llvm::sys::getHostNumPhysicalCores())));
611  result.insert(std::make_pair("cpu_threads", std::to_string(cpu_threads())));
612 
613  llvm::StringMap<bool> cpu_features;
614  if (llvm::sys::getHostCPUFeatures(cpu_features)) {
615  std::string features_str = "";
616  for (auto it = cpu_features.begin(); it != cpu_features.end(); ++it) {
617  features_str += (it->getValue() ? " +" : " -");
618  features_str += it->getKey().str();
619  }
620  result.insert(std::make_pair("cpu_features", features_str));
621  }
622 
623 #ifdef HAVE_CUDA
624  int device_count = 0;
625  checkCudaErrors(cuDeviceGetCount(&device_count));
626  if (device_count) {
627  CUdevice device{};
628  char device_name[256];
629  int major = 0, minor = 0;
630  checkCudaErrors(cuDeviceGet(&device, 0)); // assuming homogeneous multi-GPU system
631  checkCudaErrors(cuDeviceGetName(device_name, 256, device));
632  checkCudaErrors(cuDeviceGetAttribute(
633  &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
634  checkCudaErrors(cuDeviceGetAttribute(
635  &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device));
636 
637  result.insert(std::make_pair("gpu_name", device_name));
638  result.insert(std::make_pair("gpu_count", std::to_string(device_count)));
639  result.insert(std::make_pair("gpu_compute_capability",
640  std::to_string(major) + "." + std::to_string(minor)));
641  result.insert(std::make_pair("gpu_triple", get_gpu_target_triple_string()));
642  result.insert(std::make_pair("gpu_datalayout", get_gpu_data_layout()));
643  }
644 #endif
645 
646  return result;
647 }
648 
649 CodeGenerator::GPUCode CodeGenerator::generateNativeGPUCode(
650  llvm::Function* func,
651  llvm::Function* wrapper_func,
652  const std::unordered_set<llvm::Function*>& live_funcs,
653  const CompilationOptions& co,
654  const GPUTarget& gpu_target) {
655 #ifdef HAVE_CUDA
656  auto module = func->getParent();
657  module->setDataLayout(
658  "e-p:64:64:64-i1:8:8-i8:8:8-"
659  "i16:16:16-i32:32:32-i64:64:64-"
660  "f32:32:32-f64:64:64-v16:16:16-"
661  "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
662  module->setTargetTriple("nvptx64-nvidia-cuda");
663  // run optimizations
664  optimize_ir(func, module, live_funcs, co);
665  legalize_nvvm_ir(func);
666 
667  std::stringstream ss;
668  llvm::raw_os_ostream os(ss);
669 
670  llvm::LLVMContext& ctx = module->getContext();
671  // Get "nvvm.annotations" metadata node
672  llvm::NamedMDNode* md = module->getOrInsertNamedMetadata("nvvm.annotations");
673 
674  llvm::Metadata* md_vals[] = {llvm::ConstantAsMetadata::get(wrapper_func),
675  llvm::MDString::get(ctx, "kernel"),
676  llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
677  llvm::Type::getInt32Ty(ctx), 1))};
678 
679  // Append metadata to nvvm.annotations
680  md->addOperand(llvm::MDNode::get(ctx, md_vals));
681 
682  std::unordered_set<llvm::Function*> roots{wrapper_func, func};
683  if (gpu_target.row_func_not_inlined) {
685  roots.insert(gpu_target.cgen_state->row_func_);
686  }
687 
688  // Prevent the udf function(s) from being removed the way the runtime functions are
689 
690  if (is_udf_module_present()) {
691  for (auto& f : udf_gpu_module->getFunctionList()) {
692  llvm::Function* udf_function = module->getFunction(f.getName());
693 
694  if (udf_function) {
695  legalize_nvvm_ir(udf_function);
696  roots.insert(udf_function);
697  }
698  }
699  }
700 
701  if (is_rt_udf_module_present()) {
702  for (auto& f : rt_udf_gpu_module->getFunctionList()) {
703  llvm::Function* udf_function = module->getFunction(f.getName());
704  if (udf_function) {
705  legalize_nvvm_ir(udf_function);
706  roots.insert(udf_function);
707  }
708  }
709  }
710 
711  std::vector<llvm::Function*> rt_funcs;
712  for (auto& Fn : *module) {
713  if (roots.count(&Fn)) {
714  continue;
715  }
716  rt_funcs.push_back(&Fn);
717  }
718  for (auto& pFn : rt_funcs) {
719  pFn->removeFromParent();
720  }
721  module->print(os, nullptr);
722  os.flush();
723  for (auto& pFn : rt_funcs) {
724  module->getFunctionList().push_back(pFn);
725  }
726  module->eraseNamedMetadata(md);
727 
728  auto cuda_llir = cuda_rt_decls + extension_function_decls() + ss.str();
729 
730  std::vector<std::pair<void*, void*>> native_functions;
731  std::vector<std::tuple<void*, GpuCompilationContext*>> cached_functions;
732 
733  const auto ptx =
734  generatePTX(cuda_llir, gpu_target.nvptx_target_machine, gpu_target.cgen_state);
735 
736  LOG(PTX) << "PTX for the GPU:\n" << ptx << "\nEnd of PTX";
737 
738  auto cubin_result = ptx_to_cubin(ptx, gpu_target.block_size, gpu_target.cuda_mgr);
739  auto& option_keys = cubin_result.option_keys;
740  auto& option_values = cubin_result.option_values;
741  auto cubin = cubin_result.cubin;
742  auto link_state = cubin_result.link_state;
743  const auto num_options = option_keys.size();
744 
745  auto func_name = wrapper_func->getName().str();
746  for (int device_id = 0; device_id < gpu_target.cuda_mgr->getDeviceCount();
747  ++device_id) {
748  auto gpu_context = new GpuCompilationContext(cubin,
749  func_name,
750  device_id,
751  gpu_target.cuda_mgr,
752  num_options,
753  &option_keys[0],
754  &option_values[0]);
755  auto native_code = gpu_context->kernel();
756  auto native_module = gpu_context->module();
757  CHECK(native_code);
758  CHECK(native_module);
759  native_functions.emplace_back(native_code, native_module);
760  cached_functions.emplace_back(native_code, gpu_context);
761  }
762 
763  checkCudaErrors(cuLinkDestroy(link_state));
764 
765  return {native_functions, cached_functions};
766 #else
767  return {};
768 #endif
769 }
770 
771 std::vector<std::pair<void*, void*>> Executor::optimizeAndCodegenGPU(
772  llvm::Function* query_func,
773  llvm::Function* multifrag_query_func,
774  std::unordered_set<llvm::Function*>& live_funcs,
775  const bool no_inline,
776  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
777  const CompilationOptions& co) {
778 #ifdef HAVE_CUDA
779  auto module = multifrag_query_func->getParent();
780  CHECK(cuda_mgr);
781  CodeCacheKey key{serialize_llvm_object(query_func),
782  serialize_llvm_object(cgen_state_->row_func_)};
783  for (const auto helper : cgen_state_->helper_functions_) {
784  key.push_back(serialize_llvm_object(helper));
785  }
786  auto cached_code = getCodeFromCache(key, gpu_code_cache_);
787  if (!cached_code.empty()) {
788  return cached_code;
789  }
790 
791  bool row_func_not_inlined = false;
792  if (no_inline) {
793  for (auto it = llvm::inst_begin(cgen_state_->row_func_),
794  e = llvm::inst_end(cgen_state_->row_func_);
795  it != e;
796  ++it) {
797  if (llvm::isa<llvm::CallInst>(*it)) {
798  auto& get_gv_call = llvm::cast<llvm::CallInst>(*it);
799  if (get_gv_call.getCalledFunction()->getName() == "get_group_value" ||
800  get_gv_call.getCalledFunction()->getName() ==
801  "get_group_value_with_watchdog" ||
802  get_gv_call.getCalledFunction()->getName() ==
803  "get_matching_group_value_perfect_hash" ||
804  get_gv_call.getCalledFunction()->getName() == "array_size" ||
805  get_gv_call.getCalledFunction()->getName() == "linear_probabilistic_count") {
806  mark_function_never_inline(cgen_state_->row_func_);
807  row_func_not_inlined = true;
808  break;
809  }
810  }
811  }
812  }
813 
814  initializeNVPTXBackend();
815  CodeGenerator::GPUTarget gpu_target{nvptx_target_machine_.get(),
816  cuda_mgr,
817  blockSize(),
818  cgen_state_.get(),
819  row_func_not_inlined};
820  const auto gpu_code = CodeGenerator::generateNativeGPUCode(
821  query_func, multifrag_query_func, live_funcs, co, gpu_target);
822 
823  addCodeToCache(key, gpu_code.cached_functions, module, gpu_code_cache_);
824 
825  return gpu_code.native_functions;
826 #else
827  return {};
828 #endif
829 }
830 
831 std::string CodeGenerator::generatePTX(const std::string& cuda_llir,
832  llvm::TargetMachine* nvptx_target_machine,
833  CgenState* cgen_state) {
834  auto mem_buff = llvm::MemoryBuffer::getMemBuffer(cuda_llir, "", false);
835 
836  llvm::SMDiagnostic err;
837 
838  auto module = llvm::parseIR(mem_buff->getMemBufferRef(), err, cgen_state->context_);
839  if (!module) {
840  LOG(FATAL) << err.getMessage().str();
841  }
842 
843  llvm::SmallString<256> code_str;
844  llvm::raw_svector_ostream formatted_os(code_str);
845  CHECK(nvptx_target_machine);
846  {
847  llvm::legacy::PassManager ptxgen_pm;
848  module->setDataLayout(nvptx_target_machine->createDataLayout());
849 
850 #if LLVM_VERSION_MAJOR >= 7
851  nvptx_target_machine->addPassesToEmitFile(
852  ptxgen_pm, formatted_os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
853 #else
854  nvptx_target_machine->addPassesToEmitFile(
855  ptxgen_pm, formatted_os, llvm::TargetMachine::CGFT_AssemblyFile);
856 #endif
857  ptxgen_pm.run(*module);
858  }
859 
860  return code_str.str();
861 }
862 
863 std::unique_ptr<llvm::TargetMachine> CodeGenerator::initializeNVPTXBackend() {
864  llvm::InitializeAllTargets();
865  llvm::InitializeAllTargetMCs();
866  llvm::InitializeAllAsmPrinters();
867  std::string err;
868  auto target = llvm::TargetRegistry::lookupTarget("nvptx64", err);
869  if (!target) {
870  LOG(FATAL) << err;
871  }
872  return std::unique_ptr<llvm::TargetMachine>(target->createTargetMachine(
873  "nvptx64-nvidia-cuda", "sm_30", "", llvm::TargetOptions(), llvm::Reloc::Static));
874 }
875 
876 std::string Executor::generatePTX(const std::string& cuda_llir) const {
877  return CodeGenerator::generatePTX(
878  cuda_llir, nvptx_target_machine_.get(), cgen_state_.get());
879 }
880 
881 void Executor::initializeNVPTXBackend() const {
882  if (nvptx_target_machine_) {
883  return;
884  }
885  nvptx_target_machine_ = CodeGenerator::initializeNVPTXBackend();
886 }
887 
888 llvm::Module* read_template_module(llvm::LLVMContext& context) {
889  llvm::SMDiagnostic err;
890 
891  auto buffer_or_error = llvm::MemoryBuffer::getFile(mapd_root_abs_path() +
892  "/QueryEngine/RuntimeFunctions.bc");
893  CHECK(!buffer_or_error.getError());
894  llvm::MemoryBuffer* buffer = buffer_or_error.get().get();
895 
896  auto owner = llvm::parseBitcodeFile(buffer->getMemBufferRef(), context);
897  CHECK(!owner.takeError());
898  auto module = owner.get().release();
899  CHECK(module);
900 
901  return module;
902 }
903 
904 namespace {
905 
906 void bind_pos_placeholders(const std::string& pos_fn_name,
907  const bool use_resume_param,
908  llvm::Function* query_func,
909  llvm::Module* module) {
910  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
911  ++it) {
912  if (!llvm::isa<llvm::CallInst>(*it)) {
913  continue;
914  }
915  auto& pos_call = llvm::cast<llvm::CallInst>(*it);
916  if (std::string(pos_call.getCalledFunction()->getName()) == pos_fn_name) {
917  if (use_resume_param) {
918  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
919  llvm::ReplaceInstWithInst(
920  &pos_call,
921  llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl"),
922  error_code_arg));
923  } else {
924  llvm::ReplaceInstWithInst(
925  &pos_call,
926  llvm::CallInst::Create(module->getFunction(pos_fn_name + "_impl")));
927  }
928  break;
929  }
930  }
931 }
932 
933 std::vector<llvm::Value*> generate_column_heads_load(const int num_columns,
934  llvm::Function* query_func,
935  llvm::LLVMContext& context) {
936  auto max_col_local_id = num_columns - 1;
937  auto& fetch_bb = query_func->front();
938  llvm::IRBuilder<> fetch_ir_builder(&fetch_bb);
939  fetch_ir_builder.SetInsertPoint(&*fetch_bb.begin());
940  auto& byte_stream_arg = *query_func->args().begin();
941  std::vector<llvm::Value*> col_heads;
942  for (int col_id = 0; col_id <= max_col_local_id; ++col_id) {
943  col_heads.emplace_back(fetch_ir_builder.CreateLoad(fetch_ir_builder.CreateGEP(
944  &byte_stream_arg,
945  llvm::ConstantInt::get(llvm::Type::getInt32Ty(context), col_id))));
946  }
947  return col_heads;
948 }
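// Illustrative effect (a sketch; value names are made up): for num_columns == 2 the
// entry block of query_func gains loads roughly equivalent to
//
//   %gep0  = getelementptr i8*, i8** %byte_stream, i32 0
//   %head0 = load i8*, i8** %gep0
//   %gep1  = getelementptr i8*, i8** %byte_stream, i32 1
//   %head1 = load i8*, i8** %gep1
//
// so the row function can receive each column buffer directly instead of indexing
// byte_stream on every access.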
949 
950 void set_row_func_argnames(llvm::Function* row_func,
951  const size_t in_col_count,
952  const size_t agg_col_count,
953  const bool hoist_literals) {
954  auto arg_it = row_func->arg_begin();
955 
956  if (agg_col_count) {
957  for (size_t i = 0; i < agg_col_count; ++i) {
958  arg_it->setName("out");
959  ++arg_it;
960  }
961  } else {
962  arg_it->setName("group_by_buff");
963  ++arg_it;
964  arg_it->setName("crt_matched");
965  ++arg_it;
966  arg_it->setName("total_matched");
967  ++arg_it;
968  arg_it->setName("old_total_matched");
969  ++arg_it;
970  arg_it->setName("max_matched");
971  ++arg_it;
972  }
973 
974  arg_it->setName("agg_init_val");
975  ++arg_it;
976 
977  arg_it->setName("pos");
978  ++arg_it;
979 
980  arg_it->setName("frag_row_off");
981  ++arg_it;
982 
983  arg_it->setName("num_rows_per_scan");
984  ++arg_it;
985 
986  if (hoist_literals) {
987  arg_it->setName("literals");
988  ++arg_it;
989  }
990 
991  for (size_t i = 0; i < in_col_count; ++i) {
992  arg_it->setName("col_buf" + std::to_string(i));
993  ++arg_it;
994  }
995 
996  arg_it->setName("join_hash_tables");
997 }
998 
999 std::pair<llvm::Function*, std::vector<llvm::Value*>> create_row_function(
1000  const size_t in_col_count,
1001  const size_t agg_col_count,
1002  const bool hoist_literals,
1003  llvm::Function* query_func,
1004  llvm::Module* module,
1005  llvm::LLVMContext& context) {
1006  std::vector<llvm::Type*> row_process_arg_types;
1007 
1008  if (agg_col_count) {
1009  // output (aggregate) arguments
1010  for (size_t i = 0; i < agg_col_count; ++i) {
1011  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1012  }
1013  } else {
1014  // group by buffer
1015  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1016  // current match count
1017  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1018  // total match count passed from the caller
1019  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1020  // old total match count returned to the caller
1021  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1022  // max matched (total number of slots in the output buffer)
1023  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1024  }
1025 
1026  // aggregate init values
1027  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1028 
1029  // position argument
1030  row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));
1031 
1032  // fragment row offset argument
1033  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1034 
1035  // number of rows for each scan
1036  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1037 
1038  // literals buffer argument
1039  if (hoist_literals) {
1040  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
1041  }
1042 
1043  // Generate the function signature and column head fetches s.t.
1044  // double indirection isn't needed in the inner loop
1045  auto col_heads = generate_column_heads_load(in_col_count, query_func, context);
1046  CHECK_EQ(in_col_count, col_heads.size());
1047 
1048  // column buffer arguments
1049  for (size_t i = 0; i < in_col_count; ++i) {
1050  row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
1051  }
1052 
1053  // join hash table argument
1054  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1055 
1056  // generate the function
1057  auto ft =
1058  llvm::FunctionType::get(get_int_type(32, context), row_process_arg_types, false);
1059 
1060  auto row_func =
1061  llvm::Function::Create(ft, llvm::Function::ExternalLinkage, "row_func", module);
1062 
1063  // set the row function argument names; for debugging purposes only
1064  set_row_func_argnames(row_func, in_col_count, agg_col_count, hoist_literals);
1065 
1066  return std::make_pair(row_func, col_heads);
1067 }
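// The resulting signature (a sketch, for agg_col_count = 1, hoist_literals = true and
// in_col_count = 2, with names assigned by set_row_func_argnames purely for debugging):
//
//   i32 @row_func(i64* %out, i64* %agg_init_val, i64 %pos, i64* %frag_row_off,
//                 i64* %num_rows_per_scan, i8* %literals,
//                 i8* %col_buf0, i8* %col_buf1, i64* %join_hash_tables)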
1068 
1069 void bind_query(llvm::Function* query_func,
1070  const std::string& query_fname,
1071  llvm::Function* multifrag_query_func,
1072  llvm::Module* module) {
1073  std::vector<llvm::CallInst*> query_stubs;
1074  for (auto it = llvm::inst_begin(multifrag_query_func),
1075  e = llvm::inst_end(multifrag_query_func);
1076  it != e;
1077  ++it) {
1078  if (!llvm::isa<llvm::CallInst>(*it)) {
1079  continue;
1080  }
1081  auto& query_call = llvm::cast<llvm::CallInst>(*it);
1082  if (std::string(query_call.getCalledFunction()->getName()) == query_fname) {
1083  query_stubs.push_back(&query_call);
1084  }
1085  }
1086  for (auto& S : query_stubs) {
1087  std::vector<llvm::Value*> args;
1088  for (size_t i = 0; i < S->getNumArgOperands(); ++i) {
1089  args.push_back(S->getArgOperand(i));
1090  }
1091  llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args, ""));
1092  }
1093 }
1094 
1095 std::vector<std::string> get_agg_fnames(const std::vector<Analyzer::Expr*>& target_exprs,
1096  const bool is_group_by) {
1097  std::vector<std::string> result;
1098  for (size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
1099  ++target_idx, ++agg_col_idx) {
1100  const auto target_expr = target_exprs[target_idx];
1101  CHECK(target_expr);
1102  const auto target_type_info = target_expr->get_type_info();
1103  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
1104  const bool is_varlen =
1105  (target_type_info.is_string() &&
1106  target_type_info.get_compression() == kENCODING_NONE) ||
1107  target_type_info.is_array(); // TODO: should it use is_varlen_array() ?
1108  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
1109  result.emplace_back(target_type_info.is_fp() ? "agg_id_double" : "agg_id");
1110  if (is_varlen) {
1111  result.emplace_back("agg_id");
1112  }
1113  if (target_type_info.is_geometry()) {
1114  result.emplace_back("agg_id");
1115  for (auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
1116  result.emplace_back("agg_id");
1117  }
1118  }
1119  continue;
1120  }
1121  const auto agg_type = agg_expr->get_aggtype();
1122  const auto& agg_type_info =
1123  agg_type != kCOUNT ? agg_expr->get_arg()->get_type_info() : target_type_info;
1124  switch (agg_type) {
1125  case kAVG: {
1126  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1127  !agg_type_info.is_fp()) {
1128  throw std::runtime_error("AVG is only valid on integer and floating point");
1129  }
1130  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1131  ? "agg_sum"
1132  : "agg_sum_double");
1133  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1134  ? "agg_count"
1135  : "agg_count_double");
1136  break;
1137  }
1138  case kMIN: {
1139  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1140  agg_type_info.is_geometry()) {
1141  throw std::runtime_error(
1142  "MIN on strings, arrays or geospatial types not supported yet");
1143  }
1144  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1145  ? "agg_min"
1146  : "agg_min_double");
1147  break;
1148  }
1149  case kMAX: {
1150  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1151  agg_type_info.is_geometry()) {
1152  throw std::runtime_error(
1153  "MAX on strings, arrays or geospatial types not supported yet");
1154  }
1155  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1156  ? "agg_max"
1157  : "agg_max_double");
1158  break;
1159  }
1160  case kSUM: {
1161  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1162  !agg_type_info.is_fp()) {
1163  throw std::runtime_error("SUM is only valid on integer and floating point");
1164  }
1165  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1166  ? "agg_sum"
1167  : "agg_sum_double");
1168  break;
1169  }
1170  case kCOUNT:
1171  result.emplace_back(agg_expr->get_is_distinct() ? "agg_count_distinct"
1172  : "agg_count");
1173  break;
1174  case kSAMPLE: {
1175  // Note that varlen SAMPLE arguments are handled separately above
1176  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1177  break;
1178  }
1179  case kAPPROX_COUNT_DISTINCT:
1180  result.emplace_back("agg_approximate_count_distinct");
1181  break;
1182  default:
1183  CHECK(false);
1184  }
1185  }
1186  return result;
1187 }
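// A worked example of the mapping above (sketch): for COUNT(*), AVG(d) on a double
// column and MAX(i) on an integer column, the result is
//
//   {"agg_count", "agg_sum_double", "agg_count_double", "agg_max"}
//
// AVG expands into a sum/count pair; the final division happens when the results are
// reduced.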
1188 
1189 } // namespace
1190 
1191 std::unique_ptr<llvm::Module> g_rt_module(read_template_module(getGlobalLLVMContext()));
1192 
1193 bool is_udf_module_present(bool cpu_only) {
1194  return (cpu_only || udf_gpu_module != nullptr) && (udf_cpu_module != nullptr);
1195 }
1196 
1197 bool is_rt_udf_module_present(bool cpu_only) {
1198  return (cpu_only || rt_udf_gpu_module != nullptr) && (rt_udf_cpu_module != nullptr);
1199 }
1200 
1201 void throw_parseIR_error(const llvm::SMDiagnostic& parse_error, std::string src = "") {
1202  std::string excname = "LLVM IR ParseError: ";
1203  llvm::raw_string_ostream ss(excname);
1204  parse_error.print(src.c_str(), ss, false, false);
1205  throw std::runtime_error(ss.str());
1206 }
1207 
1208 void read_udf_gpu_module(const std::string& udf_ir_filename) {
1209  llvm::SMDiagnostic parse_error;
1210 
1211  llvm::StringRef file_name_arg(udf_ir_filename);
1212 
1213  udf_gpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
1214  if (!udf_gpu_module) {
1215  throw_parseIR_error(parse_error, udf_ir_filename);
1216  }
1217 }
1218 
1219 void read_udf_cpu_module(const std::string& udf_ir_filename) {
1220  llvm::SMDiagnostic parse_error;
1221 
1222  llvm::StringRef file_name_arg(udf_ir_filename);
1223 
1224  udf_cpu_module = llvm::parseIRFile(file_name_arg, parse_error, getGlobalLLVMContext());
1225  if (!udf_cpu_module) {
1226  throw_parseIR_error(parse_error, udf_ir_filename);
1227  }
1228 }
1229 
1230 void read_rt_udf_gpu_module(const std::string& udf_ir_string) {
1231  llvm::SMDiagnostic parse_error;
1232 
1233  auto buf =
1234  std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for GPU");
1235 
1236  rt_udf_gpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
1237  if (!rt_udf_gpu_module) {
1238  throw_parseIR_error(parse_error);
1239  }
1240 }
1241 
1242 void read_rt_udf_cpu_module(const std::string& udf_ir_string) {
1243  llvm::SMDiagnostic parse_error;
1244 
1245  auto buf =
1246  std::make_unique<llvm::MemoryBufferRef>(udf_ir_string, "Runtime UDF for CPU");
1247 
1248  rt_udf_cpu_module = llvm::parseIR(*buf, parse_error, getGlobalLLVMContext());
1249  if (!rt_udf_cpu_module) {
1250  throw_parseIR_error(parse_error);
1251  }
1252 }
1253 
1254 std::unordered_set<llvm::Function*> CodeGenerator::markDeadRuntimeFuncs(
1255  llvm::Module& module,
1256  const std::vector<llvm::Function*>& roots,
1257  const std::vector<llvm::Function*>& leaves) {
1258  std::unordered_set<llvm::Function*> live_funcs;
1259  live_funcs.insert(roots.begin(), roots.end());
1260  live_funcs.insert(leaves.begin(), leaves.end());
1261 
1262  if (auto F = module.getFunction("init_shared_mem_nop")) {
1263  live_funcs.insert(F);
1264  }
1265  if (auto F = module.getFunction("write_back_nop")) {
1266  live_funcs.insert(F);
1267  }
1268 
1269  for (const llvm::Function* F : roots) {
1270  for (const llvm::BasicBlock& BB : *F) {
1271  for (const llvm::Instruction& I : BB) {
1272  if (const llvm::CallInst* CI = llvm::dyn_cast<const llvm::CallInst>(&I)) {
1273  live_funcs.insert(CI->getCalledFunction());
1274  }
1275  }
1276  }
1277  }
1278 
1279  for (llvm::Function& F : module) {
1280  if (!live_funcs.count(&F) && !F.isDeclaration()) {
1281  F.setLinkage(llvm::GlobalValue::InternalLinkage);
1282  }
1283  }
1284 
1285  return live_funcs;
1286 }
1287 
1288 namespace {
1289 // searches for a particular variable within a specific basic block (or all if bb_name is
1290 // empty)
1291 template <typename InstType>
1292 llvm::Value* find_variable_in_basic_block(llvm::Function* func,
1293  std::string bb_name,
1294  std::string variable_name) {
1295  llvm::Value* result = nullptr;
1296  if (func == nullptr || variable_name.empty()) {
1297  return result;
1298  }
1299  bool is_found = false;
1300  for (auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
1301  if (!bb_name.empty() && bb_it->getName() != bb_name) {
1302  continue;
1303  }
1304  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
1305  if (llvm::isa<InstType>(*inst_it)) {
1306  if (inst_it->getName() == variable_name) {
1307  result = &*inst_it;
1308  is_found = true;
1309  break;
1310  }
1311  }
1312  }
1313  }
1314  return result;
1315 }
1316 }; // namespace
1317 
1318 void Executor::createErrorCheckControlFlow(llvm::Function* query_func,
1319  bool run_with_dynamic_watchdog,
1320  ExecutorDeviceType device_type) {
1321  // check whether the row processing was successful; currently, it can
1322  // fail by running out of group by buffer slots
1323 
1324  llvm::Value* row_count = nullptr;
1325  if (run_with_dynamic_watchdog && device_type == ExecutorDeviceType::GPU) {
1326  row_count =
1327  find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "row_count");
1328  }
1329 
1330  bool done_splitting = false;
1331  for (auto bb_it = query_func->begin(); bb_it != query_func->end() && !done_splitting;
1332  ++bb_it) {
1333  llvm::Value* pos = nullptr;
1334  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); ++inst_it) {
1335  if (run_with_dynamic_watchdog && llvm::isa<llvm::PHINode>(*inst_it)) {
1336  if (inst_it->getName() == "pos") {
1337  pos = &*inst_it;
1338  }
1339  continue;
1340  }
1341  if (!llvm::isa<llvm::CallInst>(*inst_it)) {
1342  continue;
1343  }
1344  auto& filter_call = llvm::cast<llvm::CallInst>(*inst_it);
1345  if (std::string(filter_call.getCalledFunction()->getName()) == "row_process") {
1346  auto next_inst_it = inst_it;
1347  ++next_inst_it;
1348  auto new_bb = bb_it->splitBasicBlock(next_inst_it);
1349  auto& br_instr = bb_it->back();
1350  llvm::IRBuilder<> ir_builder(&br_instr);
1351  llvm::Value* err_lv = &*inst_it;
1352  if (run_with_dynamic_watchdog) {
1353  CHECK(pos);
1354  llvm::Value* call_watchdog_lv = nullptr;
1355  if (device_type == ExecutorDeviceType::GPU) {
1356  // In order to make sure all threads within a block see the same barrier,
1357  // only those blocks in which none of the threads have experienced the critical
1358  // edge will go through the dynamic watchdog computation
1359  CHECK(row_count);
1360  auto crit_edge_rem =
1361  (blockSize() & (blockSize() - 1))
1362  ? ir_builder.CreateSRem(
1363  row_count,
1364  cgen_state_->llInt(static_cast<int64_t>(blockSize())))
1365  : ir_builder.CreateAnd(
1366  row_count,
1367  cgen_state_->llInt(static_cast<int64_t>(blockSize() - 1)));
1368  auto crit_edge_threshold = ir_builder.CreateSub(row_count, crit_edge_rem);
1369  crit_edge_threshold->setName("crit_edge_threshold");
1370 
1371  // only those threads where pos < crit_edge_threshold go through dynamic
1372  // watchdog call
1373  call_watchdog_lv =
1374  ir_builder.CreateICmp(llvm::ICmpInst::ICMP_SLT, pos, crit_edge_threshold);
1375  } else {
1376  // CPU path: run watchdog for every 64th row
1377  auto dw_predicate = ir_builder.CreateAnd(pos, uint64_t(0x3f));
1378  call_watchdog_lv = ir_builder.CreateICmp(
1379  llvm::ICmpInst::ICMP_EQ, dw_predicate, cgen_state_->llInt(int64_t(0LL)));
1380  }
1381  CHECK(call_watchdog_lv);
1382  auto error_check_bb = bb_it->splitBasicBlock(
1383  llvm::BasicBlock::iterator(br_instr), ".error_check");
1384  auto& watchdog_br_instr = bb_it->back();
1385 
1386  auto watchdog_check_bb = llvm::BasicBlock::Create(
1387  cgen_state_->context_, ".watchdog_check", query_func, error_check_bb);
1388  llvm::IRBuilder<> watchdog_ir_builder(watchdog_check_bb);
1389  auto detected_timeout = watchdog_ir_builder.CreateCall(
1390  cgen_state_->module_->getFunction("dynamic_watchdog"), {});
1391  auto timeout_err_lv = watchdog_ir_builder.CreateSelect(
1392  detected_timeout, cgen_state_->llInt(Executor::ERR_OUT_OF_TIME), err_lv);
1393  watchdog_ir_builder.CreateBr(error_check_bb);
1394 
1395  llvm::ReplaceInstWithInst(
1396  &watchdog_br_instr,
1397  llvm::BranchInst::Create(
1398  watchdog_check_bb, error_check_bb, call_watchdog_lv));
1399  ir_builder.SetInsertPoint(&br_instr);
1400  auto unified_err_lv = ir_builder.CreatePHI(err_lv->getType(), 2);
1401 
1402  unified_err_lv->addIncoming(timeout_err_lv, watchdog_check_bb);
1403  unified_err_lv->addIncoming(err_lv, &*bb_it);
1404  err_lv = unified_err_lv;
1405  }
1406  const auto error_code_arg = get_arg_by_name(query_func, "error_code");
1407  err_lv =
1408  ir_builder.CreateCall(cgen_state_->module_->getFunction("record_error_code"),
1409  std::vector<llvm::Value*>{err_lv, error_code_arg});
1410  if (device_type == ExecutorDeviceType::GPU) {
1411  // let kernel execution finish as expected, regardless of the observed error,
1412  // unless it is from the dynamic watchdog where all threads within that block
1413  // return together.
1414  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_EQ,
1415  err_lv,
1416  cgen_state_->llInt(Executor::ERR_OUT_OF_TIME));
1417  } else {
1418  err_lv = ir_builder.CreateICmp(llvm::ICmpInst::ICMP_NE,
1419  err_lv,
1420  cgen_state_->llInt(static_cast<int32_t>(0)));
1421  }
1422  auto error_bb = llvm::BasicBlock::Create(
1423  cgen_state_->context_, ".error_exit", query_func, new_bb);
1424  llvm::ReturnInst::Create(cgen_state_->context_, error_bb);
1425  llvm::ReplaceInstWithInst(&br_instr,
1426  llvm::BranchInst::Create(error_bb, new_bb, err_lv));
1427  done_splitting = true;
1428  break;
1429  }
1430  }
1431  }
1432  CHECK(done_splitting);
1433 }
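// Resulting control flow (a sketch of the GPU path with the dynamic watchdog enabled):
// the block containing the row_process call is split roughly into
//
//   [bb]              %err = call i32 @row_process(...)
//                     br i1 %call_watchdog, %.watchdog_check, %.error_check
//   [.watchdog_check] %timeout = call i1 @dynamic_watchdog()
//                     select the timeout error code, br %.error_check
//   [.error_check]    %code = call i32 @record_error_code(%err, i32* %error_code)
//                     br i1 <error?>, %.error_exit, %<continuation>
//   [.error_exit]     ret
//
// so every row iteration reports its status through record_error_code before moving on.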
1434 
1435 std::vector<llvm::Value*> Executor::inlineHoistedLiterals() {
1436  std::vector<llvm::Value*> hoisted_literals;
1437 
1438  // row_func_ uses literals whose definitions have been hoisted up to query_func_;
1439  // extend the row_func_ signature with extra arguments to pass these literal values.
1440  std::vector<llvm::Type*> row_process_arg_types;
1441 
1442  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1443  E = cgen_state_->row_func_->arg_end();
1444  I != E;
1445  ++I) {
1446  row_process_arg_types.push_back(I->getType());
1447  }
1448 
1449  for (auto& element : cgen_state_->query_func_literal_loads_) {
1450  for (auto value : element.second) {
1451  row_process_arg_types.push_back(value->getType());
1452  }
1453  }
1454 
1455  auto ft = llvm::FunctionType::get(
1456  get_int_type(32, cgen_state_->context_), row_process_arg_types, false);
1457  auto row_func_with_hoisted_literals =
1458  llvm::Function::Create(ft,
1459  llvm::Function::ExternalLinkage,
1460  "row_func_hoisted_literals",
1461  cgen_state_->row_func_->getParent());
1462 
1463  // make sure it's inlined; we don't want register spills in the inner loop
1464  mark_function_always_inline(row_func_with_hoisted_literals);
1465 
1466  auto arg_it = row_func_with_hoisted_literals->arg_begin();
1467  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1468  E = cgen_state_->row_func_->arg_end();
1469  I != E;
1470  ++I) {
1471  if (I->hasName()) {
1472  arg_it->setName(I->getName());
1473  }
1474  ++arg_it;
1475  }
1476 
1477  std::unordered_map<int, std::vector<llvm::Value*>>
1478  query_func_literal_loads_function_arguments;
1479 
1480  for (auto& element : cgen_state_->query_func_literal_loads_) {
1481  std::vector<llvm::Value*> argument_values;
1482 
1483  for (auto value : element.second) {
1484  hoisted_literals.push_back(value);
1485  argument_values.push_back(&*arg_it);
1486  if (value->hasName()) {
1487  arg_it->setName("arg_" + value->getName());
1488  }
1489  ++arg_it;
1490  }
1491 
1492  query_func_literal_loads_function_arguments[element.first] = argument_values;
1493  }
1494 
1495  // copy the row_func function body over
1496  // see
1497  // https://stackoverflow.com/questions/12864106/move-function-body-avoiding-full-cloning/18751365
1498  row_func_with_hoisted_literals->getBasicBlockList().splice(
1499  row_func_with_hoisted_literals->begin(),
1500  cgen_state_->row_func_->getBasicBlockList());
1501 
1502  // also replace row_func arguments with the arguments from row_func_hoisted_literals
1503  for (llvm::Function::arg_iterator I = cgen_state_->row_func_->arg_begin(),
1504  E = cgen_state_->row_func_->arg_end(),
1505  I2 = row_func_with_hoisted_literals->arg_begin();
1506  I != E;
1507  ++I) {
1508  I->replaceAllUsesWith(&*I2);
1509  I2->takeName(&*I);
1510  ++I2;
1511  }
1512 
1513  cgen_state_->row_func_ = row_func_with_hoisted_literals;
1514 
1515  // and finally replace literal placeholders
1516  std::vector<llvm::Instruction*> placeholders;
1517  std::string prefix("__placeholder__literal_");
1518  for (auto it = llvm::inst_begin(row_func_with_hoisted_literals),
1519  e = llvm::inst_end(row_func_with_hoisted_literals);
1520  it != e;
1521  ++it) {
1522  if (it->hasName() && it->getName().startswith(prefix)) {
1523  auto offset_and_index_entry =
1524  cgen_state_->row_func_hoisted_literals_.find(llvm::dyn_cast<llvm::Value>(&*it));
1525  CHECK(offset_and_index_entry != cgen_state_->row_func_hoisted_literals_.end());
1526 
1527  int lit_off = offset_and_index_entry->second.offset_in_literal_buffer;
1528  int lit_idx = offset_and_index_entry->second.index_of_literal_load;
1529 
1530  it->replaceAllUsesWith(
1531  query_func_literal_loads_function_arguments[lit_off][lit_idx]);
1532  placeholders.push_back(&*it);
1533  }
1534  }
1535  for (auto placeholder : placeholders) {
1536  placeholder->removeFromParent();
1537  }
1538 
1539  return hoisted_literals;
1540 }
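
// The body-splice idiom used by inlineHoistedLiterals(), in isolation: create a
// function whose signature is the old one plus trailing literal arguments, move (not
// clone) the basic blocks over, then point every use of the old arguments at the new
// ones. A minimal sketch; the helper name (widen_row_func), the literal_types
// parameter and the "row_func_widened" name are illustrative, not part of this file.
llvm::Function* widen_row_func(llvm::Function* old_fn,
                               const std::vector<llvm::Type*>& literal_types) {
  std::vector<llvm::Type*> arg_types;
  for (auto& arg : old_fn->args()) {
    arg_types.push_back(arg.getType());
  }
  arg_types.insert(arg_types.end(), literal_types.begin(), literal_types.end());
  auto ft = llvm::FunctionType::get(old_fn->getReturnType(), arg_types, false);
  auto new_fn = llvm::Function::Create(
      ft, llvm::Function::ExternalLinkage, "row_func_widened", old_fn->getParent());
  // steal the body: splice moves the blocks, nothing is duplicated
  new_fn->getBasicBlockList().splice(new_fn->begin(), old_fn->getBasicBlockList());
  // rewire uses of the old arguments to the corresponding new ones
  auto new_arg = new_fn->arg_begin();
  for (auto& old_arg : old_fn->args()) {
    old_arg.replaceAllUsesWith(&*new_arg);
    new_arg->takeName(&old_arg);
    ++new_arg;
  }
  return new_fn;
}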
1541 
1542 namespace {
1543 
1544 // A small number of runtime functions don't get through CgenState::emitCall. List them
1545 // explicitly here and always clone their implementation from the runtime module.
1546 bool always_clone_runtime_function(const llvm::Function* func) {
1547  return func->getName() == "query_stub_hoisted_literals" ||
1548  func->getName() == "multifrag_query_hoisted_literals" ||
1549  func->getName() == "query_stub" || func->getName() == "multifrag_query" ||
1550  func->getName() == "fixed_width_int_decode" ||
1551  func->getName() == "fixed_width_unsigned_decode" ||
1552  func->getName() == "diff_fixed_width_int_decode" ||
1553  func->getName() == "fixed_width_double_decode" ||
1554  func->getName() == "fixed_width_float_decode" ||
1555  func->getName() == "fixed_width_small_date_decode" ||
1556  func->getName() == "record_error_code";
1557 }
1558 
1559 } // namespace
1560 
1561 std::tuple<Executor::CompilationResult, std::unique_ptr<QueryMemoryDescriptor>>
1562 Executor::compileWorkUnit(const std::vector<InputTableInfo>& query_infos,
1563  const RelAlgExecutionUnit& ra_exe_unit,
1564  const CompilationOptions& co,
1565  const ExecutionOptions& eo,
1566  const CudaMgr_Namespace::CudaMgr* cuda_mgr,
1567  const bool allow_lazy_fetch,
1568  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
1569  const size_t max_groups_buffer_entry_guess,
1570  const int8_t crt_min_byte_width,
1571  const bool has_cardinality_estimation,
1572  ColumnCacheMap& column_cache,
1573  RenderInfo* render_info) {
1574  nukeOldState(allow_lazy_fetch, query_infos, ra_exe_unit);
1575 
1576  GroupByAndAggregate group_by_and_aggregate(
1577  this, co.device_type_, ra_exe_unit, query_infos, row_set_mem_owner);
1578  auto query_mem_desc =
1579  group_by_and_aggregate.initQueryMemoryDescriptor(eo.allow_multifrag,
1580  max_groups_buffer_entry_guess,
1581  crt_min_byte_width,
1582  render_info,
1583  eo.output_columnar_hint);
1584 
1585  if (query_mem_desc->getQueryDescriptionType() ==
1586  QueryDescriptionType::GroupByBaselineHash &&
1587  !has_cardinality_estimation &&
1588  (!render_info || !render_info->isPotentialInSituRender()) && !eo.just_explain) {
1589  throw CardinalityEstimationRequired();
1590  }
1591 
1592  const bool output_columnar = query_mem_desc->didOutputColumnar();
1593 
1594  if (co.device_type_ == ExecutorDeviceType::GPU) {
1595  const size_t num_count_distinct_descs =
1596  query_mem_desc->getCountDistinctDescriptorsSize();
1597  for (size_t i = 0; i < num_count_distinct_descs; i++) {
1598  const auto& count_distinct_descriptor =
1599  query_mem_desc->getCountDistinctDescriptor(i);
1600  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::StdSet ||
1601  (count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid &&
1602  !co.hoist_literals_)) {
1603  throw QueryMustRunOnCpu();
1604  }
1605  }
1606  }
1607 
1608  // Read the module template and target either CPU or GPU
1609  // by binding the stream position functions to the right implementation:
1610  // stride access for GPU, contiguous for CPU
1611  auto rt_module_copy = llvm::CloneModule(
1612 #if LLVM_VERSION_MAJOR >= 7
1613  *g_rt_module.get(),
1614 #else
1615  g_rt_module.get(),
1616 #endif
1617  cgen_state_->vmap_,
1618  [](const llvm::GlobalValue* gv) {
1619  auto func = llvm::dyn_cast<llvm::Function>(gv);
1620  if (!func) {
1621  return true;
1622  }
1623  return (func->getLinkage() == llvm::GlobalValue::LinkageTypes::PrivateLinkage ||
1624  func->getLinkage() == llvm::GlobalValue::LinkageTypes::InternalLinkage ||
1625  always_clone_runtime_function(func));
1626  });
1627 
1628  if (co.device_type_ == ExecutorDeviceType::CPU) {
1629  if (is_udf_module_present(true)) {
1630  link_udf_module(udf_cpu_module, *rt_module_copy, cgen_state_.get());
1631  }
1632  if (is_rt_udf_module_present(true)) {
1633  link_udf_module(rt_udf_cpu_module, *rt_module_copy, cgen_state_.get());
1634  }
1635  } else {
1636  rt_module_copy->setDataLayout(get_gpu_data_layout());
1637  rt_module_copy->setTargetTriple(get_gpu_target_triple_string());
1638  if (is_udf_module_present()) {
1639  link_udf_module(udf_gpu_module, *rt_module_copy, cgen_state_.get());
1640  }
1641  if (is_rt_udf_module_present()) {
1642  link_udf_module(rt_udf_gpu_module, *rt_module_copy, cgen_state_.get());
1643  }
1644  }
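  // link_udf_module() merges the session's UDF IR into this fresh runtime-module copy
  // via LLVM's module linker, so UDF symbols become ordinary definitions visible to
  // the inlining and codegen passes that run later.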
1645 
1646  cgen_state_->module_ = rt_module_copy.release();
1647 
1648  auto agg_fnames =
1649  get_agg_fnames(ra_exe_unit.target_exprs, !ra_exe_unit.groupby_exprs.empty());
1650 
1651  const auto agg_slot_count = ra_exe_unit.estimator ? size_t(1) : agg_fnames.size();
1652 
1653  const bool is_group_by{query_mem_desc->isGroupBy()};
1654  auto query_func = is_group_by ? query_group_by_template(cgen_state_->module_,
1655  co.hoist_literals_,
1656  *query_mem_desc,
1657  co.device_type_,
1658  ra_exe_unit.scan_limit)
1659  : query_template(cgen_state_->module_,
1660  agg_slot_count,
1661  co.hoist_literals_,
1662  !!ra_exe_unit.estimator);
1663  bind_pos_placeholders("pos_start", true, query_func, cgen_state_->module_);
1664  bind_pos_placeholders("group_buff_idx", false, query_func, cgen_state_->module_);
1665  bind_pos_placeholders("pos_step", false, query_func, cgen_state_->module_);
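  // The placeholders bound above resolve to per-device implementations from the
  // runtime module: on the GPU the position functions yield a grid-stride iteration
  // (roughly start = blockIdx.x * blockDim.x + threadIdx.x, step = blockDim.x *
  // gridDim.x), on the CPU a contiguous scan (start = 0, or the saved row index when
  // resuming, step = 1). The exact bodies live in the runtime modules, not here.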
1666 
1667  cgen_state_->query_func_ = query_func;
1668  cgen_state_->query_func_entry_ir_builder_.SetInsertPoint(
1669  &query_func->getEntryBlock().front());
1670 
1671  std::vector<llvm::Value*> col_heads;
1672  std::tie(cgen_state_->row_func_, col_heads) =
1673  create_row_function(ra_exe_unit.input_col_descs.size(),
1674  is_group_by ? 0 : agg_slot_count,
1675  co.hoist_literals_,
1676  query_func,
1677  cgen_state_->module_,
1678  cgen_state_->context_);
1679  CHECK(cgen_state_->row_func_);
1680  // make sure it's inlined; we don't want register spills in the inner loop
1681  mark_function_always_inline(cgen_state_->row_func_);
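  // mark_function_always_inline() is expected to set llvm::Attribute::AlwaysInline,
  // which is what lets the AlwaysInliner pass added in optimize_ir() fold the row
  // function into its caller.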
1682  auto bb =
1683  llvm::BasicBlock::Create(cgen_state_->context_, "entry", cgen_state_->row_func_);
1684  cgen_state_->ir_builder_.SetInsertPoint(bb);
1685  preloadFragOffsets(ra_exe_unit.input_descs, query_infos);
1686  RelAlgExecutionUnit body_execution_unit = ra_exe_unit;
1687  const auto join_loops =
1688  buildJoinLoops(body_execution_unit, co, eo, query_infos, column_cache);
1689  plan_state_->allocateLocalColumnIds(ra_exe_unit.input_col_descs);
1690  const auto is_not_deleted_bb = codegenSkipDeletedOuterTableRow(ra_exe_unit, co);
1691  if (is_not_deleted_bb) {
1692  bb = is_not_deleted_bb;
1693  }
1694  if (!join_loops.empty()) {
1695  codegenJoinLoops(join_loops,
1696  body_execution_unit,
1697  group_by_and_aggregate,
1698  query_func,
1699  bb,
1700  *(query_mem_desc.get()),
1701  co,
1702  eo);
1703  } else {
1704  const bool can_return_error =
1705  compileBody(ra_exe_unit, group_by_and_aggregate, *query_mem_desc, co);
1706  if (can_return_error || cgen_state_->needs_error_check_ || eo.with_dynamic_watchdog) {
1707  createErrorCheckControlFlow(query_func, eo.with_dynamic_watchdog, co.device_type_);
1708  }
1709  }
1710  std::vector<llvm::Value*> hoisted_literals;
1711 
1712  if (co.hoist_literals_) {
1713  VLOG(1) << "number of hoisted literals: "
1714  << cgen_state_->query_func_literal_loads_.size()
1715  << " / literal buffer usage: " << cgen_state_->getLiteralBufferUsage(0)
1716  << " bytes";
1717  }
1718 
1719  if (co.hoist_literals_ && !cgen_state_->query_func_literal_loads_.empty()) {
1720  // we have some hoisted literals...
1721  hoisted_literals = inlineHoistedLiterals();
1722  }
1723  // iterate through all the instructions in the query template function and
1724  // replace the call to the filter placeholder with the call to the actual filter
1725  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
1726  ++it) {
1727  if (!llvm::isa<llvm::CallInst>(*it)) {
1728  continue;
1729  }
1730  auto& filter_call = llvm::cast<llvm::CallInst>(*it);
1731  if (std::string(filter_call.getCalledFunction()->getName()) == "row_process") {
1732  std::vector<llvm::Value*> args;
1733  for (size_t i = 0; i < filter_call.getNumArgOperands(); ++i) {
1734  args.push_back(filter_call.getArgOperand(i));
1735  }
1736  args.insert(args.end(), col_heads.begin(), col_heads.end());
1737  args.push_back(get_arg_by_name(query_func, "join_hash_tables"));
1738  // push hoisted literals arguments, if any
1739  args.insert(args.end(), hoisted_literals.begin(), hoisted_literals.end());
1740 
1741  llvm::ReplaceInstWithInst(&filter_call,
1742  llvm::CallInst::Create(cgen_state_->row_func_, args, ""));
1743  break;
1744  }
1745  }
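  // After this rewrite, the template's generic row_process(...) call has become a call
  // to the generated row function: the original arguments, followed by the column base
  // pointers (col_heads), the join hash tables argument and, when literals are hoisted,
  // one trailing argument per hoisted literal (matching the signature built in
  // inlineHoistedLiterals()).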
1746  plan_state_->init_agg_vals_ =
1747  init_agg_val_vec(ra_exe_unit.target_exprs, ra_exe_unit.quals, *query_mem_desc);
1748 
1749  auto multifrag_query_func = cgen_state_->module_->getFunction(
1750  "multifrag_query" + std::string(co.hoist_literals_ ? "_hoisted_literals" : ""));
1751  CHECK(multifrag_query_func);
1752 
1753  bind_query(query_func,
1754  "query_stub" + std::string(co.hoist_literals_ ? "_hoisted_literals" : ""),
1755  multifrag_query_func,
1756  cgen_state_->module_);
1757 
1758  auto live_funcs =
1759  CodeGenerator::markDeadRuntimeFuncs(*cgen_state_->module_,
1760  {query_func, cgen_state_->row_func_},
1761  {multifrag_query_func});
1762 
1763  std::string llvm_ir;
1764  if (eo.just_explain) {
1765  if (co.explain_type_ == ExecutorExplainType::Optimized) {
1766 #ifdef WITH_JIT_DEBUG
1767  throw std::runtime_error(
1768  "Explain optimized not available when JIT runtime debug symbols are enabled");
1769 #else
1770  optimize_ir(query_func, cgen_state_->module_, live_funcs, co);
1771 #endif // WITH_JIT_DEBUG
1772  }
1773  llvm_ir =
1774  serialize_llvm_object(query_func) + serialize_llvm_object(cgen_state_->row_func_);
1775  }
1776  verify_function_ir(cgen_state_->row_func_);
1777 
1778  LOG(IR) << query_mem_desc->toString() << "\nGenerated IR\n"
1779  << serialize_llvm_object(query_func)
1780  << serialize_llvm_object(cgen_state_->row_func_) << "\nEnd of IR";
1781 
1782  return std::make_tuple(
1783  CompilationResult{
1784  co.device_type_ == ExecutorDeviceType::CPU
1785  ? optimizeAndCodegenCPU(query_func, multifrag_query_func, live_funcs, co)
1786  : optimizeAndCodegenGPU(query_func,
1787  multifrag_query_func,
1788  live_funcs,
1789  is_group_by || ra_exe_unit.estimator,
1790  cuda_mgr,
1791  co),
1792  cgen_state_->getLiterals(),
1793  output_columnar,
1794  llvm_ir},
1795  std::move(query_mem_desc));
1796 }
1797 
1798 llvm::BasicBlock* Executor::codegenSkipDeletedOuterTableRow(
1799  const RelAlgExecutionUnit& ra_exe_unit,
1800  const CompilationOptions& co) {
1801  CHECK(!ra_exe_unit.input_descs.empty());
1802  const auto& outer_input_desc = ra_exe_unit.input_descs[0];
1803  if (outer_input_desc.getSourceType() != InputSourceType::TABLE) {
1804  return nullptr;
1805  }
1806  const auto td = catalog_->getMetadataForTable(outer_input_desc.getTableId());
1807  CHECK(td);
1808  const auto deleted_cd = catalog_->getDeletedColumnIfRowsDeleted(td);
1809  if (!deleted_cd) {
1810  return nullptr;
1811  }
1812  CHECK(deleted_cd->columnType.is_boolean());
1813  const auto deleted_expr =
1814  makeExpr<Analyzer::ColumnVar>(deleted_cd->columnType,
1815  outer_input_desc.getTableId(),
1816  deleted_cd->columnId,
1817  outer_input_desc.getNestLevel());
1818  CodeGenerator code_generator(this);
1819  const auto is_deleted =
1820  code_generator.toBool(code_generator.codegen(deleted_expr.get(), true, co).front());
1821  const auto is_deleted_bb = llvm::BasicBlock::Create(
1822  cgen_state_->context_, "is_deleted", cgen_state_->row_func_);
1823  llvm::BasicBlock* bb = llvm::BasicBlock::Create(
1824  cgen_state_->context_, "is_not_deleted", cgen_state_->row_func_);
1825  cgen_state_->ir_builder_.CreateCondBr(is_deleted, is_deleted_bb, bb);
1826  cgen_state_->ir_builder_.SetInsertPoint(is_deleted_bb);
1827  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt<int32_t>(0));
1828  cgen_state_->ir_builder_.SetInsertPoint(bb);
1829  return bb;
1830 }
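
// Control flow emitted above at the top of the row function, roughly:
//
//   %is_deleted = <boolean value of the table's hidden delete column>
//   br i1 %is_deleted, label %is_deleted_bb, label %is_not_deleted
// is_deleted_bb:
//   ret i32 0                         ; skip this row entirely
// is_not_deleted:
//   ...filter and aggregate code continues here...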
1831 
1832 bool Executor::compileBody(const RelAlgExecutionUnit& ra_exe_unit,
1833  GroupByAndAggregate& group_by_and_aggregate,
1834  const QueryMemoryDescriptor& query_mem_desc,
1835  const CompilationOptions& co) {
1836  // generate the code for the filter
1837  std::vector<Analyzer::Expr*> primary_quals;
1838  std::vector<Analyzer::Expr*> deferred_quals;
1839  bool short_circuited =
1840  CodeGenerator::prioritizeQuals(ra_exe_unit, primary_quals, deferred_quals);
1841  if (short_circuited) {
1842  VLOG(1) << "Prioritized " << std::to_string(primary_quals.size()) << " quals, "
1843  << "short-circuited and deferred " << std::to_string(deferred_quals.size())
1844  << " quals";
1845  }
1846  llvm::Value* filter_lv = cgen_state_->llBool(true);
1847  CodeGenerator code_generator(this);
1848  for (auto expr : primary_quals) {
1849  // Generate the filter for primary quals
1850  auto cond = code_generator.toBool(code_generator.codegen(expr, true, co).front());
1851  filter_lv = cgen_state_->ir_builder_.CreateAnd(filter_lv, cond);
1852  }
1853  CHECK(filter_lv->getType()->isIntegerTy(1));
1854  llvm::BasicBlock* sc_false{nullptr};
1855  if (!deferred_quals.empty()) {
1856  auto sc_true = llvm::BasicBlock::Create(
1857  cgen_state_->context_, "sc_true", cgen_state_->row_func_);
1858  sc_false = llvm::BasicBlock::Create(
1859  cgen_state_->context_, "sc_false", cgen_state_->row_func_);
1860  cgen_state_->ir_builder_.CreateCondBr(filter_lv, sc_true, sc_false);
1861  cgen_state_->ir_builder_.SetInsertPoint(sc_false);
1862  if (ra_exe_unit.join_quals.empty()) {
1863  cgen_state_->ir_builder_.CreateRet(cgen_state_->llInt(int32_t(0)));
1864  }
1865  cgen_state_->ir_builder_.SetInsertPoint(sc_true);
1866  filter_lv = cgen_state_->llBool(true);
1867  }
1868  for (auto expr : deferred_quals) {
1869  filter_lv = cgen_state_->ir_builder_.CreateAnd(
1870  filter_lv, code_generator.toBool(code_generator.codegen(expr, true, co).front()));
1871  }
1872 
1873  CHECK(filter_lv->getType()->isIntegerTy(1));
1874  return group_by_and_aggregate.codegen(filter_lv, sc_false, query_mem_desc, co);
1875 }
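
// Shape of the filter generated above when prioritizeQuals() defers some quals,
// roughly:
//
//   entry:     %filter = AND of the primary (cheap) quals
//              br i1 %filter, label %sc_true, label %sc_false
//   sc_false:  ret i32 0              ; short-circuit exit (only when there are no join quals)
//   sc_true:   %filter2 = AND of the deferred (expensive) quals
//              ; group_by_and_aggregate.codegen() consumes %filter2 and %sc_false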