OmniSciDB  72c90bc290
anonymous_namespace{NativeCodegen.cpp} Namespace Reference

Classes

class  CaseExprDetector
 

Functions

void throw_parseIR_error (const llvm::SMDiagnostic &parse_error, std::string src="", const bool is_gpu=false)
 
template<typename T = void>
void show_defined (llvm::Module &llvm_module)
 
template<typename T = void>
void show_defined (llvm::Module *llvm_module)
 
template<typename T = void>
void show_defined (std::unique_ptr< llvm::Module > &llvm_module)
 
template<typename T = void>
void scan_function_calls (llvm::Function &F, std::unordered_set< std::string > &defined, std::unordered_set< std::string > &undefined, const std::unordered_set< std::string > &ignored)
 
template<typename T = void>
void scan_function_calls (llvm::Module &llvm_module, std::unordered_set< std::string > &defined, std::unordered_set< std::string > &undefined, const std::unordered_set< std::string > &ignored)
 
template<typename T = void>
std::tuple< std::unordered_set< std::string >, std::unordered_set< std::string > > scan_function_calls (llvm::Module &llvm_module, const std::unordered_set< std::string > &ignored={})
 
void eliminate_dead_self_recursive_funcs (llvm::Module &M, const std::unordered_set< llvm::Function * > &live_funcs)
 
void optimize_ir (llvm::Function *query_func, llvm::Module *llvm_module, llvm::legacy::PassManager &pass_manager, const std::unordered_set< llvm::Function * > &live_funcs, const bool is_gpu_smem_used, const CompilationOptions &co)
 
std::string assemblyForCPU (ExecutionEngineWrapper &execution_engine, llvm::Module *llvm_module)
 
ExecutionEngineWrapper create_execution_engine (llvm::Module *llvm_module, llvm::EngineBuilder &eb, const CompilationOptions &co)
 
std::string cpp_to_llvm_name (const std::string &s)
 
std::string gen_array_any_all_sigs ()
 
std::string gen_translate_null_key_sigs ()
 
void bind_pos_placeholders (const std::string &pos_fn_name, const bool use_resume_param, llvm::Function *query_func, llvm::Module *llvm_module)
 
void set_row_func_argnames (llvm::Function *row_func, const size_t in_col_count, const size_t agg_col_count, const bool hoist_literals)
 
llvm::Function * create_row_function (const size_t in_col_count, const size_t agg_col_count, const bool hoist_literals, llvm::Module *llvm_module, llvm::LLVMContext &context)
 
void bind_query (llvm::Function *query_func, const std::string &query_fname, llvm::Function *multifrag_query_func, llvm::Module *llvm_module)
 
std::vector< std::string > get_agg_fnames (const std::vector< Analyzer::Expr * > &target_exprs, const bool is_group_by)
 
template<typename InstType >
llvm::Value * find_variable_in_basic_block (llvm::Function *func, std::string bb_name, std::string variable_name)
 
size_t get_shared_memory_size (const bool shared_mem_used, const QueryMemoryDescriptor *query_mem_desc_ptr)
 
bool has_count_expr (RelAlgExecutionUnit const &ra_exe_unit)
 
bool has_case_expr_within_groupby_expr (RelAlgExecutionUnit const &ra_exe_unit)
 
bool is_gpu_shared_mem_supported (const QueryMemoryDescriptor *query_mem_desc_ptr, const RelAlgExecutionUnit &ra_exe_unit, const CudaMgr_Namespace::CudaMgr *cuda_mgr, const ExecutorDeviceType device_type, const unsigned cuda_blocksize, const unsigned num_blocks_per_mp)
 
std::string serialize_llvm_metadata_footnotes (llvm::Function *query_func, CgenState *cgen_state)
 

Variables

const std::string cuda_rt_decls
 

Function Documentation

std::string anonymous_namespace{NativeCodegen.cpp}::assemblyForCPU ( ExecutionEngineWrapper &  execution_engine,
llvm::Module *  llvm_module 
)

Definition at line 400 of file NativeCodegen.cpp.

References CHECK.

Referenced by create_execution_engine().

401  {
402  llvm::legacy::PassManager pass_manager;
403  auto cpu_target_machine = execution_engine->getTargetMachine();
404  CHECK(cpu_target_machine);
405  llvm::SmallString<256> code_str;
406  llvm::raw_svector_ostream os(code_str);
407 #if LLVM_VERSION_MAJOR >= 10
408  cpu_target_machine->addPassesToEmitFile(
409  pass_manager, os, nullptr, llvm::CGFT_AssemblyFile);
410 #else
411  cpu_target_machine->addPassesToEmitFile(
412  pass_manager, os, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
413 #endif
414  pass_manager.run(*llvm_module);
415  return "Assembly for the CPU:\n" + std::string(code_str.str()) + "\nEnd of assembly";
416 }

void anonymous_namespace{NativeCodegen.cpp}::bind_pos_placeholders ( const std::string &  pos_fn_name,
const bool  use_resume_param,
llvm::Function *  query_func,
llvm::Module *  llvm_module 
)

Definition at line 1660 of file NativeCodegen.cpp.

1663  {
1664  for (auto it = llvm::inst_begin(query_func), e = llvm::inst_end(query_func); it != e;
1665  ++it) {
1666  if (!llvm::isa<llvm::CallInst>(*it)) {
1667  continue;
1668  }
1669  auto& pos_call = llvm::cast<llvm::CallInst>(*it);
1670  auto const func_name = CodegenUtil::getCalledFunctionName(pos_call);
1671  if (func_name && *func_name == pos_fn_name) {
1672  if (use_resume_param) {
1673  auto* const row_index_resume = get_arg_by_name(query_func, "row_index_resume");
1674  llvm::ReplaceInstWithInst(
1675  &pos_call,
1676  llvm::CallInst::Create(llvm_module->getFunction(pos_fn_name + "_impl"),
1677  row_index_resume));
1678  } else {
1679  llvm::ReplaceInstWithInst(
1680  &pos_call,
1681  llvm::CallInst::Create(llvm_module->getFunction(pos_fn_name + "_impl")));
1682  }
1683  break;
1684  }
1685  }
1686 }
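A minimal sketch of the rewrite this performs, assuming pos_fn_name is "pos_start" (the IR shown is illustrative):

  // Before: the query template contains a placeholder call
  //     %1 = call i32 @pos_start()
  // After: the placeholder is replaced in place by the implementation
  //     %1 = call i32 @pos_start_impl(i32* %row_index_resume)   (use_resume_param == true)
  //     %1 = call i32 @pos_start_impl()                         (otherwise)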
void anonymous_namespace{NativeCodegen.cpp}::bind_query ( llvm::Function *  query_func,
const std::string &  query_fname,
llvm::Function *  multifrag_query_func,
llvm::Module *  llvm_module 
)

Definition at line 1810 of file NativeCodegen.cpp.

1813  {
1814  std::vector<llvm::CallInst*> query_stubs;
1815  for (auto it = llvm::inst_begin(multifrag_query_func),
1816  e = llvm::inst_end(multifrag_query_func);
1817  it != e;
1818  ++it) {
1819  if (!llvm::isa<llvm::CallInst>(*it)) {
1820  continue;
1821  }
1822  auto& query_call = llvm::cast<llvm::CallInst>(*it);
1823  auto const call_func_name = CodegenUtil::getCalledFunctionName(query_call);
1824  if (call_func_name && *call_func_name == query_fname) {
1825  query_stubs.push_back(&query_call);
1826  }
1827  }
1828  for (auto& S : query_stubs) {
1829  std::vector<llvm::Value*> args;
1830  for (size_t i = 0; i < S->getNumOperands() - 1; ++i) {
1831  args.push_back(S->getArgOperand(i));
1832  }
1833  llvm::ReplaceInstWithInst(S, llvm::CallInst::Create(query_func, args, ""));
1834  }
1835 }
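The operand copy stops at getNumOperands() - 1 because the final operand of a llvm::CallInst is the callee itself. A minimal sketch of the rewrite, with illustrative stub and argument names:

  // Before: the multifrag kernel calls a stub whose name matches query_fname
  //     %err = call i32 @query_stub(i8** %byte_stream, i64* %row_counts, i32* %error_code)
  // After: the same arguments are forwarded to the real query function
  //     %err = call i32 @query_func(i8** %byte_stream, i64* %row_counts, i32* %error_code)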
std::string anonymous_namespace{NativeCodegen.cpp}::cpp_to_llvm_name ( const std::string &  s)

Definition at line 593 of file NativeCodegen.cpp.

References CHECK.

Referenced by gen_array_any_all_sigs(), and gen_translate_null_key_sigs().

593  {
594  if (s == "int8_t") {
595  return "i8";
596  }
597  if (s == "int16_t") {
598  return "i16";
599  }
600  if (s == "int32_t") {
601  return "i32";
602  }
603  if (s == "int64_t") {
604  return "i64";
605  }
606  CHECK(s == "float" || s == "double");
607  return s;
608 }
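For example:

  cpp_to_llvm_name("int16_t");  // returns "i16"
  cpp_to_llvm_name("double");   // returns "double"; float and double pass through unchanged
  cpp_to_llvm_name("bool");     // trips the CHECK; only the types listed above are accepted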

ExecutionEngineWrapper anonymous_namespace{NativeCodegen.cpp}::create_execution_engine ( llvm::Module *  llvm_module,
llvm::EngineBuilder &  eb,
const CompilationOptions &  co 
)

Definition at line 418 of file NativeCodegen.cpp.

References logger::ASM, assemblyForCPU(), CHECK, DEBUG_TIMER, and LOG.

Referenced by CodeGenerator::generateNativeCPUCode().

420  {
421  auto timer = DEBUG_TIMER(__func__);
422  ExecutionEngineWrapper execution_engine(eb.create(), co);
423  CHECK(execution_engine.get());
424  // Force the module data layout to match the layout for the selected target
425  llvm_module->setDataLayout(execution_engine->getDataLayout());
426 
427  LOG(ASM) << assemblyForCPU(execution_engine, llvm_module);
428 
429  execution_engine->finalizeObject();
430  return execution_engine;
431 }

llvm::Function* anonymous_namespace{NativeCodegen.cpp}::create_row_function ( const size_t  in_col_count,
const size_t  agg_col_count,
const bool  hoist_literals,
llvm::Module *  llvm_module,
llvm::LLVMContext &  context 
)

Definition at line 1741 of file NativeCodegen.cpp.

1745  {
1746  std::vector<llvm::Type*> row_process_arg_types;
1747 
1748  if (agg_col_count) {
1749  // output (aggregate) arguments
1750  for (size_t i = 0; i < agg_col_count; ++i) {
1751  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1752  }
1753  } else {
1754  // group by buffer
1755  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1756  // varlen output buffer
1757  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1758  // current match count
1759  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1760  // total match count passed from the caller
1761  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1762  // old total match count returned to the caller
1763  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1764  // max matched (total number of slots in the output buffer)
1765  row_process_arg_types.push_back(llvm::Type::getInt32PtrTy(context));
1766  }
1767 
1768  // aggregate init values
1769  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1770 
1771  // position argument
1772  row_process_arg_types.push_back(llvm::Type::getInt64Ty(context));
1773 
1774  // fragment row offset argument
1775  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1776 
1777  // number of rows for each scan
1778  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1779 
1780  // literals buffer argument
1781  if (hoist_literals) {
1782  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
1783  }
1784 
1785  // column buffer arguments
1786  for (size_t i = 0; i < in_col_count; ++i) {
1787  row_process_arg_types.emplace_back(llvm::Type::getInt8PtrTy(context));
1788  }
1789 
1790  // join hash table argument
1791  row_process_arg_types.push_back(llvm::Type::getInt64PtrTy(context));
1792 
1793  // row function manager
1794  row_process_arg_types.push_back(llvm::Type::getInt8PtrTy(context));
1795 
1796  // generate the function
1797  auto ft =
1798  llvm::FunctionType::get(get_int_type(32, context), row_process_arg_types, false);
1799 
1800  auto row_func = llvm::Function::Create(
1801  ft, llvm::Function::ExternalLinkage, "row_func", llvm_module);
1802 
1803  // set the row function argument names; for debugging purposes only
1804  set_row_func_argnames(row_func, in_col_count, agg_col_count, hoist_literals);
1805 
1806  return row_func;
1807 }
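For illustration, with agg_col_count = 1, in_col_count = 2, and hoist_literals = true, the generated function (argument names applied afterwards by set_row_func_argnames()) would look like:

  // i32 @row_func(i64* %out, i64* %agg_init_val, i64 %pos, i64* %frag_row_off,
  //               i64* %num_rows_per_scan, i8* %literals, i8* %col_buf0,
  //               i8* %col_buf1, i64* %join_hash_tables, i8* %row_func_mgr)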
void anonymous_namespace{NativeCodegen.cpp}::eliminate_dead_self_recursive_funcs ( llvm::Module &  M,
const std::unordered_set< llvm::Function * > &  live_funcs 
)

Definition at line 244 of file NativeCodegen.cpp.

Referenced by optimize_ir().

246  {
247  std::vector<llvm::Function*> dead_funcs;
248  for (auto& F : M) {
249  bool bAlive = false;
250  if (live_funcs.count(&F)) {
251  continue;
252  }
253  for (auto U : F.users()) {
254  auto* C = llvm::dyn_cast<const llvm::CallInst>(U);
255  if (!C || C->getParent()->getParent() != &F) {
256  bAlive = true;
257  break;
258  }
259  }
260  if (!bAlive) {
261  dead_funcs.push_back(&F);
262  }
263  }
264  for (auto pFn : dead_funcs) {
265  pFn->eraseFromParent();
266  }
267 }

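Illustrative scenario (hypothetical function name): after inlining, a helper @fib may survive in the module with its only remaining user being the recursive call inside its own body; since it is not in live_funcs and has no other callers, it is erased. A function with at least one call from another function, or any non-call use, is kept.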

template<typename InstType >
llvm::Value* anonymous_namespace{NativeCodegen.cpp}::find_variable_in_basic_block ( llvm::Function *  func,
std::string  bb_name,
std::string  variable_name 
)

Definition at line 2007 of file NativeCodegen.cpp.

2009  {
2010  llvm::Value* result = nullptr;
2011  if (func == nullptr || variable_name.empty()) {
2012  return result;
2013  }
2014  bool is_found = false;
2015  for (auto bb_it = func->begin(); bb_it != func->end() && !is_found; ++bb_it) {
2016  if (!bb_name.empty() && bb_it->getName() != bb_name) {
2017  continue;
2018  }
2019  for (auto inst_it = bb_it->begin(); inst_it != bb_it->end(); inst_it++) {
2020  if (llvm::isa<InstType>(*inst_it)) {
2021  if (inst_it->getName() == variable_name) {
2022  result = &*inst_it;
2023  is_found = true;
2024  break;
2025  }
2026  }
2027  }
2028  }
2029  return result;
2030 }
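A hedged usage sketch (the block and variable names are illustrative, not fixed by this code):

  // Locate a load instruction named "pos" inside the basic block ".entry";
  // returns nullptr if func is null, the name is empty, or nothing matches.
  auto* pos_val =
      find_variable_in_basic_block<llvm::LoadInst>(query_func, ".entry", "pos");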
std::string anonymous_namespace{NativeCodegen.cpp}::gen_array_any_all_sigs ( )

Definition at line 610 of file NativeCodegen.cpp.

References cpp_to_llvm_name().

610  {
611  std::string result;
612  for (const std::string any_or_all : {"any", "all"}) {
613  for (const std::string elem_type :
614  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
615  for (const std::string needle_type :
616  {"int8_t", "int16_t", "int32_t", "int64_t", "float", "double"}) {
617  for (const std::string op_name : {"eq", "ne", "lt", "le", "gt", "ge"}) {
618  result += ("declare i1 @array_" + any_or_all + "_" + op_name + "_" + elem_type +
619  "_" + needle_type + "(i8*, i64, " + cpp_to_llvm_name(needle_type) +
620  ", " + cpp_to_llvm_name(elem_type) + ");\n");
621  }
622  }
623  }
624  }
625  return result;
626 }
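One line of the generated output, taking any_or_all = "any", op_name = "eq", elem_type = "int32_t", and needle_type = "double" (note that the needle's LLVM type precedes the element's in the parameter list):

  declare i1 @array_any_eq_int32_t_double(i8*, i64, double, i32);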

std::string anonymous_namespace{NativeCodegen.cpp}::gen_translate_null_key_sigs ( )

Definition at line 628 of file NativeCodegen.cpp.

References cpp_to_llvm_name().

628  {
629  std::string result;
630  for (const std::string key_type : {"int8_t", "int16_t", "int32_t", "int64_t"}) {
631  const auto key_llvm_type = cpp_to_llvm_name(key_type);
632  result += "declare i64 @translate_null_key_" + key_type + "(" + key_llvm_type + ", " +
633  key_llvm_type + ", i64);\n";
634  }
635  return result;
636 }
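For key_type = "int16_t", the generated line reads:

  declare i64 @translate_null_key_int16_t(i16, i16, i64);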

std::vector<std::string> anonymous_namespace{NativeCodegen.cpp}::get_agg_fnames ( const std::vector< Analyzer::Expr * > &  target_exprs,
const bool  is_group_by 
)

Definition at line 1837 of file NativeCodegen.cpp.

1838  {
1839  std::vector<std::string> result;
1840  for (size_t target_idx = 0, agg_col_idx = 0; target_idx < target_exprs.size();
1841  ++target_idx, ++agg_col_idx) {
1842  const auto target_expr = target_exprs[target_idx];
1843  CHECK(target_expr);
1844  const auto target_type_info = target_expr->get_type_info();
1845  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
1846  const bool is_varlen =
1847  (target_type_info.is_string() &&
1848  target_type_info.get_compression() == kENCODING_NONE) ||
1849  target_type_info.is_array(); // TODO: should it use is_varlen_array() ?
1850  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
1851  result.emplace_back(target_type_info.is_fp() ? "agg_id_double" : "agg_id");
1852  if (is_varlen) {
1853  result.emplace_back("agg_id");
1854  }
1855  if (target_type_info.is_geometry()) {
1856  result.emplace_back("agg_id");
1857  for (auto i = 2; i < 2 * target_type_info.get_physical_coord_cols(); ++i) {
1858  result.emplace_back("agg_id");
1859  }
1860  }
1861  continue;
1862  }
1863  const auto agg_type = agg_expr->get_aggtype();
1864  SQLTypeInfo agg_type_info;
1865  switch (agg_type) {
1866  case kCOUNT:
1867  case kCOUNT_IF:
1868  agg_type_info = target_type_info;
1869  break;
1870  default:
1871  agg_type_info = agg_expr->get_arg()->get_type_info();
1872  break;
1873  }
1874  switch (agg_type) {
1875  case kAVG: {
1876  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1877  !agg_type_info.is_fp()) {
1878  throw std::runtime_error("AVG is only valid on integer and floating point");
1879  }
1880  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1881  ? "agg_sum"
1882  : "agg_sum_double");
1883  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1884  ? "agg_count"
1885  : "agg_count_double");
1886  break;
1887  }
1888  case kMIN: {
1889  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1890  agg_type_info.is_geometry()) {
1891  throw std::runtime_error(
1892  "MIN on strings, arrays or geospatial types not supported yet");
1893  }
1894  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1895  ? "agg_min"
1896  : "agg_min_double");
1897  break;
1898  }
1899  case kMAX: {
1900  if (agg_type_info.is_string() || agg_type_info.is_array() ||
1901  agg_type_info.is_geometry()) {
1902  throw std::runtime_error(
1903  "MAX on strings, arrays or geospatial types not supported yet");
1904  }
1905  result.emplace_back((agg_type_info.is_integer() || agg_type_info.is_time())
1906  ? "agg_max"
1907  : "agg_max_double");
1908  break;
1909  }
1910  case kSUM:
1911  case kSUM_IF: {
1912  if (!agg_type_info.is_integer() && !agg_type_info.is_decimal() &&
1913  !agg_type_info.is_fp()) {
1914  throw std::runtime_error(
1915  "SUM and SUM_IF is only valid on integer and floating point");
1916  }
1917  std::string func_name = (agg_type_info.is_integer() || agg_type_info.is_time())
1918  ? "agg_sum"
1919  : "agg_sum_double";
1920  if (agg_type == kSUM_IF) {
1921  func_name += "_if";
1922  }
1923  result.emplace_back(func_name);
1924  break;
1925  }
1926  case kCOUNT:
1927  result.emplace_back(agg_expr->get_is_distinct() ? "agg_count_distinct"
1928  : "agg_count");
1929  break;
1930  case kCOUNT_IF:
1931  result.emplace_back("agg_count_if");
1932  break;
1933  case kSINGLE_VALUE: {
1934  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1935  break;
1936  }
1937  case kSAMPLE: {
1938  // Note that varlen SAMPLE arguments are handled separately above
1939  result.emplace_back(agg_type_info.is_fp() ? "agg_id_double" : "agg_id");
1940  break;
1941  }
1942  case kAPPROX_COUNT_DISTINCT:
1943  result.emplace_back("agg_approximate_count_distinct");
1944  break;
1945  case kAPPROX_QUANTILE:
1946  result.emplace_back("agg_approx_quantile");
1947  break;
1948  case kMODE:
1949  result.emplace_back("agg_mode_func");
1950  break;
1951  default:
1952  UNREACHABLE() << "Unsupported agg_type: " << agg_type;
1953  }
1954  }
1955  return result;
1956 }
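For illustration, given the target expressions COUNT(*) and AVG(x) with x an integer column, the returned names would be {"agg_count", "agg_sum", "agg_count"}: COUNT maps to a single function, while AVG expands into a sum/count pair.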
size_t anonymous_namespace{NativeCodegen.cpp}::get_shared_memory_size ( const bool  shared_mem_used,
const QueryMemoryDescriptor *  query_mem_desc_ptr 
)

Definition at line 2581 of file NativeCodegen.cpp.

2582  {
2583  return shared_mem_used
2584  ? (query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount())
2585  : 0;
2586 }
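For example, with an 8-byte row size and 2048 entries, a query that uses shared memory reserves 8 * 2048 = 16384 bytes (16 KiB); when shared_mem_used is false the function returns 0.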
bool anonymous_namespace{NativeCodegen.cpp}::has_case_expr_within_groupby_expr ( RelAlgExecutionUnit const &  ra_exe_unit)

Definition at line 2618 of file NativeCodegen.cpp.

2618  {
2619  if (ra_exe_unit.groupby_exprs.empty() || !ra_exe_unit.groupby_exprs.front()) {
2620  return false;
2621  }
2622  CaseExprDetector detector;
2623  for (auto expr : ra_exe_unit.groupby_exprs) {
2624  if (detector.detectCaseExpr(expr.get())) {
2625  return true;
2626  }
2627  }
2628  return false;
2629 }
bool anonymous_namespace{NativeCodegen.cpp}::has_count_expr ( RelAlgExecutionUnit const &  ra_exe_unit)

Definition at line 2588 of file NativeCodegen.cpp.

2588  {
2589  for (auto const expr : ra_exe_unit.target_exprs) {
2590  if (auto const agg_expr = dynamic_cast<Analyzer::AggExpr*>(expr)) {
2591  if (shared::is_any<SQLAgg::kCOUNT, SQLAgg::kCOUNT_IF>(agg_expr->get_aggtype())) {
2592  return true;
2593  }
2594  }
2595  }
2596  return false;
2597 }
bool anonymous_namespace{NativeCodegen.cpp}::is_gpu_shared_mem_supported ( const QueryMemoryDescriptor *  query_mem_desc_ptr,
const RelAlgExecutionUnit &  ra_exe_unit,
const CudaMgr_Namespace::CudaMgr *  cuda_mgr,
const ExecutorDeviceType  device_type,
const unsigned  cuda_blocksize,
const unsigned  num_blocks_per_mp 
)

To simplify the implementation for practical purposes, we initially provide shared memory support only for cases where there are at most as many entries in the output buffer as there are threads within each GPU device. In order to relax this assumption later, we need to add a for loop to the generated code so that each thread loops over multiple entries. TODO: relax this if necessary

Definition at line 2631 of file NativeCodegen.cpp.

2636  {
2637  if (device_type == ExecutorDeviceType::CPU) {
2638  return false;
2639  }
2640  if (query_mem_desc_ptr->didOutputColumnar()) {
2641  return false;
2642  }
2643  CHECK(query_mem_desc_ptr);
2644  CHECK(cuda_mgr);
2645  /*
2646  * We only use shared memory strategy if GPU hardware provides native shared
2647  * memory atomics support. From CUDA Toolkit documentation:
2648  * https://docs.nvidia.com/cuda/pascal-tuning-guide/index.html#atomic-ops "Like
2649  * Maxwell, Pascal [and Volta] provides native shared memory atomic operations
2650  * for 32-bit integer arithmetic, along with native 32 or 64-bit compare-and-swap
2651  * (CAS)."
2652  *
2653  */
2654  if (!cuda_mgr->isArchMaxwellOrLaterForAll()) {
2655  return false;
2656  }
2657  if (cuda_mgr->isArchPascal() && !ra_exe_unit.join_quals.empty() &&
2658  has_count_expr(ra_exe_unit) && has_case_expr_within_groupby_expr(ra_exe_unit)) {
2659  return false;
2660  }
2661 
2662  if (query_mem_desc_ptr->getQueryDescriptionType() ==
2663  QueryDescriptionType::NonGroupedAggregate &&
2664  g_enable_smem_non_grouped_agg &&
2665  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty()) {
2666  // TODO: relax this, if necessary
2667  if (cuda_blocksize < query_mem_desc_ptr->getEntryCount()) {
2668  return false;
2669  }
2670  // skip shared memory usage when dealing with 1) variable length targets, 2)
2671  // not a COUNT aggregate
2672  const auto target_infos =
2673  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
2674  std::unordered_set<SQLAgg> supported_aggs{kCOUNT, kCOUNT_IF};
2675  if (std::find_if(target_infos.begin(),
2676  target_infos.end(),
2677  [&supported_aggs](const TargetInfo& ti) {
2678  if (ti.sql_type.is_varlen() ||
2679  !supported_aggs.count(ti.agg_kind)) {
2680  return true;
2681  } else {
2682  return false;
2683  }
2684  }) == target_infos.end()) {
2685  return true;
2686  }
2687  }
2688  if (query_mem_desc_ptr->getQueryDescriptionType() ==
2689  QueryDescriptionType::GroupByPerfectHash &&
2690  g_enable_smem_group_by) {
2691  /**
2692  * To simplify the implementation for practical purposes, we
2693  * initially provide shared memory support for cases where there are at most as
2694  * many entries in the output buffer as there are threads within each GPU device.
2695  * In order to relax this assumption later, we need to add a for loop in
2696  * generated codes such that each thread loops over multiple entries.
2697  * TODO: relax this if necessary
2698  */
2699  if (cuda_blocksize < query_mem_desc_ptr->getEntryCount()) {
2700  return false;
2701  }
2702 
2703  // Fundamentally, we should use shared memory whenever the output buffer
2704  // is small enough so that we can fit it in the shared memory and yet expect
2705  // good occupancy.
2706  // For now, we allow keyless, row-wise layout, and only for perfect hash
2707  // group by operations.
2708  if (query_mem_desc_ptr->hasKeylessHash() &&
2709  query_mem_desc_ptr->countDistinctDescriptorsLogicallyEmpty() &&
2710  !query_mem_desc_ptr->useStreamingTopN()) {
2711  const size_t shared_memory_threshold_bytes = std::min(
2712  g_gpu_smem_threshold == 0 ? SIZE_MAX : g_gpu_smem_threshold,
2713  cuda_mgr->getMinSharedMemoryPerBlockForAllDevices() / num_blocks_per_mp);
2714  const auto output_buffer_size =
2715  query_mem_desc_ptr->getRowSize() * query_mem_desc_ptr->getEntryCount();
2716  if (output_buffer_size > shared_memory_threshold_bytes) {
2717  return false;
2718  }
2719 
2720  // skip shared memory usage when dealing with 1) variable length targets, 2)
2721  // non-basic aggregates (COUNT, SUM, MIN, MAX, AVG)
2722  // TODO: relax this if necessary
2723  const auto target_infos =
2724  target_exprs_to_infos(ra_exe_unit.target_exprs, *query_mem_desc_ptr);
2725  std::unordered_set<SQLAgg> supported_aggs{kCOUNT, kCOUNT_IF};
2726  if (g_enable_smem_grouped_non_count_agg) {
2727  supported_aggs = {kCOUNT, kCOUNT_IF, kMIN, kMAX, kSUM, kSUM_IF, kAVG};
2728  }
2729  if (std::find_if(target_infos.begin(),
2730  target_infos.end(),
2731  [&supported_aggs](const TargetInfo& ti) {
2732  if (ti.sql_type.is_varlen() ||
2733  !supported_aggs.count(ti.agg_kind)) {
2734  return true;
2735  } else {
2736  return false;
2737  }
2738  }) == target_infos.end()) {
2739  return true;
2740  }
2741  }
2742  }
2743  return false;
2744 }
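A hedged worked example for the perfect-hash branch: with g_gpu_smem_threshold = 0 (no explicit cap, mapped to SIZE_MAX), 49152 bytes of minimum shared memory per block, and num_blocks_per_mp = 2, the threshold is min(SIZE_MAX, 49152 / 2) = 24576 bytes; a keyless, row-wise output buffer of 8-byte rows and 2048 entries (16384 bytes) stays under that threshold and remains eligible.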
void anonymous_namespace{NativeCodegen.cpp}::optimize_ir ( llvm::Function *  query_func,
llvm::Module *  llvm_module,
llvm::legacy::PassManager &  pass_manager,
const std::unordered_set< llvm::Function * > &  live_funcs,
const bool  is_gpu_smem_used,
const CompilationOptions &  co 
)

Definition at line 306 of file NativeCodegen.cpp.

References DEBUG_TIMER, and eliminate_dead_self_recursive_funcs().

Referenced by CodeGenerator::generateNativeCPUCode().

311  {
312  auto timer = DEBUG_TIMER(__func__);
313  // the always inliner legacy pass must always run first
314  pass_manager.add(llvm::createVerifierPass());
315  pass_manager.add(llvm::createAlwaysInlinerLegacyPass());
316 
317  pass_manager.add(new AnnotateInternalFunctionsPass());
318 
319  pass_manager.add(llvm::createSROAPass());
320  // mem ssa drops unused load and store instructions, e.g. passing variables directly
321  // where possible
322  pass_manager.add(
323  llvm::createEarlyCSEPass(/*enable_mem_ssa=*/true)); // Catch trivial redundancies
324 
325  if (!is_gpu_smem_used) {
326  // thread jumps can change the execution order around SMEM sections guarded by
327  // `__syncthreads()`, which results in race conditions. For now, disable jump
328  // threading for shared memory queries. In the future, consider handling shared
329  // memory aggregations with a separate kernel launch
330  pass_manager.add(llvm::createJumpThreadingPass()); // Thread jumps.
331  }
332  pass_manager.add(llvm::createCFGSimplificationPass());
333 
334  // remove load/stores in PHIs if instructions can be accessed directly post thread jumps
335  pass_manager.add(llvm::createNewGVNPass());
336 
337  pass_manager.add(llvm::createDeadStoreEliminationPass());
338  pass_manager.add(llvm::createLICMPass());
339 
340  pass_manager.add(llvm::createInstructionCombiningPass());
341 
342  // module passes
343  pass_manager.add(llvm::createPromoteMemoryToRegisterPass());
344  pass_manager.add(llvm::createGlobalOptimizerPass());
345 
346  pass_manager.add(llvm::createCFGSimplificationPass()); // cleanup after everything
347 
348  pass_manager.run(*llvm_module);
349 
350  eliminate_dead_self_recursive_funcs(*llvm_module, live_funcs);
351 }
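A minimal usage sketch, mirroring the call site in CodeGenerator::generateNativeCPUCode() (the surrounding setup is illustrative):

  llvm::legacy::PassManager pass_manager;
  optimize_ir(query_func,
              llvm_module,
              pass_manager,
              live_funcs,
              /*is_gpu_smem_used=*/false,  // CPU path: no GPU shared memory involved
              co);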

template<typename T = void>
void anonymous_namespace{NativeCodegen.cpp}::scan_function_calls ( llvm::Function &  F,
std::unordered_set< std::string > &  defined,
std::unordered_set< std::string > &  undefined,
const std::unordered_set< std::string > &  ignored 
)

Definition at line 193 of file NativeCodegen.cpp.

Referenced by scan_function_calls().

196  {
197  for (llvm::inst_iterator I = llvm::inst_begin(F), E = llvm::inst_end(F); I != E; ++I) {
198  if (auto* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
199  auto* F2 = CI->getCalledFunction();
200  if (F2 != nullptr) {
201  auto F2name = F2->getName().str();
202  if (F2->isDeclaration()) {
203  if (F2name.rfind("__", 0) !=
204  0 // assume symbols with double underscore are defined
205  && F2name.rfind("llvm.", 0) !=
206  0 // TODO: this may give false positive for NVVM intrinsics
207  && ignored.find(F2name) == ignored.end() // not in ignored list
208  ) {
209  undefined.emplace(F2name);
210  }
211  } else {
212  if (defined.find(F2name) == defined.end()) {
213  defined.emplace(F2name);
214  scan_function_calls<T>(*F2, defined, undefined, ignored);
215  }
216  }
217  }
218  }
219  }
220 }


template<typename T = void>
void anonymous_namespace{NativeCodegen.cpp}::scan_function_calls ( llvm::Module &  llvm_module,
std::unordered_set< std::string > &  defined,
std::unordered_set< std::string > &  undefined,
const std::unordered_set< std::string > &  ignored 
)

Definition at line 223 of file NativeCodegen.cpp.

References scan_function_calls().

226  {
227  for (auto& F : llvm_module) {
228  if (!F.isDeclaration()) {
229  scan_function_calls(F, defined, undefined, ignored);
230  }
231  }
232 }

template<typename T = void>
std::tuple<std::unordered_set<std::string>, std::unordered_set<std::string> > anonymous_namespace{NativeCodegen.cpp}::scan_function_calls ( llvm::Module &  llvm_module,
const std::unordered_set< std::string > &  ignored = {} 
)

Definition at line 236 of file NativeCodegen.cpp.

237  const std::unordered_set<std::string>& ignored = {}) {
238  std::unordered_set<std::string> defined, undefined;
239  scan_function_calls(llvm_module, defined, undefined, ignored);
240  return std::make_tuple(defined, undefined);
241 }
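A hedged usage sketch of this tuple-returning overload:

  auto [defined, undefined] = scan_function_calls(*llvm_module);
  for (const auto& fname : undefined) {
    std::cout << "unresolved call target: " << fname << std::endl;
  }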
std::string anonymous_namespace{NativeCodegen.cpp}::serialize_llvm_metadata_footnotes ( llvm::Function *  query_func,
CgenState *  cgen_state 
)

Definition at line 2747 of file NativeCodegen.cpp.

2748  {
2749  std::string llvm_ir;
2750  std::unordered_set<llvm::MDNode*> md;
2751 
2752  // Loop over all instructions in the query function.
2753  for (auto bb_it = query_func->begin(); bb_it != query_func->end(); ++bb_it) {
2754  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2755  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2756  instr_it->getAllMetadata(imd);
2757  for (auto [kind, node] : imd) {
2758  md.insert(node);
2759  }
2760  }
2761  }
2762 
2763  // Loop over all instructions in the row function.
2764  for (auto bb_it = cgen_state->row_func_->begin(); bb_it != cgen_state->row_func_->end();
2765  ++bb_it) {
2766  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2767  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2768  instr_it->getAllMetadata(imd);
2769  for (auto [kind, node] : imd) {
2770  md.insert(node);
2771  }
2772  }
2773  }
2774 
2775  // Loop over all instructions in the filter function.
2776  if (cgen_state->filter_func_) {
2777  for (auto bb_it = cgen_state->filter_func_->begin();
2778  bb_it != cgen_state->filter_func_->end();
2779  ++bb_it) {
2780  for (auto instr_it = bb_it->begin(); instr_it != bb_it->end(); ++instr_it) {
2781  llvm::SmallVector<std::pair<unsigned, llvm::MDNode*>, 100> imd;
2782  instr_it->getAllMetadata(imd);
2783  for (auto [kind, node] : imd) {
2784  md.insert(node);
2785  }
2786  }
2787  }
2788  }
2789 
2790  // Sort the metadata by canonical number and convert to text.
2791  if (!md.empty()) {
2792  std::map<size_t, std::string> sorted_strings;
2793  for (auto p : md) {
2794  std::string str;
2795  llvm::raw_string_ostream os(str);
2796  p->print(os, cgen_state->module_, true);
2797  os.flush();
2798  auto fields = split(str, {}, 1);
2799  if (fields.empty() || fields[0].empty()) {
2800  continue;
2801  }
2802  sorted_strings.emplace(std::stoul(fields[0].substr(1)), str);
2803  }
2804  llvm_ir += "\n";
2805  for (auto [id, text] : sorted_strings) {
2806  llvm_ir += text;
2807  llvm_ir += "\n";
2808  }
2809  }
2810 
2811  return llvm_ir;
2812 }
void anonymous_namespace{NativeCodegen.cpp}::set_row_func_argnames ( llvm::Function *  row_func,
const size_t  in_col_count,
const size_t  agg_col_count,
const bool  hoist_literals 
)

Definition at line 1688 of file NativeCodegen.cpp.

References to_string().

1691  {
1692  auto arg_it = row_func->arg_begin();
1693 
1694  if (agg_col_count) {
1695  for (size_t i = 0; i < agg_col_count; ++i) {
1696  arg_it->setName("out");
1697  ++arg_it;
1698  }
1699  } else {
1700  arg_it->setName("group_by_buff");
1701  ++arg_it;
1702  arg_it->setName("varlen_output_buff");
1703  ++arg_it;
1704  arg_it->setName("crt_matched");
1705  ++arg_it;
1706  arg_it->setName("total_matched");
1707  ++arg_it;
1708  arg_it->setName("old_total_matched");
1709  ++arg_it;
1710  arg_it->setName("max_matched");
1711  ++arg_it;
1712  }
1713 
1714  arg_it->setName("agg_init_val");
1715  ++arg_it;
1716 
1717  arg_it->setName("pos");
1718  ++arg_it;
1719 
1720  arg_it->setName("frag_row_off");
1721  ++arg_it;
1722 
1723  arg_it->setName("num_rows_per_scan");
1724  ++arg_it;
1725 
1726  if (hoist_literals) {
1727  arg_it->setName("literals");
1728  ++arg_it;
1729  }
1730 
1731  for (size_t i = 0; i < in_col_count; ++i) {
1732  arg_it->setName("col_buf" + std::to_string(i));
1733  ++arg_it;
1734  }
1735 
1736  arg_it->setName("join_hash_tables");
1737  ++arg_it;
1738  arg_it->setName("row_func_mgr");
1739 }
std::string to_string(char const *&&v)

+ Here is the call graph for this function:

template<typename T = void>
void anonymous_namespace{NativeCodegen.cpp}::show_defined ( llvm::Module &  llvm_module)

Definition at line 154 of file NativeCodegen.cpp.


Referenced by show_defined().

154  {
155  std::cout << "defines: ";
156  for (auto& f : llvm_module.getFunctionList()) {
157  if (!f.isDeclaration()) {
158  std::cout << f.getName().str() << ", ";
159  }
160  }
161  std::cout << std::endl;
162 }

template<typename T = void>
void anonymous_namespace{NativeCodegen.cpp}::show_defined ( llvm::Module *  llvm_module)

Definition at line 165 of file NativeCodegen.cpp.

References show_defined().

165  {
166  if (llvm_module == nullptr) {
167  std::cout << "is null" << std::endl;
168  } else {
169  show_defined(*llvm_module);
170  }
171 }

template<typename T = void>
void anonymous_namespace{NativeCodegen.cpp}::show_defined ( std::unique_ptr< llvm::Module > &  llvm_module)

Definition at line 174 of file NativeCodegen.cpp.

References show_defined().

174  {
175  show_defined(llvm_module.get());
176 }

void anonymous_namespace{NativeCodegen.cpp}::throw_parseIR_error ( const llvm::SMDiagnostic &  parse_error,
std::string  src = "",
const bool  is_gpu = false 
)

Definition at line 122 of file NativeCodegen.cpp.

124  {
125  std::string excname = (is_gpu ? "NVVM IR ParseError: " : "LLVM IR ParseError: ");
126  llvm::raw_string_ostream ss(excname);
127  parse_error.print(src.c_str(), ss, false, false);
128  throw ParseIRError(ss.str());
129 }
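A hedged usage sketch (the buffer and context setup are illustrative; llvm::parseIR is the standard LLVM parsing entry point, which fills in the SMDiagnostic on failure):

  llvm::SMDiagnostic parse_error;
  std::unique_ptr<llvm::Module> llvm_module =
      llvm::parseIR(mem_buffer->getMemBufferRef(), parse_error, context);
  if (!llvm_module) {
    throw_parseIR_error(parse_error, ir_src, /*is_gpu=*/false);
  }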

Variable Documentation

const std::string anonymous_namespace{NativeCodegen.cpp}::cuda_rt_decls

Definition at line 638 of file NativeCodegen.cpp.