OmniSciDB  72c90bc290
GroupByAndAggregate Class Reference

#include <GroupByAndAggregate.h>


Public Member Functions

 GroupByAndAggregate (Executor *executor, const ExecutorDeviceType device_type, const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, const std::optional< int64_t > &group_cardinality_estimation)
 
bool codegen (llvm::Value *filter_result, llvm::BasicBlock *sc_false, QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context)
 

Static Public Member Functions

static size_t shard_count_for_top_groups (const RelAlgExecutionUnit &ra_exe_unit)
 

Private Member Functions

bool gpuCanHandleOrderEntries (const std::list< Analyzer::OrderEntry > &order_entries)
 
std::unique_ptr< QueryMemoryDescriptor > initQueryMemoryDescriptor (const bool allow_multifrag, const size_t max_groups_buffer_entry_count, const int8_t crt_min_byte_width, RenderInfo *render_info, const bool output_columnar_hint)
 
std::unique_ptr< QueryMemoryDescriptor > initQueryMemoryDescriptorImpl (const bool allow_multifrag, const size_t max_groups_buffer_entry_count, const int8_t crt_min_byte_width, const bool sort_on_gpu_hint, RenderInfo *render_info, const bool must_use_baseline_sort, const bool output_columnar_hint)
 
int64_t getShardedTopBucket (const ColRangeInfo &col_range_info, const size_t shard_count) const
 
llvm::Value * codegenOutputSlot (llvm::Value *groups_buffer, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, DiamondCodegen &diamond_codegen)
 
std::tuple< llvm::Value *, llvm::Value * > codegenGroupBy (const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, DiamondCodegen &codegen)
 
llvm::Value * codegenVarlenOutputBuffer (const QueryMemoryDescriptor &query_mem_desc)
 
std::tuple< llvm::Value *, llvm::Value * > codegenSingleColumnPerfectHash (const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, llvm::Value *groups_buffer, llvm::Value *group_expr_lv_translated, llvm::Value *group_expr_lv_original, const int32_t row_size_quad)
 
std::tuple< llvm::Value *, llvm::Value * > codegenMultiColumnPerfectHash (llvm::Value *groups_buffer, llvm::Value *group_key, llvm::Value *key_size_lv, const QueryMemoryDescriptor &query_mem_desc, const int32_t row_size_quad)
 
llvm::Function * codegenPerfectHashFunction ()
 
std::tuple< llvm::Value *, llvm::Value * > codegenMultiColumnBaselineHash (const CompilationOptions &co, llvm::Value *groups_buffer, llvm::Value *group_key, llvm::Value *key_size_lv, const QueryMemoryDescriptor &query_mem_desc, const size_t key_width, const int32_t row_size_quad)
 
ColRangeInfo getColRangeInfo ()
 
llvm::Value * convertNullIfAny (const SQLTypeInfo &arg_type, const TargetInfo &agg_info, llvm::Value *target)
 
bool codegenAggCalls (const std::tuple< llvm::Value *, llvm::Value * > &agg_out_ptr_w_idx, llvm::Value *varlen_output_buffer, const std::vector< llvm::Value * > &agg_out_vec, QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, const GpuSharedMemoryContext &gpu_smem_context, DiamondCodegen &diamond_codegen)
 
llvm::Value * codegenWindowRowPointer (const Analyzer::WindowFunction *window_func, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &co, DiamondCodegen &diamond_codegen)
 
llvm::Value * codegenAggColumnPtr (llvm::Value *output_buffer_byte_stream, llvm::Value *out_row_idx, const std::tuple< llvm::Value *, llvm::Value * > &agg_out_ptr_w_idx, const QueryMemoryDescriptor &query_mem_desc, const size_t chosen_bytes, const size_t agg_out_off, const size_t target_idx)
 Returns the pointer to where the aggregation should be stored. More...
 
void codegenEstimator (std::stack< llvm::BasicBlock * > &array_loops, DiamondCodegen &diamond_codegen, const QueryMemoryDescriptor &query_mem_desc, const CompilationOptions &)
 
void codegenCountDistinct (const size_t target_idx, const Analyzer::Expr *target_expr, std::vector< llvm::Value * > &agg_args, const QueryMemoryDescriptor &, const ExecutorDeviceType)
 
void codegenApproxQuantile (const size_t target_idx, const Analyzer::Expr *target_expr, std::vector< llvm::Value * > &agg_args, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type)
 
void codegenMode (const size_t target_idx, const Analyzer::Expr *target_expr, std::vector< llvm::Value * > &agg_args, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type)
 
llvm::Value * getAdditionalLiteral (const int32_t off)
 
std::vector< llvm::Value * > codegenAggArg (const Analyzer::Expr *target_expr, const CompilationOptions &co)
 
llvm::Value * emitCall (const std::string &fname, const std::vector< llvm::Value * > &args)
 
void checkErrorCode (llvm::Value *retCode)
 
bool needsUnnestDoublePatch (llvm::Value const *val_ptr, const std::string &agg_base_name, const bool threads_share_memory, const CompilationOptions &co) const
 
void prependForceSync ()
 

Static Private Member Functions

static int64_t getBucketedCardinality (const ColRangeInfo &col_range_info)
 

Private Attributes

Executor * executor_
 
const RelAlgExecutionUnit & ra_exe_unit_
 
const std::vector< InputTableInfo > & query_infos_
 
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
 
bool output_columnar_
 
const ExecutorDeviceType device_type_
 
const std::optional< int64_t > group_cardinality_estimation_
 

Friends

class Executor
 
class QueryMemoryDescriptor
 
class CodeGenerator
 
class ExecutionKernel
 
struct TargetExprCodegen
 
struct TargetExprCodegenBuilder
 

Detailed Description

Definition at line 61 of file GroupByAndAggregate.h.

Constructor & Destructor Documentation

GroupByAndAggregate::GroupByAndAggregate ( Executor *  executor,
const ExecutorDeviceType  device_type,
const RelAlgExecutionUnit &  ra_exe_unit,
const std::vector< InputTableInfo > &  query_infos,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
const std::optional< int64_t > &  group_cardinality_estimation 
)

Definition at line 391 of file GroupByAndAggregate.cpp.

References RelAlgExecutionUnit::groupby_exprs, and ra_exe_unit_.

398  : executor_(executor)
399  , ra_exe_unit_(ra_exe_unit)
400  , query_infos_(query_infos)
401  , row_set_mem_owner_(row_set_mem_owner)
402  , device_type_(device_type)
403  , group_cardinality_estimation_(group_cardinality_estimation) {
404  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
405  if (!groupby_expr) {
406  continue;
407  }
408  const auto& groupby_ti = groupby_expr->get_type_info();
409  if (groupby_ti.is_text_encoding_none()) {
410  throw std::runtime_error(
411  "Cannot group by string columns which are not dictionary encoded.");
412  }
413  if (groupby_ti.is_buffer()) {
414  throw std::runtime_error("Group by buffer not supported");
415  }
416  if (groupby_ti.is_geometry()) {
417  throw std::runtime_error("Group by geometry not supported");
418  }
419  }
420 }

Member Function Documentation

void GroupByAndAggregate::checkErrorCode ( llvm::Value *  retCode)
private

Definition at line 2209 of file GroupByAndAggregate.cpp.

References AUTOMATIC_IR_METADATA, and executor_.

Referenced by TargetExprCodegen::codegenAggregate().

2209  {
2210  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2211  auto zero_const = llvm::ConstantInt::get(retCode->getType(), 0, true);
2212  auto rc_check_condition = executor_->cgen_state_->ir_builder_.CreateICmp(
2213  llvm::ICmpInst::ICMP_EQ, retCode, zero_const);
2214 
2215  executor_->cgen_state_->emitErrorCheck(rc_check_condition, retCode, "rc");
2216 }
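
The emitted IR simply compares the runtime return code against zero and routes non-zero values to the error-handling path. A minimal C++ sketch of the check the generated code performs at runtime (the function and variable names here are illustrative, not part of the generated module):

    #include <cstdint>

    // Roughly what emitErrorCheck(rc == 0, rc, "rc") amounts to at runtime:
    // a non-zero return code aborts further work for the current row and is
    // propagated back to the host side.
    inline bool check_error_code(int32_t rc, int32_t& query_error_code) {
      if (rc != 0) {
        query_error_code = rc;  // remember the first error encountered
        return false;           // stop processing this row
      }
      return true;
    }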

bool GroupByAndAggregate::codegen ( llvm::Value *  filter_result,
llvm::BasicBlock *  sc_false,
QueryMemoryDescriptor &  query_mem_desc,
const CompilationOptions &  co,
const GpuSharedMemoryContext &  gpu_smem_context 
)

Definition at line 1022 of file GroupByAndAggregate.cpp.

References AUTOMATIC_IR_METADATA, CHECK, codegenAggCalls(), codegenEstimator(), codegenGroupBy(), codegenVarlenOutputBuffer(), DiamondCodegen::cond_false_, CompilationOptions::device_type, QueryMemoryDescriptor::didOutputColumnar(), RelAlgExecutionUnit::estimator, executor_, anonymous_namespace{GroupByAndAggregate.cpp}::get_agg_count(), get_arg_by_name(), get_int_type(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getQueryDescriptionType(), GPU, RelAlgExecutionUnit::groupby_exprs, GroupByPerfectHash, RelAlgExecutionUnit::join_quals, LL_BUILDER, LL_CONTEXT, LL_INT, LLVM_ALIGN, CodeGenerator::posArg(), prependForceSync(), Projection, query_mem_desc, ra_exe_unit_, ROW_FUNC, RelAlgExecutionUnit::target_exprs, QueryMemoryDescriptor::usesGetGroupValueFast(), and QueryMemoryDescriptor::useStreamingTopN().

1026  {
1027  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1028  CHECK(filter_result);
1029 
1030  bool can_return_error = false;
1031  llvm::BasicBlock* filter_false{nullptr};
1032 
1033  {
1034  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
1035 
1036  if (executor_->isArchMaxwell(co.device_type)) {
1037  prependForceSync();
1038  }
1039  DiamondCodegen filter_cfg(filter_result,
1040  executor_,
1041  !is_group_by || query_mem_desc.usesGetGroupValueFast(),
1042  "filter", // filter_true and filter_false basic blocks
1043  nullptr,
1044  false);
1045  filter_false = filter_cfg.cond_false_;
1046 
1047  if (is_group_by) {
1048  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection &&
1049  !query_mem_desc.useStreamingTopN()) {
1050  const auto crt_matched = get_arg_by_name(ROW_FUNC, "crt_matched");
1051  LL_BUILDER.CreateStore(LL_INT(int32_t(1)), crt_matched);
1052  auto total_matched_ptr = get_arg_by_name(ROW_FUNC, "total_matched");
1053  llvm::Value* old_total_matched_val{nullptr};
1054  if (co.device_type == ExecutorDeviceType::GPU) {
1055  old_total_matched_val =
1056  LL_BUILDER.CreateAtomicRMW(llvm::AtomicRMWInst::Add,
1057  total_matched_ptr,
1058  LL_INT(int32_t(1)),
1059 #if LLVM_VERSION_MAJOR > 12
1060  LLVM_ALIGN(8),
1061 #endif
1062  llvm::AtomicOrdering::Monotonic);
1063  } else {
1064  old_total_matched_val = LL_BUILDER.CreateLoad(
1065  total_matched_ptr->getType()->getPointerElementType(), total_matched_ptr);
1066  LL_BUILDER.CreateStore(
1067  LL_BUILDER.CreateAdd(old_total_matched_val, LL_INT(int32_t(1))),
1068  total_matched_ptr);
1069  }
1070  auto old_total_matched_ptr = get_arg_by_name(ROW_FUNC, "old_total_matched");
1071  LL_BUILDER.CreateStore(old_total_matched_val, old_total_matched_ptr);
1072  }
1073 
1074  auto agg_out_ptr_w_idx = codegenGroupBy(query_mem_desc, co, filter_cfg);
1075  auto varlen_output_buffer = codegenVarlenOutputBuffer(query_mem_desc);
1076  if (query_mem_desc.usesGetGroupValueFast() ||
1077  query_mem_desc.getQueryDescriptionType() ==
1078  QueryDescriptionType::GroupByPerfectHash) {
1079  if (query_mem_desc.getGroupbyColCount() > 1) {
1080  filter_cfg.setChainToNext();
1081  }
1082  // Don't generate null checks if the group slot is guaranteed to be non-null,
1083  // as it's the case for get_group_value_fast* family.
1084  can_return_error = codegenAggCalls(agg_out_ptr_w_idx,
1085  varlen_output_buffer,
1086  {},
1087  query_mem_desc,
1088  co,
1089  gpu_smem_context,
1090  filter_cfg);
1091  } else {
1092  {
1093  llvm::Value* nullcheck_cond{nullptr};
1094  if (query_mem_desc.didOutputColumnar()) {
1095  nullcheck_cond = LL_BUILDER.CreateICmpSGE(std::get<1>(agg_out_ptr_w_idx),
1096  LL_INT(int32_t(0)));
1097  } else {
1098  nullcheck_cond = LL_BUILDER.CreateICmpNE(
1099  std::get<0>(agg_out_ptr_w_idx),
1100  llvm::ConstantPointerNull::get(
1101  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)));
1102  }
1103  DiamondCodegen nullcheck_cfg(
1104  nullcheck_cond, executor_, false, "groupby_nullcheck", &filter_cfg, false);
1105  codegenAggCalls(agg_out_ptr_w_idx,
1106  varlen_output_buffer,
1107  {},
1108  query_mem_desc,
1109  co,
1110  gpu_smem_context,
1111  filter_cfg);
1112  }
1113  can_return_error = true;
1114  if (query_mem_desc.getQueryDescriptionType() ==
1115  QueryDescriptionType::Projection &&
1116  query_mem_desc.useStreamingTopN()) {
1117  // Ignore rejection on pushing current row to top-K heap.
1118  LL_BUILDER.CreateRet(LL_INT(int32_t(0)));
1119  } else {
1120  CodeGenerator code_generator(executor_);
1121  LL_BUILDER.CreateRet(LL_BUILDER.CreateNeg(LL_BUILDER.CreateTrunc(
1122  // TODO(alex): remove the trunc once pos is converted to 32 bits
1123  code_generator.posArg(nullptr),
1124  get_int_type(32, LL_CONTEXT))));
1125  }
1126  }
1127  } else {
1128  if (ra_exe_unit_.estimator) {
1129  std::stack<llvm::BasicBlock*> array_loops;
1130  codegenEstimator(array_loops, filter_cfg, query_mem_desc, co);
1131  } else {
1132  auto arg_it = ROW_FUNC->arg_begin();
1133  std::vector<llvm::Value*> agg_out_vec;
1134  for (int32_t i = 0; i < get_agg_count(ra_exe_unit_.target_exprs); ++i) {
1135  agg_out_vec.push_back(&*arg_it++);
1136  }
1137  can_return_error = codegenAggCalls(std::make_tuple(nullptr, nullptr),
1138  /*varlen_output_buffer=*/nullptr,
1139  agg_out_vec,
1140  query_mem_desc,
1141  co,
1142  gpu_smem_context,
1143  filter_cfg);
1144  }
1145  }
1146  }
1147 
1148  if (ra_exe_unit_.join_quals.empty()) {
1149  executor_->cgen_state_->ir_builder_.CreateRet(LL_INT(int32_t(0)));
1150  } else if (sc_false) {
1151  const auto saved_insert_block = LL_BUILDER.GetInsertBlock();
1152  LL_BUILDER.SetInsertPoint(sc_false);
1153  LL_BUILDER.CreateBr(filter_false);
1154  LL_BUILDER.SetInsertPoint(saved_insert_block);
1155  }
1156 
1157  return can_return_error;
1158 }
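
In the projection path above, the running `total_matched` counter is bumped either with an atomic read-modify-write (when output threads share the buffer, as on GPU) or with a plain load/add/store; the previous value becomes the output slot for the current row. A rough C++ analogue of the two strategies, as a hedged sketch (names are illustrative):

    #include <atomic>
    #include <cstdint>

    // Shared output buffer: concurrent row functions must not lose updates, so
    // the increment is an atomic fetch-add with relaxed ("Monotonic") ordering.
    int32_t bump_total_matched_shared(std::atomic<int32_t>& total_matched) {
      return total_matched.fetch_add(1, std::memory_order_relaxed);
    }

    // Private output buffer: a plain read-modify-write suffices.
    int32_t bump_total_matched_private(int32_t& total_matched) {
      const int32_t old_total_matched = total_matched;
      total_matched = old_total_matched + 1;
      return old_total_matched;
    }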

std::vector< llvm::Value * > GroupByAndAggregate::codegenAggArg ( const Analyzer::Expr *  target_expr,
const CompilationOptions &  co 
)
private

Definition at line 2019 of file GroupByAndAggregate.cpp.

References AUTOMATIC_IR_METADATA, CodeGenerator::cgen_state_, CHECK, CHECK_EQ, CodeGenerator::codegen(), CgenState::context_, CUR_FUNC, executor_, get_int_type(), Analyzer::Expr::get_type_info(), SQLTypeInfo::is_geometry(), kARRAY, kPOINT, kSAMPLE, LL_BUILDER, LL_CONTEXT, log2_bytes(), and CodeGenerator::posArg().

Referenced by TargetExprCodegen::codegen(), and TargetExprCodegenBuilder::codegenMultiSlotSampleExpressions().

2021  {
2022  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2023  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
2024  const auto func_expr = dynamic_cast<const Analyzer::FunctionOper*>(target_expr);
2025  const auto arr_expr = dynamic_cast<const Analyzer::ArrayExpr*>(target_expr);
2026 
2027  // TODO(alex): handle arrays uniformly?
2028  CodeGenerator code_generator(executor_);
2029  if (target_expr) {
2030  const auto& target_ti = target_expr->get_type_info();
2031  if (target_ti.is_buffer() &&
2032  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
2033  const auto target_lvs =
2034  agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
2035  : code_generator.codegen(
2036  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
2037  if (!func_expr && !arr_expr) {
2038  // Something with the chunk transport is code that was generated from a source
2039  // other than an ARRAY[] expression
2040  if (target_ti.is_text_encoding_none()) {
2041  CHECK_EQ(size_t(3), target_lvs.size());
2042  return {target_lvs[1], target_lvs[2]};
2043  }
2044  CHECK(target_ti.is_array());
2045  CHECK_EQ(size_t(1), target_lvs.size());
2046  CHECK(!agg_expr || agg_expr->get_aggtype() == kSAMPLE);
2047  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
2048  const auto i8p_ty =
2049  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
2050  const auto& elem_ti = target_ti.get_elem_type();
2051  return {
2052  executor_->cgen_state_->emitExternalCall(
2053  "array_buff",
2054  i8p_ty,
2055  {target_lvs.front(), code_generator.posArg(target_expr)}),
2056  executor_->cgen_state_->emitExternalCall(
2057  "array_size",
2058  i32_ty,
2059  {target_lvs.front(),
2060  code_generator.posArg(target_expr),
2061  executor_->cgen_state_->llInt(log2_bytes(elem_ti.get_logical_size()))})};
2062  } else {
2063  if (agg_expr) {
2064  throw std::runtime_error(
2065  "Using array[] operator as argument to an aggregate operator is not "
2066  "supported");
2067  }
2068  CHECK(func_expr || arr_expr);
2069  if (dynamic_cast<const Analyzer::FunctionOper*>(target_expr)) {
2070  CHECK_EQ(size_t(1), target_lvs.size());
2071  const auto prefix = target_ti.get_buffer_name();
2072  CHECK(target_ti.is_array() || target_ti.is_text_encoding_none());
2073  const auto target_lv = LL_BUILDER.CreateLoad(
2074  target_lvs[0]->getType()->getPointerElementType(), target_lvs[0]);
2075  // const auto target_lv_type = target_lvs[0]->getType();
2076  // CHECK(target_lv_type->isStructTy());
2077  // CHECK_EQ(target_lv_type->getNumContainedTypes(), 3u);
2078  const auto i8p_ty = llvm::PointerType::get(
2079  get_int_type(8, executor_->cgen_state_->context_), 0);
2080  const auto ptr = LL_BUILDER.CreatePointerCast(
2081  LL_BUILDER.CreateExtractValue(target_lv, 0), i8p_ty);
2082  const auto size = LL_BUILDER.CreateExtractValue(target_lv, 1);
2083  const auto null_flag = LL_BUILDER.CreateExtractValue(target_lv, 2);
2084  const auto nullcheck_ok_bb =
2085  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_nullcheck_ok_bb", CUR_FUNC);
2086  const auto nullcheck_fail_bb = llvm::BasicBlock::Create(
2087  LL_CONTEXT, prefix + "_nullcheck_fail_bb", CUR_FUNC);
2088 
2089  // TODO(adb): probably better to zext the bool
2090  const auto nullcheck = LL_BUILDER.CreateICmpEQ(
2091  null_flag, executor_->cgen_state_->llInt(static_cast<int8_t>(1)));
2092  LL_BUILDER.CreateCondBr(nullcheck, nullcheck_fail_bb, nullcheck_ok_bb);
2093 
2094  const auto ret_bb =
2095  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_return", CUR_FUNC);
2096  LL_BUILDER.SetInsertPoint(ret_bb);
2097  auto result_phi = LL_BUILDER.CreatePHI(i8p_ty, 2, prefix + "_ptr_return");
2098  result_phi->addIncoming(ptr, nullcheck_ok_bb);
2099  const auto null_arr_sentinel = LL_BUILDER.CreateIntToPtr(
2100  executor_->cgen_state_->llInt(static_cast<int8_t>(0)), i8p_ty);
2101  result_phi->addIncoming(null_arr_sentinel, nullcheck_fail_bb);
2102  LL_BUILDER.SetInsertPoint(nullcheck_ok_bb);
2103  executor_->cgen_state_->emitExternalCall(
2104  "register_buffer_with_executor_rsm",
2105  llvm::Type::getVoidTy(executor_->cgen_state_->context_),
2106  {executor_->cgen_state_->llInt(reinterpret_cast<int64_t>(executor_)), ptr});
2107  LL_BUILDER.CreateBr(ret_bb);
2108  LL_BUILDER.SetInsertPoint(nullcheck_fail_bb);
2109  LL_BUILDER.CreateBr(ret_bb);
2110 
2111  LL_BUILDER.SetInsertPoint(ret_bb);
2112  return {result_phi, size};
2113  }
2114  CHECK_EQ(size_t(2), target_lvs.size());
2115  return {target_lvs[0], target_lvs[1]};
2116  }
2117  }
2118  if (target_ti.is_geometry() &&
2119  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
2120  auto generate_coord_lvs =
2121  [&](auto* selected_target_expr,
2122  bool const fetch_columns) -> std::vector<llvm::Value*> {
2123  const auto target_lvs =
2124  code_generator.codegen(selected_target_expr, fetch_columns, co);
2125  if (dynamic_cast<const Analyzer::GeoOperator*>(target_expr) &&
2126  target_expr->get_type_info().is_geometry()) {
2127  // return a pointer to the temporary alloca
2128  return target_lvs;
2129  }
2130  const auto geo_uoper = dynamic_cast<const Analyzer::GeoUOper*>(target_expr);
2131  const auto geo_binoper = dynamic_cast<const Analyzer::GeoBinOper*>(target_expr);
2132  if (geo_uoper || geo_binoper) {
2133  CHECK(target_expr->get_type_info().is_geometry());
2134  CHECK_EQ(2 * static_cast<size_t>(target_ti.get_physical_coord_cols()),
2135  target_lvs.size());
2136  return target_lvs;
2137  }
2138  CHECK_EQ(static_cast<size_t>(target_ti.get_physical_coord_cols()),
2139  target_lvs.size());
2140 
2141  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
2142  const auto i8p_ty =
2143  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
2144  std::vector<llvm::Value*> coords;
2145  size_t ctr = 0;
2146  for (const auto& target_lv : target_lvs) {
2147  // TODO(adb): consider adding a utility to sqltypes so we can get the types of
2148  // the physical coords cols based on the sqltype (e.g. TINYINT for col 0, INT
2149  // for col 1 for pols / mpolys, etc). Hardcoding for now. first array is the
2150  // coords array (TINYINT). Subsequent arrays are regular INT.
2151 
2152  const size_t elem_sz = ctr == 0 ? 1 : 4;
2153  ctr++;
2154  int32_t fixlen = -1;
2155  if (target_ti.get_type() == kPOINT) {
2156  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
2157  if (col_var) {
2158  const auto coords_cd = executor_->getPhysicalColumnDescriptor(col_var, 1);
2159  if (coords_cd && coords_cd->columnType.get_type() == kARRAY) {
2160  fixlen = coords_cd->columnType.get_size();
2161  }
2162  }
2163  }
2164  if (fixlen > 0) {
2165  coords.push_back(executor_->cgen_state_->emitExternalCall(
2166  "fast_fixlen_array_buff",
2167  i8p_ty,
2168  {target_lv, code_generator.posArg(selected_target_expr)}));
2169  auto fixed_len_lv = executor_->cgen_state_->emitExternalCall(
2170  "determine_fixed_array_len",
2171  llvm::IntegerType::get(code_generator.cgen_state_->context_, 64),
2172  {target_lv, executor_->cgen_state_->llInt(int64_t(fixlen))});
2173  coords.push_back(fixed_len_lv);
2174  continue;
2175  }
2176  coords.push_back(executor_->cgen_state_->emitExternalCall(
2177  "array_buff",
2178  i8p_ty,
2179  {target_lv, code_generator.posArg(selected_target_expr)}));
2180  coords.push_back(executor_->cgen_state_->emitExternalCall(
2181  "array_size",
2182  i32_ty,
2183  {target_lv,
2184  code_generator.posArg(selected_target_expr),
2185  executor_->cgen_state_->llInt(log2_bytes(elem_sz))}));
2186  }
2187  return coords;
2188  };
2189 
2190  if (agg_expr) {
2191  return generate_coord_lvs(agg_expr->get_arg(), true);
2192  } else {
2193  return generate_coord_lvs(target_expr,
2194  !executor_->plan_state_->allow_lazy_fetch_);
2195  }
2196  }
2197  }
2198  bool fetch_column = !executor_->plan_state_->allow_lazy_fetch_;
2199  return agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
2200  : code_generator.codegen(target_expr, fetch_column, co);
2201 }
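
For array and none-encoded string targets the argument is lowered into a (buffer pointer, element count) pair via the `array_buff` and `array_size` runtime helpers, with the element width passed as its base-2 logarithm. An illustrative reimplementation of the `log2_bytes` helper used above (a sketch, not the project's header):

    #include <cstdint>
    #include <stdexcept>

    // Width of an array element in bytes -> log2 of that width, as expected by
    // the array_size runtime helper (1 -> 0, 2 -> 1, 4 -> 2, 8 -> 3).
    inline uint32_t log2_bytes_sketch(const uint32_t bytes) {
      switch (bytes) {
        case 1: return 0;
        case 2: return 1;
        case 4: return 2;
        case 8: return 3;
        default: throw std::runtime_error("unexpected element width");
      }
    }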

bool GroupByAndAggregate::codegenAggCalls ( const std::tuple< llvm::Value *, llvm::Value * > &  agg_out_ptr_w_idx,
llvm::Value *  varlen_output_buffer,
const std::vector< llvm::Value * > &  agg_out_vec,
QueryMemoryDescriptor &  query_mem_desc,
const CompilationOptions &  co,
const GpuSharedMemoryContext &  gpu_smem_context,
DiamondCodegen &  diamond_codegen 
)
private

Definition at line 1656 of file GroupByAndAggregate.cpp.

References AUTOMATIC_IR_METADATA, CHECK, TargetExprCodegenBuilder::codegen(), QueryMemoryDescriptor::didOutputColumnar(), executor_, g_cluster, QueryMemoryDescriptor::getQueryDescriptionType(), LL_BUILDER, LL_CONTEXT, Projection, ra_exe_unit_, and RelAlgExecutionUnit::target_exprs.

Referenced by codegen().

1663  {
1664  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1665  auto agg_out_ptr_w_idx = agg_out_ptr_w_idx_in;
1666  // TODO(alex): unify the two cases, the output for non-group by queries
1667  // should be a contiguous buffer
1668  const bool is_group_by = std::get<0>(agg_out_ptr_w_idx);
1669  bool can_return_error = false;
1670  if (is_group_by) {
1671  CHECK(agg_out_vec.empty());
1672  } else {
1673  CHECK(!agg_out_vec.empty());
1674  }
1675 
1676  // output buffer is casted into a byte stream to be able to handle data elements of
1677  // different sizes (only used when actual column width sizes are used)
1678  llvm::Value* output_buffer_byte_stream{nullptr};
1679  llvm::Value* out_row_idx{nullptr};
1680  if (query_mem_desc.didOutputColumnar() && !g_cluster &&
1681  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1682  output_buffer_byte_stream = LL_BUILDER.CreateBitCast(
1683  std::get<0>(agg_out_ptr_w_idx),
1684  llvm::PointerType::get(llvm::Type::getInt8Ty(LL_CONTEXT), 0));
1685  output_buffer_byte_stream->setName("out_buff_b_stream");
1686  CHECK(std::get<1>(agg_out_ptr_w_idx));
1687  out_row_idx = LL_BUILDER.CreateZExt(std::get<1>(agg_out_ptr_w_idx),
1688  llvm::Type::getInt64Ty(LL_CONTEXT));
1689  out_row_idx->setName("out_row_idx");
1690  }
1691 
1692  TargetExprCodegenBuilder target_builder(ra_exe_unit_, is_group_by);
1693  for (size_t target_idx = 0; target_idx < ra_exe_unit_.target_exprs.size();
1694  ++target_idx) {
1695  auto target_expr = ra_exe_unit_.target_exprs[target_idx];
1696  CHECK(target_expr);
1697 
1698  target_builder(target_expr, executor_, query_mem_desc, co);
1699  }
1700 
1701  target_builder.codegen(this,
1702  executor_,
1703  query_mem_desc,
1704  co,
1705  gpu_smem_context,
1706  agg_out_ptr_w_idx,
1707  agg_out_vec,
1708  output_buffer_byte_stream,
1709  out_row_idx,
1710  varlen_output_buffer,
1711  diamond_codegen);
1712 
1713  return can_return_error;
1714 }

llvm::Value * GroupByAndAggregate::codegenAggColumnPtr ( llvm::Value *  output_buffer_byte_stream,
llvm::Value *  out_row_idx,
const std::tuple< llvm::Value *, llvm::Value * > &  agg_out_ptr_w_idx,
const QueryMemoryDescriptor &  query_mem_desc,
const size_t  chosen_bytes,
const size_t  agg_out_off,
const size_t  target_idx 
)
private

Returns the pointer to where the aggregation should be stored.

Definition at line 1719 of file GroupByAndAggregate.cpp.

References AUTOMATIC_IR_METADATA, shared::bit_cast(), CHECK, CHECK_EQ, QueryMemoryDescriptor::didOutputColumnar(), executor_, g_cluster, get_int_type(), QueryMemoryDescriptor::getColOffInBytes(), QueryMemoryDescriptor::getColOnlyOffInBytes(), QueryMemoryDescriptor::getQueryDescriptionType(), LL_BUILDER, LL_CONTEXT, LL_INT, Projection, and to_string().

Referenced by TargetExprCodegen::codegenAggregate(), and TargetExprCodegenBuilder::codegenMultiSlotSampleExpressions().

1726  {
1727  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1728  llvm::Value* agg_col_ptr{nullptr};
1729  if (query_mem_desc.didOutputColumnar()) {
1730  // TODO(Saman): remove the second columnar branch, and support all query description
1731  // types through the first branch. Then, input arguments should also be cleaned up
1732  if (!g_cluster &&
1733  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1734  CHECK(chosen_bytes == 1 || chosen_bytes == 2 || chosen_bytes == 4 ||
1735  chosen_bytes == 8);
1736  CHECK(output_buffer_byte_stream);
1737  CHECK(out_row_idx);
1738  size_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1739  // multiplying by chosen_bytes, i.e., << log2(chosen_bytes)
1740  auto out_per_col_byte_idx =
1741 #ifdef _WIN32
1742  LL_BUILDER.CreateShl(out_row_idx, __lzcnt(chosen_bytes) - 1);
1743 #else
1744  LL_BUILDER.CreateShl(out_row_idx, __builtin_ffs(chosen_bytes) - 1);
1745 #endif
1746  auto byte_offset = LL_BUILDER.CreateAdd(out_per_col_byte_idx,
1747  LL_INT(static_cast<int64_t>(col_off)));
1748  byte_offset->setName("out_byte_off_target_" + std::to_string(target_idx));
1749  auto output_ptr = LL_BUILDER.CreateGEP(
1750  output_buffer_byte_stream->getType()->getScalarType()->getPointerElementType(),
1751  output_buffer_byte_stream,
1752  byte_offset);
1753  agg_col_ptr = LL_BUILDER.CreateBitCast(
1754  output_ptr,
1755  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1756  agg_col_ptr->setName("out_ptr_target_" + std::to_string(target_idx));
1757  } else {
1758  auto const col_off_in_bytes = query_mem_desc.getColOffInBytes(agg_out_off);
1759  auto const col_off = col_off_in_bytes / chosen_bytes;
1760  auto const col_rem = col_off_in_bytes % chosen_bytes;
1761  CHECK_EQ(col_rem, 0u) << col_off_in_bytes << " % " << chosen_bytes;
1762  CHECK(std::get<1>(agg_out_ptr_w_idx));
1763  auto* agg_out_idx = LL_BUILDER.CreateZExt(
1764  std::get<1>(agg_out_ptr_w_idx),
1765  get_int_type(8 * sizeof(col_off), executor_->cgen_state_->context_));
1766  auto* offset = LL_BUILDER.CreateAdd(agg_out_idx, LL_INT(col_off));
1767  auto* bit_cast = LL_BUILDER.CreateBitCast(
1768  std::get<0>(agg_out_ptr_w_idx),
1769  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1770  agg_col_ptr = LL_BUILDER.CreateGEP(
1771  bit_cast->getType()->getScalarType()->getPointerElementType(),
1772  bit_cast,
1773  offset);
1774  }
1775  } else {
1776  auto const col_off_in_bytes = query_mem_desc.getColOnlyOffInBytes(agg_out_off);
1777  auto const col_off = col_off_in_bytes / chosen_bytes;
1778  auto const col_rem = col_off_in_bytes % chosen_bytes;
1779  CHECK_EQ(col_rem, 0u) << col_off_in_bytes << " % " << chosen_bytes;
1780  auto* bit_cast = LL_BUILDER.CreateBitCast(
1781  std::get<0>(agg_out_ptr_w_idx),
1782  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1783  agg_col_ptr = LL_BUILDER.CreateGEP(
1784  bit_cast->getType()->getScalarType()->getPointerElementType(),
1785  bit_cast,
1786  LL_INT(col_off));
1787  }
1788  CHECK(agg_col_ptr);
1789  return agg_col_ptr;
1790 }
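
In the columnar projection case the slot address is `base + col_off_in_bytes + out_row_idx * chosen_bytes`, with the multiplication emitted as a left shift by log2(chosen_bytes), which is what `__builtin_ffs(chosen_bytes) - 1` (or the `__lzcnt` variant on Windows) yields for the power-of-two slot widths. In the row-wise case the group pointer already addresses the matching row, so only the within-row column offset is applied. A plain C++ sketch of the same address arithmetic (buffer layout and names are illustrative):

    #include <cstddef>
    #include <cstdint>

    // Columnar layout: one contiguous run of chosen_bytes-wide slots per column.
    int8_t* columnar_agg_col_ptr(int8_t* output_buffer_byte_stream,
                                 int64_t out_row_idx,
                                 size_t col_off_in_bytes,
                                 size_t chosen_bytes) {  // 1, 2, 4 or 8
      const int64_t byte_offset =
          (out_row_idx << (__builtin_ffs(static_cast<int>(chosen_bytes)) - 1)) +
          static_cast<int64_t>(col_off_in_bytes);
      return output_buffer_byte_stream + byte_offset;
    }

    // Row-wise layout: col_off_in_bytes is checked to be a multiple of
    // chosen_bytes, so it is expressed in slot units off the row's base pointer.
    int8_t* row_wise_agg_col_ptr(int8_t* row_base,
                                 size_t col_off_in_bytes,
                                 size_t chosen_bytes) {
      const size_t slot_off = col_off_in_bytes / chosen_bytes;
      return row_base + slot_off * chosen_bytes;
    }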

void GroupByAndAggregate::codegenApproxQuantile ( const size_t  target_idx,
const Analyzer::Expr *  target_expr,
std::vector< llvm::Value * > &  agg_args,
const QueryMemoryDescriptor &  query_mem_desc,
const ExecutorDeviceType  device_type 
)
private

Definition at line 1926 of file GroupByAndAggregate.cpp.

References AUTOMATIC_IR_METADATA, executor_, g_bigint_count, SQLTypeInfo::get_notnull(), get_target_info(), Analyzer::Expr::get_type_info(), and GPU.

Referenced by TargetExprCodegen::codegenAggregate().

1931  {
1932  if (device_type == ExecutorDeviceType::GPU) {
1933  throw QueryMustRunOnCpu();
1934  }
1935  llvm::BasicBlock *calc, *skip{nullptr};
1936  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1937  auto const arg_ti =
1938  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1939  bool const nullable = !arg_ti.get_notnull();
1940 
1941  auto* cs = executor_->cgen_state_.get();
1942  auto& irb = cs->ir_builder_;
1943  if (nullable) {
1944  auto* const null_value = cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
1945  auto* const skip_cond = arg_ti.is_fp()
1946  ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
1947  : irb.CreateICmpEQ(agg_args.back(), null_value);
1948  calc = llvm::BasicBlock::Create(cs->context_, "calc_approx_quantile");
1949  skip = llvm::BasicBlock::Create(cs->context_, "skip_approx_quantile");
1950  irb.CreateCondBr(skip_cond, skip, calc);
1951  cs->current_func_->getBasicBlockList().push_back(calc);
1952  irb.SetInsertPoint(calc);
1953  }
1954  if (!arg_ti.is_fp()) {
1955  auto const agg_info = get_target_info(target_expr, g_bigint_count);
1956  agg_args.back() = executor_->castToFP(agg_args.back(), arg_ti, agg_info.sql_type);
1957  }
1958  cs->emitExternalCall(
1959  "agg_approx_quantile", llvm::Type::getVoidTy(cs->context_), agg_args);
1960  if (nullable) {
1961  irb.CreateBr(skip);
1962  cs->current_func_->getBasicBlockList().push_back(skip);
1963  irb.SetInsertPoint(skip);
1964  }
1965 }
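
The generated calc/skip blocks simply bypass the quantile update when the nullable argument equals its null sentinel, and non-floating-point arguments are converted to double before the `agg_approx_quantile` call. Roughly, at runtime (a hedged sketch; `quantile_sketch_add` stands in for the real runtime entry point):

    #include <cstdint>

    void quantile_sketch_add(void* sketch, double value);  // stand-in

    // Behaviour of the emitted code for a nullable integer argument.
    void approx_quantile_update(void* sketch, int64_t arg, int64_t null_sentinel) {
      if (arg == null_sentinel) {
        return;  // skip_approx_quantile: nulls do not contribute
      }
      // calc_approx_quantile: integer/decimal arguments are cast to fp first
      quantile_sketch_add(sketch, static_cast<double>(arg));
    }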

void GroupByAndAggregate::codegenCountDistinct ( const size_t  target_idx,
const Analyzer::Expr *  target_expr,
std::vector< llvm::Value * > &  agg_args,
const QueryMemoryDescriptor &  query_mem_desc,
const ExecutorDeviceType  device_type 
)
private

Definition at line 1856 of file GroupByAndAggregate.cpp.

References AUTOMATIC_IR_METADATA, Bitmap, CHECK, CHECK_EQ, emitCall(), executor_, g_bigint_count, get_int_type(), get_target_info(), Analyzer::Expr::get_type_info(), getAdditionalLiteral(), QueryMemoryDescriptor::getCountDistinctDescriptor(), GPU, Invalid, kAPPROX_COUNT_DISTINCT, LL_CONTEXT, and LL_INT.

Referenced by TargetExprCodegen::codegenAggregate().

1861  {
1862  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1863  const auto agg_info = get_target_info(target_expr, g_bigint_count);
1864  const auto& arg_ti =
1865  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1866  if (arg_ti.is_fp()) {
1867  agg_args.back() = executor_->cgen_state_->ir_builder_.CreateBitCast(
1868  agg_args.back(), get_int_type(64, executor_->cgen_state_->context_));
1869  }
1870  const auto& count_distinct_descriptor =
1871  query_mem_desc.getCountDistinctDescriptor(target_idx);
1872  CHECK(count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid);
1873  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
1874  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1875  agg_args.push_back(LL_INT(int32_t(count_distinct_descriptor.bitmap_sz_bits)));
1876  if (device_type == ExecutorDeviceType::GPU) {
1877  const auto base_dev_addr = getAdditionalLiteral(-1);
1878  const auto base_host_addr = getAdditionalLiteral(-2);
1879  agg_args.push_back(base_dev_addr);
1880  agg_args.push_back(base_host_addr);
1881  emitCall("agg_approximate_count_distinct_gpu", agg_args);
1882  } else {
1883  emitCall("agg_approximate_count_distinct", agg_args);
1884  }
1885  return;
1886  }
1887  std::string agg_fname{"agg_count_distinct"};
1888  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1889  agg_fname += "_bitmap";
1890  agg_args.push_back(LL_INT(count_distinct_descriptor.min_val));
1891  agg_args.push_back(LL_INT(count_distinct_descriptor.bucket_size));
1892  }
1893  if (agg_info.skip_null_val) {
1894  auto null_lv = executor_->cgen_state_->castToTypeIn(
1895  (arg_ti.is_fp()
1896  ? static_cast<llvm::Value*>(executor_->cgen_state_->inlineFpNull(arg_ti))
1897  : static_cast<llvm::Value*>(executor_->cgen_state_->inlineIntNull(arg_ti))),
1898  64);
1899  null_lv = executor_->cgen_state_->ir_builder_.CreateBitCast(
1900  null_lv, get_int_type(64, executor_->cgen_state_->context_));
1901  agg_fname += "_skip_val";
1902  agg_args.push_back(null_lv);
1903  }
1904  if (device_type == ExecutorDeviceType::GPU) {
1905  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1906  agg_fname += "_gpu";
1907  const auto base_dev_addr = getAdditionalLiteral(-1);
1908  const auto base_host_addr = getAdditionalLiteral(-2);
1909  agg_args.push_back(base_dev_addr);
1910  agg_args.push_back(base_host_addr);
1911  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.sub_bitmap_count)));
1912  CHECK_EQ(size_t(0),
1913  count_distinct_descriptor.bitmapPaddedSizeBytes() %
1914  count_distinct_descriptor.sub_bitmap_count);
1915  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.bitmapPaddedSizeBytes() /
1916  count_distinct_descriptor.sub_bitmap_count)));
1917  }
1918  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1919  emitCall(agg_fname, agg_args);
1920  } else {
1921  executor_->cgen_state_->emitExternalCall(
1922  agg_fname, llvm::Type::getVoidTy(LL_CONTEXT), agg_args);
1923  }
1924 }
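
The runtime function name is assembled from the aggregate's properties: `agg_count_distinct` gains a `_bitmap` suffix for the bitmap implementation (with the min value and bucket size appended to the arguments), `_skip_val` for a nullable argument, and `_gpu` on GPU, where the bitmap implementation is required. A small sketch of that selection (the boolean flags are stand-ins for the checks above):

    #include <string>

    // APPROX_COUNT_DISTINCT is handled separately via
    // agg_approximate_count_distinct[_gpu]; this covers exact COUNT(DISTINCT).
    std::string count_distinct_runtime_function(bool bitmap_impl,
                                                bool skip_null_val,
                                                bool is_gpu) {
      std::string agg_fname{"agg_count_distinct"};
      if (bitmap_impl) {
        agg_fname += "_bitmap";
      }
      if (skip_null_val) {
        agg_fname += "_skip_val";
      }
      if (is_gpu) {
        agg_fname += "_gpu";
      }
      return agg_fname;
    }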

void GroupByAndAggregate::codegenEstimator ( std::stack< llvm::BasicBlock * > &  array_loops,
DiamondCodegen &  diamond_codegen,
const QueryMemoryDescriptor &  query_mem_desc,
const CompilationOptions &  co 
)
private

Definition at line 1792 of file GroupByAndAggregate.cpp.

References AUTOMATIC_IR_METADATA, CHECK, emitCall(), RelAlgExecutionUnit::estimator, executor_, get_int_type(), QueryMemoryDescriptor::getEffectiveKeyWidth(), LL_BUILDER, LL_CONTEXT, LL_INT, ra_exe_unit_, and ROW_FUNC.

Referenced by codegen().

1795  {
1796  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1797  const auto& estimator_arg = ra_exe_unit_.estimator->getArgument();
1798  auto estimator_comp_count_lv = LL_INT(static_cast<int32_t>(estimator_arg.size()));
1799  auto estimator_key_lv = LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT),
1800  estimator_comp_count_lv);
1801  int32_t subkey_idx = 0;
1802  for (const auto& estimator_arg_comp : estimator_arg) {
1803  const auto estimator_arg_comp_lvs =
1804  executor_->groupByColumnCodegen(estimator_arg_comp.get(),
1805  query_mem_desc.getEffectiveKeyWidth(),
1806  co,
1807  false,
1808  0,
1809  diamond_codegen,
1810  array_loops,
1811  true);
1812  CHECK(!estimator_arg_comp_lvs.original_value);
1813  const auto estimator_arg_comp_lv = estimator_arg_comp_lvs.translated_value;
1814  // store the sub-key to the buffer
1815  LL_BUILDER.CreateStore(
1816  estimator_arg_comp_lv,
1817  LL_BUILDER.CreateGEP(
1818  estimator_key_lv->getType()->getScalarType()->getPointerElementType(),
1819  estimator_key_lv,
1820  LL_INT(subkey_idx++)));
1821  }
1822  const auto int8_ptr_ty = llvm::PointerType::get(get_int_type(8, LL_CONTEXT), 0);
1823  const auto bitmap = LL_BUILDER.CreateBitCast(&*ROW_FUNC->arg_begin(), int8_ptr_ty);
1824  const auto key_bytes = LL_BUILDER.CreateBitCast(estimator_key_lv, int8_ptr_ty);
1825  const auto estimator_comp_bytes_lv =
1826  LL_INT(static_cast<int32_t>(estimator_arg.size() * sizeof(int64_t)));
1827  const auto bitmap_size_lv =
1828  LL_INT(static_cast<uint32_t>(ra_exe_unit_.estimator->getBufferSize()));
1829  emitCall(ra_exe_unit_.estimator->getRuntimeFunctionName(),
1830  {bitmap, &*bitmap_size_lv, key_bytes, &*estimator_comp_bytes_lv});
1831 }
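
Each component of the estimator argument is materialized into a stack-allocated buffer of 64-bit sub-keys, which is then handed to the estimator's runtime function as raw bytes alongside the estimator bitmap. A rough C++ picture of the call the IR sets up (the runtime entry point comes from `getRuntimeFunctionName()` and is left abstract here):

    #include <cstdint>
    #include <vector>

    // Stand-in for the estimator runtime function (e.g. an NDV sketch update).
    void estimator_runtime_fn(int8_t* bitmap,
                              uint32_t bitmap_size_bytes,
                              const int8_t* key_bytes,
                              int32_t key_byte_count);

    void update_estimator(int8_t* bitmap,
                          uint32_t bitmap_size_bytes,
                          const std::vector<int64_t>& estimator_key) {
      // The generated code stores each sub-key into an alloca'd i64 array ...
      const auto* key_bytes = reinterpret_cast<const int8_t*>(estimator_key.data());
      const auto key_byte_count =
          static_cast<int32_t>(estimator_key.size() * sizeof(int64_t));
      // ... and passes it, byte-addressed, together with the bitmap.
      estimator_runtime_fn(bitmap, bitmap_size_bytes, key_bytes, key_byte_count);
    }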

std::tuple< llvm::Value *, llvm::Value * > GroupByAndAggregate::codegenGroupBy ( const QueryMemoryDescriptor &  query_mem_desc,
const CompilationOptions &  co,
DiamondCodegen &  codegen 
)
private

Definition at line 1248 of file GroupByAndAggregate.cpp.

References AUTOMATIC_IR_METADATA, CHECK, CHECK_EQ, codegenMultiColumnBaselineHash(), codegenMultiColumnPerfectHash(), codegenOutputSlot(), codegenSingleColumnPerfectHash(), QueryMemoryDescriptor::didOutputColumnar(), executor_, anonymous_namespace{GroupByAndAggregate.cpp}::get_expr_range_info(), QueryMemoryDescriptor::getBucket(), QueryMemoryDescriptor::getEffectiveKeyWidth(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getMaxVal(), QueryMemoryDescriptor::getQueryDescriptionType(), QueryMemoryDescriptor::getRowSize(), RelAlgExecutionUnit::groupby_exprs, GroupByBaselineHash, GroupByPerfectHash, QueryMemoryDescriptor::hasNulls(), QueryMemoryDescriptor::isSingleColumnGroupByWithPerfectHash(), LL_BUILDER, LL_CONTEXT, LL_INT, Projection, query_infos_, ra_exe_unit_, ROW_FUNC, and QueryMemoryDescriptor::threadsShareMemory().

Referenced by codegen().

1251  {
1252  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1253  auto arg_it = ROW_FUNC->arg_begin();
1254  auto groups_buffer = arg_it++;
1255 
1256  std::stack<llvm::BasicBlock*> array_loops;
1257 
1258  // TODO(Saman): move this logic outside of this function.
1259  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1260  if (query_mem_desc.didOutputColumnar()) {
1261  return std::make_tuple(
1262  &*groups_buffer,
1263  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen));
1264  } else {
1265  return std::make_tuple(
1266  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen),
1267  nullptr);
1268  }
1269  }
1270 
1271  CHECK(query_mem_desc.getQueryDescriptionType() ==
1272  QueryDescriptionType::GroupByBaselineHash ||
1273  query_mem_desc.getQueryDescriptionType() ==
1274  QueryDescriptionType::GroupByPerfectHash);
1275 
1276  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1277  ? 0
1278  : query_mem_desc.getRowSize() / sizeof(int64_t);
1279 
1280  const auto col_width_size = query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1281  ? sizeof(int64_t)
1282  : query_mem_desc.getEffectiveKeyWidth();
1283  // for multi-column group by
1284  llvm::Value* group_key = nullptr;
1285  llvm::Value* key_size_lv = nullptr;
1286 
1287  if (!query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1288  key_size_lv = LL_INT(static_cast<int32_t>(query_mem_desc.getGroupbyColCount()));
1289  if (query_mem_desc.getQueryDescriptionType() ==
1290  QueryDescriptionType::GroupByPerfectHash) {
1291  group_key =
1292  LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1293  } else if (query_mem_desc.getQueryDescriptionType() ==
1294  QueryDescriptionType::GroupByBaselineHash) {
1295  group_key =
1296  col_width_size == sizeof(int32_t)
1297  ? LL_BUILDER.CreateAlloca(llvm::Type::getInt32Ty(LL_CONTEXT), key_size_lv)
1298  : LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1299  }
1300  CHECK(group_key);
1301  CHECK(key_size_lv);
1302  }
1303 
1304  int32_t subkey_idx = 0;
1305  CHECK(query_mem_desc.getGroupbyColCount() == ra_exe_unit_.groupby_exprs.size());
1306  for (const auto& group_expr : ra_exe_unit_.groupby_exprs) {
1307  const auto col_range_info =
1308  get_expr_range_info(ra_exe_unit_, query_infos_, group_expr.get(), executor_);
1309  const auto translated_null_value = static_cast<int64_t>(
1310  query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1311  ? checked_int64_t(query_mem_desc.getMaxVal()) +
1312  (query_mem_desc.getBucket() ? query_mem_desc.getBucket() : 1)
1313  : checked_int64_t(col_range_info.max) +
1314  (col_range_info.bucket ? col_range_info.bucket : 1));
1315 
1316  const bool col_has_nulls =
1317  query_mem_desc.getQueryDescriptionType() ==
1318  QueryDescriptionType::GroupByPerfectHash
1319  ? (query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1320  ? query_mem_desc.hasNulls()
1321  : col_range_info.has_nulls)
1322  : false;
1323 
1324  const auto group_expr_lvs =
1325  executor_->groupByColumnCodegen(group_expr.get(),
1326  col_width_size,
1327  co,
1328  col_has_nulls,
1329  translated_null_value,
1330  diamond_codegen,
1331  array_loops,
1332  query_mem_desc.threadsShareMemory());
1333  const auto group_expr_lv = group_expr_lvs.translated_value;
1334  if (query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1335  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1336  return codegenSingleColumnPerfectHash(query_mem_desc,
1337  co,
1338  &*groups_buffer,
1339  group_expr_lv,
1340  group_expr_lvs.original_value,
1341  row_size_quad);
1342  } else {
1343  // store the sub-key to the buffer
1344  LL_BUILDER.CreateStore(
1345  group_expr_lv,
1346  LL_BUILDER.CreateGEP(
1347  group_key->getType()->getScalarType()->getPointerElementType(),
1348  group_key,
1349  LL_INT(subkey_idx++)));
1350  }
1351  }
1352  if (query_mem_desc.getQueryDescriptionType() ==
1353  QueryDescriptionType::GroupByPerfectHash) {
1354  CHECK(ra_exe_unit_.groupby_exprs.size() != 1);
1355  return codegenMultiColumnPerfectHash(
1356  &*groups_buffer, group_key, key_size_lv, query_mem_desc, row_size_quad);
1357  } else if (query_mem_desc.getQueryDescriptionType() ==
1358  QueryDescriptionType::GroupByBaselineHash) {
1359  return codegenMultiColumnBaselineHash(co,
1360  &*groups_buffer,
1361  group_key,
1362  key_size_lv,
1363  query_mem_desc,
1364  col_width_size,
1365  row_size_quad);
1366  }
1367  CHECK(false);
1368  return std::make_tuple(nullptr, nullptr);
1369 }
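
NULL group-by keys are translated to a value one bucket past the top of the column's range, `max + (bucket ? bucket : 1)`, so they occupy a dedicated slot instead of colliding with a real key. A tiny worked example of that computation (the numbers are made up):

    #include <cstdint>

    // translated_null_value as computed above: one bucket beyond the maximum.
    constexpr int64_t translated_null_value(int64_t max_val, int64_t bucket) {
      return max_val + (bucket ? bucket : 1);
    }

    static_assert(translated_null_value(/*max_val=*/100, /*bucket=*/0) == 101, "");
    static_assert(translated_null_value(/*max_val=*/100, /*bucket=*/10) == 110, "");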

void GroupByAndAggregate::codegenMode ( const size_t  target_idx,
const Analyzer::Expr *  target_expr,
std::vector< llvm::Value * > &  agg_args,
const QueryMemoryDescriptor &  query_mem_desc,
const ExecutorDeviceType  device_type 
)
private

Definition at line 1967 of file GroupByAndAggregate.cpp.

References AUTOMATIC_IR_METADATA, executor_, get_int_type(), SQLTypeInfo::get_notnull(), Analyzer::Expr::get_type_info(), and GPU.

Referenced by TargetExprCodegen::codegenAggregate().

1971  {
1972  if (device_type == ExecutorDeviceType::GPU) {
1973  throw QueryMustRunOnCpu();
1974  }
1975  llvm::BasicBlock *calc, *skip{nullptr};
1976  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1977  auto const arg_ti =
1978  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1979  bool const nullable = !arg_ti.get_notnull();
1980  bool const is_fp = arg_ti.is_fp();
1981  auto* cs = executor_->cgen_state_.get();
1982  auto& irb = cs->ir_builder_;
1983  if (nullable) {
1984  auto* const null_value =
1985  is_fp ? cs->inlineNull(arg_ti) : cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
1986  auto* const skip_cond = is_fp ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
1987  : irb.CreateICmpEQ(agg_args.back(), null_value);
1988  calc = llvm::BasicBlock::Create(cs->context_, "calc_mode");
1989  skip = llvm::BasicBlock::Create(cs->context_, "skip_mode");
1990  irb.CreateCondBr(skip_cond, skip, calc);
1991  cs->current_func_->getBasicBlockList().push_back(calc);
1992  irb.SetInsertPoint(calc);
1993  }
1994  if (is_fp) {
1995  auto* const int_type = get_int_type(8 * arg_ti.get_size(), cs->context_);
1996  agg_args.back() = irb.CreateBitCast(agg_args.back(), int_type);
1997  }
1998  // "agg_mode" collides with existing names, so non-standard suffix "_func" is added.
1999  cs->emitExternalCall("agg_mode_func", llvm::Type::getVoidTy(cs->context_), agg_args);
2000  if (nullable) {
2001  irb.CreateBr(skip);
2002  cs->current_func_->getBasicBlockList().push_back(skip);
2003  irb.SetInsertPoint(skip);
2004  }
2005 }
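
Floating-point arguments are reinterpreted as same-width integers before the `agg_mode_func` call, so the mode hash table can key on the raw bit pattern. A C++ equivalent of that bit cast (a sketch; C++20's std::bit_cast would do the same):

    #include <cstdint>
    #include <cstring>

    // Matches the CreateBitCast emitted above for a double argument.
    inline int64_t fp_bits_for_mode(double value) {
      static_assert(sizeof(int64_t) == sizeof(double), "widths must match");
      int64_t bits;
      std::memcpy(&bits, &value, sizeof(bits));
      return bits;
    }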

std::tuple< llvm::Value *, llvm::Value * > GroupByAndAggregate::codegenMultiColumnBaselineHash ( const CompilationOptions &  co,
llvm::Value *  groups_buffer,
llvm::Value *  group_key,
llvm::Value *  key_size_lv,
const QueryMemoryDescriptor &  query_mem_desc,
const size_t  key_width,
const int32_t  row_size_quad 
)
private

Definition at line 1480 of file GroupByAndAggregate.cpp.

References AUTOMATIC_IR_METADATA, CHECK, QueryMemoryDescriptor::didOutputColumnar(), emitCall(), executor_, QueryMemoryDescriptor::getEntryCount(), LL_BUILDER, LL_CONTEXT, LL_INT, and CompilationOptions::with_dynamic_watchdog.

Referenced by codegenGroupBy().

1487  {
1488  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1489  if (group_key->getType() != llvm::Type::getInt64PtrTy(LL_CONTEXT)) {
1490  CHECK(key_width == sizeof(int32_t));
1491  group_key =
1492  LL_BUILDER.CreatePointerCast(group_key, llvm::Type::getInt64PtrTy(LL_CONTEXT));
1493  }
1494  std::vector<llvm::Value*> func_args{
1495  groups_buffer,
1496  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1497  &*group_key,
1498  &*key_size_lv,
1499  LL_INT(static_cast<int32_t>(key_width))};
1500  std::string func_name{"get_group_value"};
1501  if (query_mem_desc.didOutputColumnar()) {
1502  func_name += "_columnar_slot";
1503  } else {
1504  func_args.push_back(LL_INT(row_size_quad));
1505  }
1506  if (co.with_dynamic_watchdog) {
1507  func_name += "_with_watchdog";
1508  }
1509  if (query_mem_desc.didOutputColumnar()) {
1510  return std::make_tuple(groups_buffer, emitCall(func_name, func_args));
1511  } else {
1512  return std::make_tuple(emitCall(func_name, func_args), nullptr);
1513  }
1514 }

std::tuple< llvm::Value *, llvm::Value * > GroupByAndAggregate::codegenMultiColumnPerfectHash ( llvm::Value *  groups_buffer,
llvm::Value *  group_key,
llvm::Value *  key_size_lv,
const QueryMemoryDescriptor &  query_mem_desc,
const int32_t  row_size_quad 
)
private

Definition at line 1436 of file GroupByAndAggregate.cpp.

References AUTOMATIC_IR_METADATA, CHECK, codegenPerfectHashFunction(), QueryMemoryDescriptor::didOutputColumnar(), emitCall(), executor_, get_int_type(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getQueryDescriptionType(), GroupByPerfectHash, QueryMemoryDescriptor::hasKeylessHash(), LL_BUILDER, LL_CONTEXT, and LL_INT.

Referenced by codegenGroupBy().

1441  {
1442  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1443  CHECK(query_mem_desc.getQueryDescriptionType() ==
1444  QueryDescriptionType::GroupByPerfectHash);
1445  // compute the index (perfect hash)
1446  auto perfect_hash_func = codegenPerfectHashFunction();
1447  auto hash_lv =
1448  LL_BUILDER.CreateCall(perfect_hash_func, std::vector<llvm::Value*>{group_key});
1449 
1450  if (query_mem_desc.didOutputColumnar()) {
1451  if (!query_mem_desc.hasKeylessHash()) {
1452  const std::string set_matching_func_name{
1453  "set_matching_group_value_perfect_hash_columnar"};
1454  const std::vector<llvm::Value*> set_matching_func_arg{
1455  groups_buffer,
1456  hash_lv,
1457  group_key,
1458  key_size_lv,
1459  llvm::ConstantInt::get(get_int_type(32, LL_CONTEXT),
1460  query_mem_desc.getEntryCount())};
1461  emitCall(set_matching_func_name, set_matching_func_arg);
1462  }
1463  return std::make_tuple(groups_buffer, hash_lv);
1464  } else {
1465  if (query_mem_desc.hasKeylessHash()) {
1466  return std::make_tuple(emitCall("get_matching_group_value_perfect_hash_keyless",
1467  {groups_buffer, hash_lv, LL_INT(row_size_quad)}),
1468  nullptr);
1469  } else {
1470  return std::make_tuple(
1471  emitCall(
1472  "get_matching_group_value_perfect_hash",
1473  {groups_buffer, hash_lv, group_key, key_size_lv, LL_INT(row_size_quad)}),
1474  nullptr);
1475  }
1476  }
1477 }

llvm::Value * GroupByAndAggregate::codegenOutputSlot ( llvm::Value *  groups_buffer,
const QueryMemoryDescriptor &  query_mem_desc,
const CompilationOptions &  co,
DiamondCodegen &  diamond_codegen 
)
private

Definition at line 1160 of file GroupByAndAggregate.cpp.

References run_benchmark_import::args, AUTOMATIC_IR_METADATA, CHECK, CHECK_EQ, CHECK_GE, CHECK_LT, CodeGenerator::codegen(), QueryMemoryDescriptor::didOutputColumnar(), emitCall(), executor_, get_arg_by_name(), get_heap_key_slot_index(), QueryMemoryDescriptor::getColOffInBytes(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getQueryDescriptionType(), QueryMemoryDescriptor::getRowSize(), RelAlgExecutionUnit::groupby_exprs, inline_fp_null_val(), inline_int_null_val(), SortInfo::limit, LL_BOOL, LL_BUILDER, LL_FP, LL_INT, anonymous_namespace{Utm.h}::n, SortInfo::offset, SortInfo::order_entries, CodeGenerator::posArg(), Projection, ra_exe_unit_, ROW_FUNC, RelAlgExecutionUnit::sort_info, RelAlgExecutionUnit::target_exprs, to_string(), and QueryMemoryDescriptor::useStreamingTopN().

Referenced by codegenGroupBy(), and codegenWindowRowPointer().

1164  {
1165  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1166  CHECK(query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection);
1167  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1168  const auto group_expr = ra_exe_unit_.groupby_exprs.front();
1169  CHECK(!group_expr);
1170  if (!query_mem_desc.didOutputColumnar()) {
1171  CHECK_EQ(size_t(0), query_mem_desc.getRowSize() % sizeof(int64_t));
1172  }
1173  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1174  ? 0
1175  : query_mem_desc.getRowSize() / sizeof(int64_t);
1176  CodeGenerator code_generator(executor_);
1177  if (query_mem_desc.useStreamingTopN()) {
1178  const auto& only_order_entry = ra_exe_unit_.sort_info.order_entries.front();
1179  CHECK_GE(only_order_entry.tle_no, int(1));
1180  const size_t target_idx = only_order_entry.tle_no - 1;
1181  CHECK_LT(target_idx, ra_exe_unit_.target_exprs.size());
1182  const auto order_entry_expr = ra_exe_unit_.target_exprs[target_idx];
1183  const auto chosen_bytes =
1184  static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(target_idx));
1185  auto order_entry_lv = executor_->cgen_state_->castToTypeIn(
1186  code_generator.codegen(order_entry_expr, true, co).front(), chosen_bytes * 8);
1187  const uint32_t n =
1189  std::string fname = "get_bin_from_k_heap";
1190  const auto& oe_ti = order_entry_expr->get_type_info();
1191  llvm::Value* null_key_lv = nullptr;
1192  if (oe_ti.is_integer() || oe_ti.is_decimal() || oe_ti.is_time()) {
1193  const size_t bit_width = order_entry_lv->getType()->getIntegerBitWidth();
1194  switch (bit_width) {
1195  case 32:
1196  null_key_lv = LL_INT(static_cast<int32_t>(inline_int_null_val(oe_ti)));
1197  break;
1198  case 64:
1199  null_key_lv = LL_INT(static_cast<int64_t>(inline_int_null_val(oe_ti)));
1200  break;
1201  default:
1202  CHECK(false);
1203  }
1204  fname += "_int" + std::to_string(bit_width) + "_t";
1205  } else {
1206  CHECK(oe_ti.is_fp());
1207  if (order_entry_lv->getType()->isDoubleTy()) {
1208  null_key_lv = LL_FP(static_cast<double>(inline_fp_null_val(oe_ti)));
1209  } else {
1210  null_key_lv = LL_FP(static_cast<float>(inline_fp_null_val(oe_ti)));
1211  }
1212  fname += order_entry_lv->getType()->isDoubleTy() ? "_double" : "_float";
1213  }
1214  const auto key_slot_idx =
1215  get_heap_key_slot_index(ra_exe_unit_.target_exprs, target_idx);
1216  return emitCall(
1217  fname,
1218  {groups_buffer,
1219  LL_INT(n),
1220  LL_INT(row_size_quad),
1221  LL_INT(static_cast<uint32_t>(query_mem_desc.getColOffInBytes(key_slot_idx))),
1222  LL_BOOL(only_order_entry.is_desc),
1223  LL_BOOL(!order_entry_expr->get_type_info().get_notnull()),
1224  LL_BOOL(only_order_entry.nulls_first),
1225  null_key_lv,
1226  order_entry_lv});
1227  } else {
1228  auto* arg = get_arg_by_name(ROW_FUNC, "max_matched");
1229  const auto output_buffer_entry_count_lv =
1230  LL_BUILDER.CreateLoad(arg->getType()->getPointerElementType(), arg);
1231  arg = get_arg_by_name(ROW_FUNC, "old_total_matched");
1232  const auto group_expr_lv =
1233  LL_BUILDER.CreateLoad(arg->getType()->getPointerElementType(), arg);
1234  std::vector<llvm::Value*> args{groups_buffer,
1235  output_buffer_entry_count_lv,
1236  group_expr_lv,
1237  code_generator.posArg(nullptr)};
1238  if (query_mem_desc.didOutputColumnar()) {
1239  const auto columnar_output_offset =
1240  emitCall("get_columnar_scan_output_offset", args);
1241  return columnar_output_offset;
1242  }
1243  args.push_back(LL_INT(row_size_quad));
1244  return emitCall("get_scan_output_slot", args);
1245  }
1246 }
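For the non-top-N projection path, the generated code claims the next free output slot from the shared match counter and addresses it either columnar or row-wise. A rough host-side model of that slot assignment (illustrative only, not the get_scan_output_slot runtime function):

#include <cstdint>

// Rough model of projection output-slot assignment: each matched row takes the
// next slot; row-wise entries are row_size_quad 64-bit words apart.
int64_t* claim_output_slot(int64_t* output_buffer,
                           const uint32_t max_matched,
                           uint32_t& total_matched,
                           const int32_t row_size_quad) {
  const uint32_t slot = total_matched++;
  if (slot >= max_matched) {
    return nullptr;  // output buffer is full
  }
  return output_buffer + static_cast<int64_t>(slot) * row_size_quad;
}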

llvm::Function * GroupByAndAggregate::codegenPerfectHashFunction ( )
private

Definition at line 1516 of file GroupByAndAggregate.cpp.

References AUTOMATIC_IR_METADATA, CHECK, CHECK_GT, executor_, anonymous_namespace{GroupByAndAggregate.cpp}::get_expr_range_info(), get_int_type(), getBucketedCardinality(), RelAlgExecutionUnit::groupby_exprs, GroupByPerfectHash, LL_CONTEXT, LL_INT, mark_function_always_inline(), query_infos_, and ra_exe_unit_.

Referenced by codegenMultiColumnPerfectHash().

1516  {
1517  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1518  CHECK_GT(ra_exe_unit_.groupby_exprs.size(), size_t(1));
1519  auto ft = llvm::FunctionType::get(
1520  get_int_type(32, LL_CONTEXT),
1521  std::vector<llvm::Type*>{llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)},
1522  false);
1523  auto key_hash_func = llvm::Function::Create(ft,
1524  llvm::Function::ExternalLinkage,
1525  "perfect_key_hash",
1526  executor_->cgen_state_->module_);
1527  executor_->cgen_state_->helper_functions_.push_back(key_hash_func);
1528  mark_function_always_inline(key_hash_func);
1529  auto& key_buff_arg = *key_hash_func->args().begin();
1530  llvm::Value* key_buff_lv = &key_buff_arg;
1531  auto bb = llvm::BasicBlock::Create(LL_CONTEXT, "entry", key_hash_func);
1532  llvm::IRBuilder<> key_hash_func_builder(bb);
1533  llvm::Value* hash_lv{llvm::ConstantInt::get(get_int_type(64, LL_CONTEXT), 0)};
1534  std::vector<int64_t> cardinalities;
1535  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1536  auto col_range_info =
1537  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1538  CHECK(col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash);
1539  cardinalities.push_back(getBucketedCardinality(col_range_info));
1540  }
1541  size_t dim_idx = 0;
1542  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1543  auto* gep = key_hash_func_builder.CreateGEP(
1544  key_buff_lv->getType()->getScalarType()->getPointerElementType(),
1545  key_buff_lv,
1546  LL_INT(dim_idx));
1547  auto key_comp_lv =
1548  key_hash_func_builder.CreateLoad(gep->getType()->getPointerElementType(), gep);
1549  auto col_range_info =
1550  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1551  auto crt_term_lv =
1552  key_hash_func_builder.CreateSub(key_comp_lv, LL_INT(col_range_info.min));
1553  if (col_range_info.bucket) {
1554  crt_term_lv =
1555  key_hash_func_builder.CreateSDiv(crt_term_lv, LL_INT(col_range_info.bucket));
1556  }
1557  for (size_t prev_dim_idx = 0; prev_dim_idx < dim_idx; ++prev_dim_idx) {
1558  crt_term_lv = key_hash_func_builder.CreateMul(crt_term_lv,
1559  LL_INT(cardinalities[prev_dim_idx]));
1560  }
1561  hash_lv = key_hash_func_builder.CreateAdd(hash_lv, crt_term_lv);
1562  ++dim_idx;
1563  }
1564  key_hash_func_builder.CreateRet(
1565  key_hash_func_builder.CreateTrunc(hash_lv, get_int_type(32, LL_CONTEXT)));
1566  return key_hash_func;
1567 }
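The generated perfect_key_hash is a mixed-radix index over the group-by key: each component is shifted by its column minimum, divided by its bucket size if bucketed, and scaled by the product of the cardinalities of the preceding dimensions. The same arithmetic, written as ordinary host code purely for illustration:

#include <cstdint>
#include <vector>

struct DimInfo {
  int64_t min;          // column minimum
  int64_t bucket;       // bucket size, 0 if unbucketed
  int64_t cardinality;  // bucketed cardinality of the column
};

// Host-side model of the emitted perfect_key_hash: a mixed-radix index over the keys.
int32_t perfect_key_hash_model(const int64_t* key, const std::vector<DimInfo>& dims) {
  int64_t hash = 0;
  for (size_t dim_idx = 0; dim_idx < dims.size(); ++dim_idx) {
    int64_t term = key[dim_idx] - dims[dim_idx].min;
    if (dims[dim_idx].bucket) {
      term /= dims[dim_idx].bucket;
    }
    for (size_t prev = 0; prev < dim_idx; ++prev) {
      term *= dims[prev].cardinality;
    }
    hash += term;
  }
  return static_cast<int32_t>(hash);
}

For example, with two key columns of cardinality 10 and 100, the key (min0 + 3, min1 + 7) hashes to 3 + 7 * 10 = 73.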

std::tuple< llvm::Value *, llvm::Value * > GroupByAndAggregate::codegenSingleColumnPerfectHash ( const QueryMemoryDescriptor &  query_mem_desc,
const CompilationOptions &  co,
llvm::Value *  groups_buffer,
llvm::Value *  group_expr_lv_translated,
llvm::Value *  group_expr_lv_original,
const int32_t  row_size_quad 
)
private

Definition at line 1386 of file GroupByAndAggregate.cpp.

References AUTOMATIC_IR_METADATA, CHECK, CompilationOptions::device_type, QueryMemoryDescriptor::didOutputColumnar(), emitCall(), executor_, QueryMemoryDescriptor::getBucket(), QueryMemoryDescriptor::getMinVal(), QueryMemoryDescriptor::hasKeylessHash(), QueryMemoryDescriptor::interleavedBins(), LL_INT, QueryMemoryDescriptor::mustUseBaselineSort(), and QueryMemoryDescriptor::usesGetGroupValueFast().

Referenced by codegenGroupBy().

1392  {
1393  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1394  CHECK(query_mem_desc.usesGetGroupValueFast());
1395  std::string get_group_fn_name{query_mem_desc.didOutputColumnar()
1396  ? "get_columnar_group_bin_offset"
1397  : "get_group_value_fast"};
1398  if (!query_mem_desc.didOutputColumnar() && query_mem_desc.hasKeylessHash()) {
1399  get_group_fn_name += "_keyless";
1400  }
1401  if (query_mem_desc.interleavedBins(co.device_type)) {
1402  CHECK(!query_mem_desc.didOutputColumnar());
1403  CHECK(query_mem_desc.hasKeylessHash());
1404  get_group_fn_name += "_semiprivate";
1405  }
1406  std::vector<llvm::Value*> get_group_fn_args{&*groups_buffer,
1407  &*group_expr_lv_translated};
1408  if (group_expr_lv_original && get_group_fn_name == "get_group_value_fast" &&
1409  query_mem_desc.mustUseBaselineSort()) {
1410  get_group_fn_name += "_with_original_key";
1411  get_group_fn_args.push_back(group_expr_lv_original);
1412  }
1413  get_group_fn_args.push_back(LL_INT(query_mem_desc.getMinVal()));
1414  get_group_fn_args.push_back(LL_INT(query_mem_desc.getBucket()));
1415  if (!query_mem_desc.hasKeylessHash()) {
1416  if (!query_mem_desc.didOutputColumnar()) {
1417  get_group_fn_args.push_back(LL_INT(row_size_quad));
1418  }
1419  } else {
1420  if (!query_mem_desc.didOutputColumnar()) {
1421  get_group_fn_args.push_back(LL_INT(row_size_quad));
1422  }
1423  if (query_mem_desc.interleavedBins(co.device_type)) {
1424  auto warp_idx = emitCall("thread_warp_idx", {LL_INT(executor_->warpSize())});
1425  get_group_fn_args.push_back(warp_idx);
1426  get_group_fn_args.push_back(LL_INT(executor_->warpSize()));
1427  }
1428  }
1429  if (get_group_fn_name == "get_columnar_group_bin_offset") {
1430  return std::make_tuple(&*groups_buffer,
1431  emitCall(get_group_fn_name, get_group_fn_args));
1432  }
1433  return std::make_tuple(emitCall(get_group_fn_name, get_group_fn_args), nullptr);
1434 }
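The fast single-column path reduces to simple bin arithmetic: the translated key minus the column minimum, divided by the bucket size when one is set, gives the entry index. A sketch of that addressing (an illustrative model, not the get_group_value_fast family of runtime functions):

#include <cstdint>

// Illustrative model of single-column perfect-hash addressing.
int64_t* group_value_fast_model(int64_t* groups_buffer,
                                int64_t key,
                                int64_t min_val,
                                int64_t bucket,
                                int32_t row_size_quad) {
  int64_t bin = key - min_val;
  if (bucket) {
    bin /= bucket;
  }
  return groups_buffer + bin * row_size_quad;  // row-wise entry for this group
}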

llvm::Value * GroupByAndAggregate::codegenVarlenOutputBuffer ( const QueryMemoryDescriptor &  query_mem_desc)
private

Definition at line 1371 of file GroupByAndAggregate.cpp.

References AUTOMATIC_IR_METADATA, CHECK, executor_, QueryMemoryDescriptor::hasVarlenOutput(), LL_CONTEXT, and ROW_FUNC.

Referenced by codegen().

1372  {
1373  if (!query_mem_desc.hasVarlenOutput()) {
1374  return nullptr;
1375  }
1376 
1377  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1378  auto arg_it = ROW_FUNC->arg_begin();
1379  arg_it++; /* groups_buffer */
1380  auto varlen_output_buffer = arg_it++;
1381  CHECK(varlen_output_buffer->getType() == llvm::Type::getInt64PtrTy(LL_CONTEXT));
1382  return varlen_output_buffer;
1383 }

llvm::Value * GroupByAndAggregate::codegenWindowRowPointer ( const Analyzer::WindowFunction *  window_func,
const QueryMemoryDescriptor &  query_mem_desc,
const CompilationOptions &  co,
DiamondCodegen &  diamond_codegen 
)
private

Definition at line 1620 of file GroupByAndAggregate.cpp.

References run_benchmark_import::args, AUTOMATIC_IR_METADATA, codegenOutputSlot(), CodeGenerator::codegenWindowPosition(), QueryMemoryDescriptor::didOutputColumnar(), emitCall(), executor_, get_int_type(), WindowProjectNodeContext::getActiveWindowFunctionContext(), QueryMemoryDescriptor::getEntryCount(), Analyzer::WindowFunction::getKind(), QueryMemoryDescriptor::getRowSize(), LL_BUILDER, LL_CONTEXT, LL_INT, CodeGenerator::posArg(), ROW_FUNC, and window_function_is_aggregate().

Referenced by TargetExprCodegen::codegen().

1624  {
1625  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1626  const auto window_func_context =
1627  WindowProjectNodeContext::getActiveWindowFunctionContext(executor_);
1628  if (window_func_context && window_function_is_aggregate(window_func->getKind())) {
1629  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1630  ? 0
1631  : query_mem_desc.getRowSize() / sizeof(int64_t);
1632  auto arg_it = ROW_FUNC->arg_begin();
1633  auto groups_buffer = arg_it++;
1634  CodeGenerator code_generator(executor_);
1635  auto window_pos_lv = code_generator.codegenWindowPosition(
1636  window_func_context, code_generator.posArg(nullptr));
1637  const auto pos_in_window =
1638  LL_BUILDER.CreateTrunc(window_pos_lv, get_int_type(32, LL_CONTEXT));
1639  llvm::Value* entry_count_lv =
1640  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount()));
1641  std::vector<llvm::Value*> args{
1642  &*groups_buffer, entry_count_lv, pos_in_window, code_generator.posArg(nullptr)};
1643  if (query_mem_desc.didOutputColumnar()) {
1644  const auto columnar_output_offset =
1645  emitCall("get_columnar_scan_output_offset", args);
1646  return LL_BUILDER.CreateSExt(columnar_output_offset, get_int_type(64, LL_CONTEXT));
1647  }
1648  args.push_back(LL_INT(row_size_quad));
1649  return emitCall("get_scan_output_slot", args);
1650  }
1651  auto arg_it = ROW_FUNC->arg_begin();
1652  auto groups_buffer = arg_it++;
1653  return codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen);
1654 }

llvm::Value * GroupByAndAggregate::convertNullIfAny ( const SQLTypeInfo &  arg_type,
const TargetInfo &  agg_info,
llvm::Value *  target 
)
private

Definition at line 1569 of file GroupByAndAggregate.cpp.

References TargetInfo::agg_kind, AUTOMATIC_IR_METADATA, CHECK, executor_, SQLTypeInfo::get_size(), SQLTypeInfo::is_fp(), kAPPROX_COUNT_DISTINCT, kCOUNT, LL_BUILDER, and TargetInfo::sql_type.

Referenced by TargetExprCodegen::codegenAggregate().

1571  {
1572  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1573  const auto& agg_type = agg_info.sql_type;
1574  const size_t chosen_bytes = agg_type.get_size();
1575 
1576  bool need_conversion{false};
1577  llvm::Value* arg_null{nullptr};
1578  llvm::Value* agg_null{nullptr};
1579  llvm::Value* target_to_cast{target};
1580  if (arg_type.is_fp()) {
1581  arg_null = executor_->cgen_state_->inlineFpNull(arg_type);
1582  if (agg_type.is_fp()) {
1583  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1584  if (!static_cast<llvm::ConstantFP*>(arg_null)->isExactlyValue(
1585  static_cast<llvm::ConstantFP*>(agg_null)->getValueAPF())) {
1586  need_conversion = true;
1587  }
1588  } else {
1589  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
1590  return target;
1591  }
1592  } else {
1593  arg_null = executor_->cgen_state_->inlineIntNull(arg_type);
1594  if (agg_type.is_fp()) {
1595  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1596  need_conversion = true;
1597  target_to_cast = executor_->castToFP(target, arg_type, agg_type);
1598  } else {
1599  agg_null = executor_->cgen_state_->inlineIntNull(agg_type);
1600  if ((static_cast<llvm::ConstantInt*>(arg_null)->getBitWidth() !=
1601  static_cast<llvm::ConstantInt*>(agg_null)->getBitWidth()) ||
1602  (static_cast<llvm::ConstantInt*>(arg_null)->getValue() !=
1603  static_cast<llvm::ConstantInt*>(agg_null)->getValue())) {
1604  need_conversion = true;
1605  }
1606  }
1607  }
1608  if (need_conversion) {
1609  auto cmp = arg_type.is_fp() ? LL_BUILDER.CreateFCmpOEQ(target, arg_null)
1610  : LL_BUILDER.CreateICmpEQ(target, arg_null);
1611  return LL_BUILDER.CreateSelect(
1612  cmp,
1613  agg_null,
1614  executor_->cgen_state_->castToTypeIn(target_to_cast, chosen_bytes << 3));
1615  } else {
1616  return target;
1617  }
1618 }
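The conversion matters when the argument and the aggregate slot use different null sentinels, for example an INT argument accumulated into a BIGINT slot. A host-side sketch of the idea, using hypothetical sentinel values in place of inline_int_null_val()/inline_fp_null_val():

#include <cstdint>
#include <limits>

// Hypothetical host-side analogue of the null-sentinel conversion above:
// map a 32-bit null sentinel to the 64-bit sentinel before aggregation,
// otherwise just widen the value.
int64_t convert_null_if_any_model(int32_t value) {
  constexpr int32_t arg_null = std::numeric_limits<int32_t>::min();  // assumed 32-bit sentinel
  constexpr int64_t agg_null = std::numeric_limits<int64_t>::min();  // assumed 64-bit sentinel
  return value == arg_null ? agg_null : static_cast<int64_t>(value);
}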

llvm::Value * GroupByAndAggregate::emitCall ( const std::string &  fname,
const std::vector< llvm::Value * > &  args 
)
private

Definition at line 2203 of file GroupByAndAggregate.cpp.

References AUTOMATIC_IR_METADATA, and executor_.

Referenced by TargetExprCodegen::codegen(), TargetExprCodegen::codegenAggregate(), codegenCountDistinct(), codegenEstimator(), codegenMultiColumnBaselineHash(), codegenMultiColumnPerfectHash(), codegenOutputSlot(), codegenSingleColumnPerfectHash(), and codegenWindowRowPointer().

2204  {
2205  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2206  return executor_->cgen_state_->emitCall(fname, args);
2207 }

llvm::Value * GroupByAndAggregate::getAdditionalLiteral ( const int32_t  off)
private

Definition at line 2007 of file GroupByAndAggregate.cpp.

References shared::bit_cast(), CHECK_LT, get_arg_by_name(), get_int_type(), LL_BUILDER, LL_CONTEXT, LL_INT, and ROW_FUNC.

Referenced by codegenCountDistinct().

2007  {
2008  CHECK_LT(off, 0);
2009  const auto lit_buff_lv = get_arg_by_name(ROW_FUNC, "literals");
2010  auto* bit_cast = LL_BUILDER.CreateBitCast(
2011  lit_buff_lv, llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0));
2012  auto* gep =
2013  LL_BUILDER.CreateGEP(bit_cast->getType()->getScalarType()->getPointerElementType(),
2014  bit_cast,
2015  LL_INT(off));
2016  return LL_BUILDER.CreateLoad(gep->getType()->getPointerElementType(), gep);
2017 }
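Additional literals live in the 64-bit words immediately before the position the "literals" argument points at, which is why the offset is required to be negative. A minimal sketch of the addressing the emitted GEP performs (hypothetical host-side buffer, not the generated IR):

#include <cstdint>

// Hypothetical host-side analogue of the emitted load: lit_buff is assumed to
// point just past the additional literals, so a negative word offset reaches them.
int64_t read_additional_literal(const int8_t* lit_buff, const int32_t off /* < 0 */) {
  const int64_t* words = reinterpret_cast<const int64_t*>(lit_buff);
  return words[off];
}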

int64_t GroupByAndAggregate::getBucketedCardinality ( const ColRangeInfo &  col_range_info)
static private

Definition at line 353 of file GroupByAndAggregate.cpp.

References ColRangeInfo::bucket, ColRangeInfo::has_nulls, ColRangeInfo::max, and ColRangeInfo::min.

Referenced by codegenPerfectHashFunction(), and getColRangeInfo().

353  {
354  checked_int64_t crt_col_cardinality =
355  checked_int64_t(col_range_info.max) - checked_int64_t(col_range_info.min);
356  if (col_range_info.bucket) {
357  crt_col_cardinality /= col_range_info.bucket;
358  }
359  return static_cast<int64_t>(crt_col_cardinality +
360  (1 + (col_range_info.has_nulls ? 1 : 0)));
361 }
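In other words, the bucketed cardinality is (max - min) / bucket, plus one slot to make the range inclusive and one more if the column has nulls; the checked 64-bit arithmetic throws on overflow instead of wrapping. For example, min = 0, max = 99, bucket = 10 and nulls present gives (99 - 0) / 10 + 1 + 1 = 11 entries.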

ColRangeInfo GroupByAndAggregate::getColRangeInfo ( )
private

Definition at line 215 of file GroupByAndAggregate.cpp.

References anonymous_namespace{GroupByAndAggregate.cpp}::cardinality_estimate_less_than_column_range(), CHECK, CHECK_GE, device_type_, executor_, anonymous_namespace{GroupByAndAggregate.cpp}::expr_is_rowid(), anonymous_namespace{GroupByAndAggregate.cpp}::get_expr_range_info(), Executor::getBaselineThreshold(), getBucketedCardinality(), group_cardinality_estimation_, RelAlgExecutionUnit::groupby_exprs, GroupByBaselineHash, GroupByPerfectHash, anonymous_namespace{GroupByAndAggregate.cpp}::has_count_distinct(), anonymous_namespace{GroupByAndAggregate.cpp}::is_column_range_too_big_for_perfect_hash(), kENCODING_DICT, MAX_BUFFER_SIZE, SortInfo::order_entries, RelAlgExecutionUnit::quals, query_infos_, ra_exe_unit_, RelAlgExecutionUnit::simple_quals, RelAlgExecutionUnit::sort_info, and RelAlgExecutionUnit::target_exprs.

Referenced by initQueryMemoryDescriptorImpl().

215  {
216  // Use baseline layout more eagerly on the GPU if the query uses count distinct,
217  // because our HyperLogLog implementation is 4x less memory efficient on GPU.
218  // Technically, this only applies to APPROX_COUNT_DISTINCT, but in practice we
219  // can expect this to be true anyway for grouped queries since the precise version
220  // uses significantly more memory.
221  const int64_t baseline_threshold =
222  Executor::getBaselineThreshold(has_count_distinct(ra_exe_unit_), device_type_);
223  // `group_cardinality_estimation_` is set as the result of (NDV) cardinality estimator
224  auto group_cardinality_estimation = group_cardinality_estimation_.value_or(0);
225  if (ra_exe_unit_.groupby_exprs.size() != 1) {
226  try {
227  checked_int64_t cardinality{1};
228  bool has_nulls{false};
229  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
230  auto col_range_info = get_expr_range_info(
231  ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
232  if (col_range_info.hash_type_ != QueryDescriptionType::GroupByPerfectHash) {
233  // going through baseline hash if a non-integer type is encountered
234  return {QueryDescriptionType::GroupByBaselineHash,
235  0,
236  group_cardinality_estimation,
237  0,
238  false};
239  }
240  auto crt_col_cardinality = getBucketedCardinality(col_range_info);
241  CHECK_GE(crt_col_cardinality, 0);
242  cardinality *= crt_col_cardinality;
243  if (col_range_info.has_nulls) {
244  has_nulls = true;
245  }
246  }
247  // For zero or high cardinalities, use baseline layout.
248  if (!cardinality || cardinality > baseline_threshold) {
249  return {QueryDescriptionType::GroupByBaselineHash,
250  0,
251  group_cardinality_estimation,
252  0,
253  false};
254  }
255  // todo (yoonmin) : should we consider min(group_cardinality_estimation,
256  // cardinality) if we have `group_cardinality_estimation` value?
257  return {QueryDescriptionType::GroupByPerfectHash,
258  0,
259  int64_t(cardinality),
260  0,
261  has_nulls};
262  } catch (...) { // overflow when computing cardinality
263  return {QueryDescriptionType::GroupByBaselineHash,
264  0,
265  group_cardinality_estimation,
266  0,
267  false};
268  }
269  }
270  // For single column groupby on high timestamps, force baseline hash due to wide ranges
271  // we are likely to encounter when applying quals to the expression range
272  // TODO: consider allowing TIMESTAMP(9) (nanoseconds) with quals to use perfect hash if
273  // the range is small enough
274  if (ra_exe_unit_.groupby_exprs.front() &&
275  ra_exe_unit_.groupby_exprs.front()->get_type_info().is_high_precision_timestamp() &&
276  ra_exe_unit_.simple_quals.size() > 0) {
277  return {QueryDescriptionType::GroupByBaselineHash,
278  0,
279  group_cardinality_estimation,
280  0,
281  false};
282  }
283  const auto col_range_info = get_expr_range_info(
284  ra_exe_unit_, query_infos_, ra_exe_unit_.groupby_exprs.front().get(), executor_);
285  if (!ra_exe_unit_.groupby_exprs.front()) {
286  return col_range_info;
287  }
288  static const int64_t MAX_BUFFER_SIZE = 1 << 30;
289  const int64_t col_count =
290  ra_exe_unit_.groupby_exprs.size() + ra_exe_unit_.target_exprs.size();
291  int64_t max_entry_count = MAX_BUFFER_SIZE / (col_count * sizeof(int64_t));
292  if (has_count_distinct(ra_exe_unit_)) {
293  max_entry_count = std::min(max_entry_count, baseline_threshold);
294  }
295  const auto& groupby_expr_ti = ra_exe_unit_.groupby_exprs.front()->get_type_info();
296  if (groupby_expr_ti.is_string() && !col_range_info.bucket) {
297  CHECK(groupby_expr_ti.get_compression() == kENCODING_DICT);
298 
299  const bool has_filters =
300  !ra_exe_unit_.quals.empty() || !ra_exe_unit_.simple_quals.empty();
301  if (has_filters &&
302  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
303  // if filters are present, we can use the filter to narrow the cardinality of the
304  // group by in the case of ranges too big for perfect hash. Otherwise, we are better
305  // off attempting perfect hash (since we know the range will be made of
306  // monotonically increasing numbers from min to max for dictionary encoded strings)
307  // and failing later due to excessive memory use.
308  // Check the conditions where baseline hash can provide a performance increase and
309  // return baseline hash (potentially forcing an estimator query) as the range type.
310  // Otherwise, return col_range_info which will likely be perfect hash, though could
311  // be baseline from a previous call of this function prior to the estimator query.
312  if (!ra_exe_unit_.sort_info.order_entries.empty()) {
313  // TODO(adb): allow some sorts to pass through this block by centralizing sort
314  // algorithm decision making
315  if (has_count_distinct(ra_exe_unit_)) {
316  // always use baseline hash for column range too big for perfect hash with count
317  // distinct descriptors. We will need 8GB of CPU memory minimum for the perfect
318  // hash group by in this case.
319  return {QueryDescriptionType::GroupByBaselineHash,
320  col_range_info.min,
321  col_range_info.max,
322  0,
323  col_range_info.has_nulls};
324  } else {
325  // use original col range for sort
326  return col_range_info;
327  }
328  }
329  // if filters are present and the filtered range is less than the cardinality of
330  // the column, consider baseline hash
333  col_range_info)) {
334  return {QueryDescriptionType::GroupByBaselineHash,
335  col_range_info.min,
336  col_range_info.max,
337  0,
338  col_range_info.has_nulls};
339  }
340  }
341  } else if ((!expr_is_rowid(ra_exe_unit_.groupby_exprs.front().get())) &&
342  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count) &&
343  !col_range_info.bucket) {
344  return {QueryDescriptionType::GroupByBaselineHash,
345  col_range_info.min,
346  col_range_info.max,
347  0,
348  col_range_info.has_nulls};
349  }
350  return col_range_info;
351 }
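For the multi-column case above, the choice between perfect and baseline hash boils down to whether the product of the per-column bucketed cardinalities stays within the baseline threshold. A compact host-side sketch of that decision (baseline_threshold stands in for Executor::getBaselineThreshold(); the real code uses checked_int64_t and catches overflow rather than the early-out below):

#include <cstdint>
#include <vector>

// Illustrative sketch of the multi-column perfect-vs-baseline decision above.
bool use_perfect_hash(const std::vector<int64_t>& col_cardinalities,
                      const int64_t baseline_threshold) {
  int64_t cardinality = 1;
  for (const auto c : col_cardinalities) {
    if (c == 0 || cardinality > baseline_threshold / c) {
      return false;  // zero, overflowing, or too-high cardinality: baseline hash
    }
    cardinality *= c;
  }
  return cardinality <= baseline_threshold;  // small, dense key space: perfect hash
}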

int64_t GroupByAndAggregate::getShardedTopBucket ( const ColRangeInfo &  col_range_info,
const size_t  shard_count 
) const
private

Definition at line 422 of file GroupByAndAggregate.cpp.

References ColRangeInfo::bucket, CHECK, CHECK_GT, device_type_, executor_, g_leaf_count, and GPU.

Referenced by initQueryMemoryDescriptorImpl().

423  {
424  size_t device_count{0};
425  if (device_type_ == ExecutorDeviceType::GPU) {
426  device_count = executor_->cudaMgr()->getDeviceCount();
427  CHECK_GT(device_count, 0u);
428  }
429 
430  int64_t bucket{col_range_info.bucket};
431 
432  if (shard_count) {
433  CHECK(!col_range_info.bucket);
434  /*
435  when a node has fewer devices than shard count,
436  a) In a distributed setup, the minimum distance between two keys would be
437  device_count because shards are stored consecutively across the physical tables,
438  i.e if a shard column has values 0 to 9, and 3 shards on each leaf, then node 1
439  would have values: 0,1,2,6,7,8 and node 2 would have values: 3,4,5,9. If each leaf
440  node has only 1 device, in this case, all the keys from each node are loaded on
441  the device each.
442 
443  b) In a single node setup, the distance would be minimum of device_count or
444  difference of device_count - shard_count. For example: If a single node server
445  running on 3 devices a shard column has values 0 to 9 in a table with 4 shards,
446  device to fragment keys mapping would be: device 1 - 4,8,3,7 device 2 - 1,5,9
447  device 3 - 2, 6 The bucket value would be 4(shards) - 3(devices) = 1 i.e. minimum
448  of device_count or difference.
449 
450  When a node has device count equal to or more than shard count then the
451  minimum distance is always at least shard_count * no of leaf nodes.
452  */
453  if (device_count < shard_count) {
454  bucket = g_leaf_count ? std::max(device_count, static_cast<size_t>(1))
455  : std::min(device_count, shard_count - device_count);
456  } else {
457  bucket = shard_count * std::max(g_leaf_count, static_cast<size_t>(1));
458  }
459  }
460 
461  return bucket;
462 }
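Worked examples of the branch above: a single node (g_leaf_count == 0) with 3 devices and 4 shards takes the device_count < shard_count branch and yields bucket = min(3, 4 - 3) = 1, matching the comment; a 2-leaf cluster whose nodes each have 8 devices against 4 shards takes the other branch and yields bucket = 4 * max(2, 1) = 8.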

bool GroupByAndAggregate::gpuCanHandleOrderEntries ( const std::list< Analyzer::OrderEntry > &  order_entries)
private

Definition at line 979 of file GroupByAndAggregate.cpp.

References CHECK, CHECK_GE, CHECK_LE, executor_, Analyzer::AggExpr::get_arg(), anonymous_namespace{GroupByAndAggregate.cpp}::get_expr_range_info(), Analyzer::Expr::get_type_info(), GroupByPerfectHash, kAPPROX_COUNT_DISTINCT, kAVG, kMAX, kMIN, query_infos_, ra_exe_unit_, and RelAlgExecutionUnit::target_exprs.

Referenced by initQueryMemoryDescriptor().

980  {
981  if (order_entries.size() > 1) { // TODO(alex): lift this restriction
982  return false;
983  }
984  for (const auto& order_entry : order_entries) {
985  CHECK_GE(order_entry.tle_no, 1);
986  CHECK_LE(static_cast<size_t>(order_entry.tle_no), ra_exe_unit_.target_exprs.size());
987  const auto target_expr = ra_exe_unit_.target_exprs[order_entry.tle_no - 1];
988  if (!dynamic_cast<Analyzer::AggExpr*>(target_expr)) {
989  return false;
990  }
991  // TODO(alex): relax the restrictions
992  auto agg_expr = static_cast<Analyzer::AggExpr*>(target_expr);
993  if (agg_expr->get_is_distinct() || agg_expr->get_aggtype() == kAVG ||
994  agg_expr->get_aggtype() == kMIN || agg_expr->get_aggtype() == kMAX ||
995  agg_expr->get_aggtype() == kAPPROX_COUNT_DISTINCT) {
996  return false;
997  }
998  if (agg_expr->get_arg()) {
999  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
1000  if (arg_ti.is_fp()) {
1001  return false;
1002  }
1003  auto expr_range_info =
1004  get_expr_range_info(ra_exe_unit_, query_infos_, agg_expr->get_arg(), executor_);
1005  // TODO(adb): QMD not actually initialized here?
1006  if ((!(expr_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
1007  /* query_mem_desc.getGroupbyColCount() == 1 */ false) ||
1008  expr_range_info.has_nulls) &&
1009  order_entry.is_desc == order_entry.nulls_first) {
1010  return false;
1011  }
1012  }
1013  const auto& target_ti = target_expr->get_type_info();
1014  CHECK(!target_ti.is_buffer());
1015  if (!target_ti.is_integer()) {
1016  return false;
1017  }
1018  }
1019  return true;
1020 }

std::unique_ptr< QueryMemoryDescriptor > GroupByAndAggregate::initQueryMemoryDescriptor ( const bool  allow_multifrag,
const size_t  max_groups_buffer_entry_count,
const int8_t  crt_min_byte_width,
RenderInfo *  render_info,
const bool  output_columnar_hint 
)
private

Definition at line 850 of file GroupByAndAggregate.cpp.

References align_to_int64(), CHECK, device_type_, GPU, gpuCanHandleOrderEntries(), initQueryMemoryDescriptorImpl(), SortInfo::order_entries, query_mem_desc, ra_exe_unit_, shard_count_for_top_groups(), and RelAlgExecutionUnit::sort_info.

855  {
856  const auto shard_count = device_type_ == ExecutorDeviceType::GPU
857  ? shard_count_for_top_groups(ra_exe_unit_)
858  : 0;
859  bool sort_on_gpu_hint =
860  device_type_ == ExecutorDeviceType::GPU && allow_multifrag &&
861  !ra_exe_unit_.sort_info.order_entries.empty() &&
862  gpuCanHandleOrderEntries(ra_exe_unit_.sort_info.order_entries);
863  // must_use_baseline_sort is true iff we'd sort on GPU with the old algorithm
864  // but the total output buffer size would be too big or it's a sharded top query.
865  // For the sake of managing risk, use the new result set way very selectively for
866  // this case only (alongside the baseline layout we've enabled for a while now).
867  bool must_use_baseline_sort = shard_count;
868  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
869  while (true) {
870  query_mem_desc = initQueryMemoryDescriptorImpl(allow_multifrag,
871  max_groups_buffer_entry_count,
872  crt_min_byte_width,
873  sort_on_gpu_hint,
874  render_info,
875  must_use_baseline_sort,
876  output_columnar_hint);
877  CHECK(query_mem_desc);
878  if (query_mem_desc->sortOnGpu() &&
879  (query_mem_desc->getBufferSizeBytes(device_type_) +
880  align_to_int64(query_mem_desc->getEntryCount() * sizeof(int32_t))) >
881  2 * 1024 * 1024 * 1024LL) {
882  must_use_baseline_sort = true;
883  sort_on_gpu_hint = false;
884  } else {
885  break;
886  }
887  }
888  return query_mem_desc;
889 }
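The retry loop above only repeats when a GPU sort is requested but would not fit: the descriptor's buffer plus one 32-bit index per entry (padded to an 8-byte boundary) must stay under 2 GiB, otherwise the flags flip to must_use_baseline_sort and the descriptor is rebuilt. The size check, as a small standalone sketch:

#include <cstddef>
#include <cstdint>

// Illustrative version of the 2 GiB GPU-sort budget test in the loop above.
bool gpu_sort_fits(const size_t buffer_bytes, const size_t entry_count) {
  const size_t index_bytes = (entry_count * sizeof(int32_t) + 7) & ~size_t{7};  // align_to_int64
  return buffer_bytes + index_bytes <= 2ULL * 1024 * 1024 * 1024;
}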

std::unique_ptr< QueryMemoryDescriptor > GroupByAndAggregate::initQueryMemoryDescriptorImpl ( const bool  allow_multifrag,
const size_t  max_groups_buffer_entry_count,
const int8_t  crt_min_byte_width,
const bool  sort_on_gpu_hint,
RenderInfo *  render_info,
const bool  must_use_baseline_sort,
const bool  output_columnar_hint 
)
private

Definition at line 891 of file GroupByAndAggregate.cpp.

References CPU, device_type_, executor_, g_enable_watchdog, g_watchdog_baseline_max_groups, anonymous_namespace{GroupByAndAggregate.cpp}::get_keyless_info(), getColRangeInfo(), getShardedTopBucket(), GPU, RelAlgExecutionUnit::groupby_exprs, GroupByBaselineHash, GroupByPerfectHash, ColRangeInfo::hash_type_, QueryMemoryDescriptor::init(), anonymous_namespace{GroupByAndAggregate.cpp}::init_count_distinct_descriptors(), LOG, query_infos_, ra_exe_unit_, shard_count_for_top_groups(), and logger::WARNING.

Referenced by initQueryMemoryDescriptor().

898  {
899  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
900 
901  const bool threads_can_reuse_group_by_buffers =
902  device_type_ == ExecutorDeviceType::CPU && is_group_by &&
903  ra_exe_unit_.groupby_exprs.front();
904 
905  auto col_range_info_nosharding = getColRangeInfo();
906 
907  const auto shard_count = device_type_ == ExecutorDeviceType::GPU
908  ? shard_count_for_top_groups(ra_exe_unit_)
909  : 0;
910 
911  const auto col_range_info =
912  ColRangeInfo{col_range_info_nosharding.hash_type_,
913  col_range_info_nosharding.min,
914  col_range_info_nosharding.max,
915  getShardedTopBucket(col_range_info_nosharding, shard_count),
916  col_range_info_nosharding.has_nulls};
917 
918  // Non-grouped aggregates do not support accessing aggregated ranges
919  // Keyless hash is currently only supported with single-column perfect hash
920  const auto keyless_info =
921  !(is_group_by &&
922  col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash)
923  ? KeylessInfo{false, -1}
924  : get_keyless_info(ra_exe_unit_, query_infos_, is_group_by, executor_);
925 
926  if (g_enable_watchdog &&
927  ((col_range_info.hash_type_ == QueryDescriptionType::GroupByBaselineHash &&
928  max_groups_buffer_entry_count > g_watchdog_baseline_max_groups) ||
929  (col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
930  ra_exe_unit_.groupby_exprs.size() == 1 &&
931  (col_range_info.max - col_range_info.min) /
932  std::max(col_range_info.bucket, int64_t(1)) >
933  130000000))) {
934  throw WatchdogException("Query would use too much memory");
935  }
936 
937  const auto count_distinct_descriptors = init_count_distinct_descriptors(
938  ra_exe_unit_, query_infos_, col_range_info, device_type_, executor_);
939  try {
940  return QueryMemoryDescriptor::init(executor_,
941  ra_exe_unit_,
942  query_infos_,
943  col_range_info,
944  keyless_info,
945  allow_multifrag,
946  device_type_,
947  crt_min_byte_width,
948  sort_on_gpu_hint,
949  shard_count,
950  max_groups_buffer_entry_count,
951  render_info,
952  count_distinct_descriptors,
953  must_use_baseline_sort,
954  output_columnar_hint,
955  /*streaming_top_n_hint=*/true,
956  threads_can_reuse_group_by_buffers);
957  } catch (const StreamingTopNOOM& e) {
958  LOG(WARNING) << e.what() << " Disabling Streaming Top N.";
959  return QueryMemoryDescriptor::init(executor_,
960  ra_exe_unit_,
961  query_infos_,
962  col_range_info,
963  keyless_info,
964  allow_multifrag,
965  device_type_,
966  crt_min_byte_width,
967  sort_on_gpu_hint,
968  shard_count,
969  max_groups_buffer_entry_count,
970  render_info,
971  count_distinct_descriptors,
972  must_use_baseline_sort,
973  output_columnar_hint,
974  /*streaming_top_n_hint=*/false,
975  threads_can_reuse_group_by_buffers);
976  }
977 }

bool GroupByAndAggregate::needsUnnestDoublePatch ( llvm::Value const *  val_ptr,
const std::string &  agg_base_name,
const bool  threads_share_memory,
const CompilationOptions &  co 
) const
private

Definition at line 29 of file MaxwellCodegenPatch.cpp.

References CompilationOptions::device_type, and executor_.

Referenced by TargetExprCodegen::codegenAggregate().

32  {
33  return (executor_->isArchMaxwell(co.device_type) && threads_share_memory &&
34  llvm::isa<llvm::AllocaInst>(val_ptr) &&
35  val_ptr->getType() ==
36  llvm::Type::getDoublePtrTy(executor_->cgen_state_->context_) &&
37  "agg_id" == agg_base_name);
38 }

void GroupByAndAggregate::prependForceSync ( )
private

Definition at line 40 of file MaxwellCodegenPatch.cpp.

References executor_.

Referenced by codegen().

40  {
41  executor_->cgen_state_->ir_builder_.CreateCall(
42  executor_->cgen_state_->module_->getFunction("force_sync"));
43 }


size_t GroupByAndAggregate::shard_count_for_top_groups ( const RelAlgExecutionUnit &  ra_exe_unit)
static

Definition at line 2226 of file GroupByAndAggregate.cpp.

References Catalog_Namespace::get_metadata_for_table(), Analyzer::ColumnVar::getColumnKey(), RelAlgExecutionUnit::groupby_exprs, SortInfo::limit, TableDescriptor::nShards, SortInfo::order_entries, and RelAlgExecutionUnit::sort_info.

Referenced by Executor::collectAllDeviceResults(), RelAlgExecutor::executeRelAlgQuerySingleStep(), initQueryMemoryDescriptor(), and initQueryMemoryDescriptorImpl().

2227  {
2228  if (ra_exe_unit.sort_info.order_entries.size() != 1 || !ra_exe_unit.sort_info.limit) {
2229  return 0;
2230  }
2231  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
2232  const auto grouped_col_expr =
2233  dynamic_cast<const Analyzer::ColumnVar*>(group_expr.get());
2234  if (!grouped_col_expr) {
2235  continue;
2236  }
2237  const auto& column_key = grouped_col_expr->getColumnKey();
2238  if (column_key.table_id <= 0) {
2239  return 0;
2240  }
2241  const auto td = Catalog_Namespace::get_metadata_for_table(
2242  {column_key.db_id, column_key.table_id});
2243  if (td->shardedColumnId == column_key.column_id) {
2244  return td->nShards;
2245  }
2246  }
2247  return 0;
2248 }

Friends And Related Function Documentation

friend class CodeGenerator
friend

Definition at line 219 of file GroupByAndAggregate.h.

friend class ExecutionKernel
friend

Definition at line 220 of file GroupByAndAggregate.h.

friend class Executor
friend

Definition at line 217 of file GroupByAndAggregate.h.

friend class QueryMemoryDescriptor
friend

Definition at line 218 of file GroupByAndAggregate.h.

friend struct TargetExprCodegen
friend

Definition at line 221 of file GroupByAndAggregate.h.

friend struct TargetExprCodegenBuilder
friend

Definition at line 222 of file GroupByAndAggregate.h.

Member Data Documentation

const ExecutorDeviceType GroupByAndAggregate::device_type_
private
const std::optional<int64_t> GroupByAndAggregate::group_cardinality_estimation_
private

Definition at line 215 of file GroupByAndAggregate.h.

Referenced by getColRangeInfo().

bool GroupByAndAggregate::output_columnar_
private

Definition at line 212 of file GroupByAndAggregate.h.

const std::vector<InputTableInfo>& GroupByAndAggregate::query_infos_
private
std::shared_ptr<RowSetMemoryOwner> GroupByAndAggregate::row_set_mem_owner_
private

Definition at line 211 of file GroupByAndAggregate.h.


The documentation for this class was generated from the following files: