#include "../CudaMgr/CudaMgr.h"
#include "../Shared/checked_alloc.h"
#include "../Utils/ChunkIter.h"
#include <llvm/Transforms/Utils/BasicBlockUtils.h>
int32_t get_agg_count(const std::vector<Analyzer::Expr*>& target_exprs) {
  // ...
  for (auto target_expr : target_exprs) {
    // ...
    if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
      // ...
      if (ti.is_array() || (ti.is_string() && ti.get_compression() == kENCODING_NONE)) {
        // ...
      } else if (ti.is_geometry()) {
        agg_count += ti.get_physical_coord_cols() * 2;
      }
      // ...
    }
    if (agg_expr && agg_expr->get_aggtype() == kAVG) {
      // ...
    }
  }
  // ...
}
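// A standalone sketch of the slot-counting rule above, using a hypothetical
// SlotKind enum instead of the Analyzer types (illustration only): AVG needs
// two slots (sum and count), arrays and none-encoded strings need a
// pointer/length pair, and geometry needs two slots per physical coordinate
// column.
#include <cstdint>
#include <utility>
#include <vector>

enum class SlotKind { Simple, Avg, VarlenBuffer, Geo };

int32_t sketch_slot_count(const std::vector<std::pair<SlotKind, int>>& targets) {
  int32_t slots{0};
  for (const auto& [kind, coord_cols] : targets) {
    switch (kind) {
      case SlotKind::Avg:           // sum and count accumulators
      case SlotKind::VarlenBuffer:  // pointer and length
        slots += 2;
        break;
      case SlotKind::Geo:           // two per physical coordinate column
        slots += coord_cols * 2;
        break;
      default:
        ++slots;
    }
  }
  return slots;
}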
bool expr_is_rowid(const Analyzer::Expr* expr, const Catalog_Namespace::Catalog& cat) {
  // ...
  if (!cd || !cd->isVirtualCol) {
    // ...
  }
  // ...
}

bool has_count_distinct(const RelAlgExecutionUnit& ra_exe_unit) {
  for (const auto& target_expr : ra_exe_unit.target_exprs) {
    // ...
  }
  // ...
}

bool is_column_range_too_big_for_perfect_hash(const ColRangeInfo& col_range_info,
                                              const int64_t max_entry_count) {
  // ...
}

ColRangeInfo GroupByAndAggregate::getColRangeInfo() {
  // ...
  const int64_t baseline_threshold =
      // ...
  bool has_nulls{false};
  // ...
  cardinality *= crt_col_cardinality;
  if (col_range_info.has_nulls) {
    // ...
  }
  // ...
  if (!cardinality || cardinality > baseline_threshold) {
    // ...
  }
  return {/* ... */,
          int64_t(cardinality),
          /* ... */};
  // ...
  return col_range_info;
  static const int64_t MAX_BUFFER_SIZE = 1 << 30;
  const int64_t col_count =
      // ...
  int64_t max_entry_count = MAX_BUFFER_SIZE / (col_count * sizeof(int64_t));
  // ...
  max_entry_count = std::min(max_entry_count, baseline_threshold);
  // ...
  if (/* ... */ &&
      !col_range_info.bucket) {
    return {/* ... */,
            col_range_info.has_nulls};
  }
  return col_range_info;
}
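// Worked example of the cap above, assuming one int64_t slot per output
// column: with the 1 GiB MAX_BUFFER_SIZE and e.g. four columns, at most
// 2^30 / (4 * 8) = 33,554,432 entries fit before the entry count is clamped.
#include <algorithm>
#include <cstdint>

int64_t sketch_max_entry_count(const int64_t col_count,
                               const int64_t baseline_threshold) {
  static const int64_t MAX_BUFFER_SIZE = 1 << 30;  // 1 GiB
  int64_t max_entry_count = MAX_BUFFER_SIZE / (col_count * sizeof(int64_t));
  return std::min(max_entry_count, baseline_threshold);  // same clamp as above
}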
ColRangeInfo GroupByAndAggregate::getExprRangeInfo(const Analyzer::Expr* expr) const {
  // ...
  switch (expr_range.getType()) {
    // ...
      return {/* ... */,
              expr_range.getIntMin(),
              expr_range.getIntMax(),
              expr_range.getBucket(),
              expr_range.hasNulls()};
    // ...
  }
}

int64_t GroupByAndAggregate::getBucketedCardinality(const ColRangeInfo& col_range_info) {
  auto crt_col_cardinality = col_range_info.max - col_range_info.min;
  if (col_range_info.bucket) {
    crt_col_cardinality /= col_range_info.bucket;
  }
  return static_cast<int64_t>(crt_col_cardinality +
                              (1 + (col_range_info.has_nulls ? 1 : 0)));
}
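// Sketch of the overflow-checked cardinality product that decides between
// perfect hash and baseline hash for multi-column group by: a stand-in for
// the checked_int64_t arithmetic above, using the GCC/Clang overflow builtin.
#include <cstdint>
#include <vector>

bool sketch_fits_perfect_hash(const std::vector<int64_t>& col_cardinalities,
                              const int64_t baseline_threshold) {
  int64_t cardinality{1};
  for (const auto crt_col_cardinality : col_cardinalities) {
    if (__builtin_mul_overflow(cardinality, crt_col_cardinality, &cardinality)) {
      return false;  // product overflows: fall back to baseline hash
    }
  }
  // Zero or above-threshold cardinalities also force the baseline layout.
  return cardinality != 0 && cardinality <= baseline_threshold;
}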
#define LL_CONTEXT executor_->cgen_state_->context_
#define LL_BUILDER executor_->cgen_state_->ir_builder_
#define LL_BOOL(v) executor_->cgen_state_->llBool(v)
#define LL_INT(v) executor_->cgen_state_->llInt(v)
#define LL_FP(v) executor_->cgen_state_->llFp(v)
#define ROW_FUNC executor_->cgen_state_->row_func_
GroupByAndAggregate::GroupByAndAggregate(
    Executor* executor,
    const ExecutorDeviceType device_type,
    const RelAlgExecutionUnit& ra_exe_unit,
    const std::vector<InputTableInfo>& query_infos,
    std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner)
    : executor_(executor)
    , ra_exe_unit_(ra_exe_unit)
    , query_infos_(query_infos)
    , row_set_mem_owner_(row_set_mem_owner)
    , device_type_(device_type) {
  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
    // ...
    const auto& groupby_ti = groupby_expr->get_type_info();
    if (groupby_ti.is_string() && groupby_ti.get_compression() != kENCODING_DICT) {
      throw std::runtime_error(
          "Cannot group by string columns which are not dictionary encoded.");
    }
    if (groupby_ti.is_array()) {
      throw std::runtime_error("Group by array not supported");
    }
    if (groupby_ti.is_geometry()) {
      throw std::runtime_error("Group by geometry not supported");
    }
  }
}
int64_t GroupByAndAggregate::getShardedTopBucket(const ColRangeInfo& col_range_info,
                                                 const size_t shard_count) const {
  size_t device_count{0};
  // ...
  device_count = executor_->getCatalog()->getDataMgr().getCudaMgr()->getDeviceCount();
  // ...
  int64_t bucket{col_range_info.bucket};
  // ...
  if (device_count < shard_count) {
    bucket = g_leaf_count ? std::max(device_count, static_cast<size_t>(1))
                          : std::min(device_count, shard_count - device_count);
  }
  // ...
  bucket = shard_count * std::max(g_leaf_count, static_cast<size_t>(1));
std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptor(
    const bool allow_multifrag,
    const size_t max_groups_buffer_entry_count,
    const int8_t crt_min_byte_width,
    RenderInfo* render_info,
    const bool output_columnar_hint) {
  const auto shard_count =
      // ...
  bool sort_on_gpu_hint =
      // ...
  bool must_use_baseline_sort = shard_count;
  // ...
  auto query_mem_desc = initQueryMemoryDescriptorImpl(allow_multifrag,
                                                      max_groups_buffer_entry_count,
                                                      crt_min_byte_width,
                                                      sort_on_gpu_hint,
                                                      render_info,
                                                      must_use_baseline_sort,
                                                      output_columnar_hint);
  CHECK(query_mem_desc);
  if (query_mem_desc->sortOnGpu() &&
      (/* ... */ +
       align_to_int64(query_mem_desc->getEntryCount() * sizeof(int32_t))) >
          2 * 1024 * 1024 * 1024L) {
    must_use_baseline_sort = true;
    sort_on_gpu_hint = false;
  }
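// Scalar form of the guard above: GPU sort is declined when the group-by
// buffer plus the int32 index buffer (padded to an 8-byte boundary) would
// exceed 2 GiB. align_to_int64 is reproduced here from its usual definition;
// buffer_size_bytes stands in for the descriptor's buffer-size query.
#include <cstddef>
#include <cstdint>

inline size_t sketch_align_to_int64(const size_t addr) {
  return (addr + sizeof(int64_t) - 1) & ~(sizeof(int64_t) - 1);
}

bool sketch_gpu_sort_too_big(const size_t buffer_size_bytes,
                             const size_t entry_count) {
  return buffer_size_bytes +
             sketch_align_to_int64(entry_count * sizeof(int32_t)) >
         size_t(2) * 1024 * 1024 * 1024;
}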
std::unique_ptr<QueryMemoryDescriptor>
GroupByAndAggregate::initQueryMemoryDescriptorImpl(
    const bool allow_multifrag,
    const size_t max_groups_buffer_entry_count,
    const int8_t crt_min_byte_width,
    const bool sort_on_gpu_hint,
    RenderInfo* render_info,
    const bool must_use_baseline_sort,
    const bool output_columnar_hint) {
  // ...
  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
  // ...
  auto col_range_info_nosharding = getColRangeInfo();
  // ...
  const auto shard_count =
      /* ... */ ? shard_count_for_top_groups(ra_exe_unit_, *executor_->getCatalog())
                : /* ... */;
  // ...
  const auto col_range_info =
      ColRangeInfo{/* ... */,
                   col_range_info_nosharding.min,
                   col_range_info_nosharding.max,
                   getShardedTopBucket(col_range_info_nosharding, shard_count),
                   col_range_info_nosharding.has_nulls};
  // ...
  const auto keyless_info =
      (/* ... */
       ra_exe_unit_.groupby_exprs.size() == 1)
          ? /* ... */
          : getKeylessInfo(ra_exe_unit_.target_exprs, is_group_by);
  // ...
  if ((/* ... */
       max_groups_buffer_entry_count > 120000000) ||
      (/* ... */
       ra_exe_unit_.groupby_exprs.size() == 1 &&
       (col_range_info.max - col_range_info.min) /
               std::max(col_range_info.bucket, int64_t(1)) >
           /* ... */)) {
    // ...
  }
  // ...
  return QueryMemoryDescriptor::init(/* ... */,
                                     max_groups_buffer_entry_count,
                                     render_info,
                                     count_distinct_descriptors,
                                     must_use_baseline_sort,
                                     output_columnar_hint);
}
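// Sketch of the single-column demotion rule above: even a perfect-hash
// candidate is demoted to baseline hash when its bucketed range exceeds the
// entry-count budget. The 120000000 cap in the real check is a policy knob;
// entry_count_budget here is a stand-in parameter.
#include <algorithm>
#include <cstdint>

bool sketch_demote_to_baseline(const int64_t range_min,
                               const int64_t range_max,
                               const int64_t bucket,
                               const int64_t entry_count_budget) {
  const auto bucketed_range =
      (range_max - range_min) / std::max(bucket, int64_t(1));
  return bucketed_range > entry_count_budget;
}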
void add_transient_string_literals_for_expression(
    const Analyzer::Expr* expr,
    Executor* executor,
    std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner) {
  // ...
  for (size_t i = 0; i < array_expr->getElementCount(); i++) {
    add_transient_string_literals_for_expression(
        array_expr->getElement(i), executor, row_set_mem_owner);
  }
  // ...
  if (cast_expr && cast_expr->get_optype() == kCAST && expr_ti.is_string()) {
    // ...
    auto sdp = executor->getStringDictionaryProxy(
        expr_ti.get_comp_param(), row_set_mem_owner, true);
    // ...
    const auto str_lit_expr =
        // ...
    if (str_lit_expr && str_lit_expr->get_constval().stringval) {
      sdp->getOrAddTransient(*str_lit_expr->get_constval().stringval);
    }
  }
  // ...
  if (domain_set.empty()) {
    // ...
  }
  if (expr_ti.is_string()) {
    // ...
    auto sdp = executor->getStringDictionaryProxy(
        expr_ti.get_comp_param(), row_set_mem_owner, true);
    // ...
    for (const auto domain_expr : domain_set) {
      const auto cast_expr = dynamic_cast<const Analyzer::UOper*>(domain_expr);
      const auto str_lit_expr =
          cast_expr /* ... */
              ? dynamic_cast<const Analyzer::Constant*>(cast_expr->get_operand())
              : dynamic_cast<const Analyzer::Constant*>(domain_expr);
      if (str_lit_expr && str_lit_expr->get_constval().stringval) {
        sdp->getOrAddTransient(*str_lit_expr->get_constval().stringval);
      }
    }
  }
void addTransientStringLiterals(
    /* ... */,
    std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner) {
  // ...
  add_transient_string_literals_for_expression(
      group_expr.get(), executor, row_set_mem_owner);
  // ...
  for (const auto target_expr : ra_exe_unit.target_exprs) {
    const auto& target_type = target_expr->get_type_info();
    if (target_type.is_string() && target_type.get_compression() != kENCODING_DICT) {
      // ...
    }
    // ...
    if (agg_expr->get_aggtype() == kSAMPLE) {
      add_transient_string_literals_for_expression(
          agg_expr->get_arg(), executor, row_set_mem_owner);
    }
    // ...
    add_transient_string_literals_for_expression(
        target_expr, executor, row_set_mem_owner);
  }
  // ...
  row_set_mem_owner->addLiteralStringDictProxy(executor->lit_str_dict_proxy_);
CountDistinctDescriptors GroupByAndAggregate::initCountDistinctDescriptors() {
  // ...
  CHECK(agg_info.is_agg);
  // ...
  if (arg_ti.is_string() && arg_ti.get_compression() != kENCODING_DICT) {
    throw std::runtime_error(
        "Strings must be dictionary-encoded for COUNT(DISTINCT).");
  }
  // ...
  throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
  // ...
  throw std::runtime_error("APPROX_COUNT_DISTINCT on geometry columns not supported");
  // ...
  if (agg_info.is_distinct && arg_ti.is_geometry()) {
    throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
  }
  // ...
  auto arg_range_info =
      // ...
  int64_t bitmap_sz_bits{0};
  // ...
  const auto error_rate = agg_expr->get_error_rate();
  // ...
  CHECK(error_rate->get_type_info().get_type() == kINT);
  CHECK_GE(error_rate->get_constval().intval, 1);
  // ...
  if (/* ... */
      !(arg_ti.is_array() || arg_ti.is_geometry())) {
    // ...
    if (arg_range_info.isEmpty()) {
      count_distinct_descriptors.emplace_back(
          // ...
    }
    // ...
    if (agg_info.agg_kind == kCOUNT) {
      bitmap_sz_bits = arg_range_info.max - arg_range_info.min + 1;
      const int64_t MAX_BITMAP_BITS{8 * 1000 * 1000 * 1000L};
      if (bitmap_sz_bits <= 0 || bitmap_sz_bits > MAX_BITMAP_BITS) {
        // ...
      }
    }
  }
  // ...
  if (/* ... */
      !(arg_ti.is_array() || arg_ti.is_geometry())) {
    // ...
    const auto sub_bitmap_count =
        // ...
    count_distinct_descriptors.emplace_back(
        // ...
  }
  // ...
  return count_distinct_descriptors;
}
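// Sketch of the two sizing decisions above. Exact COUNT(DISTINCT) over a
// known integer range gets a bitmap of (max - min + 1) bits when that stays
// under MAX_BITMAP_BITS, otherwise a hash-set fallback. APPROX_COUNT_DISTINCT
// sizes HyperLogLog from the requested error rate via the classic
// 1.04 / sqrt(2^b) bound; the loop below is an illustrative stand-in for
// hll_size_for_rate, not its exact implementation.
#include <cmath>
#include <cstdint>

enum class SketchCountDistinctImpl { Bitmap, StdSet };

SketchCountDistinctImpl sketch_pick_impl(const int64_t min_val,
                                         const int64_t max_val) {
  const int64_t MAX_BITMAP_BITS{8 * 1000 * 1000 * 1000L};
  const int64_t bitmap_sz_bits = max_val - min_val + 1;
  return (bitmap_sz_bits <= 0 || bitmap_sz_bits > MAX_BITMAP_BITS)
             ? SketchCountDistinctImpl::StdSet
             : SketchCountDistinctImpl::Bitmap;
}

int sketch_hll_precision_for_rate(const int err_percent) {
  const double target = err_percent / 100.0;
  int b = 1;  // 2^b HyperLogLog registers
  while (1.04 / std::sqrt(double(1ULL << b)) > target && b < 16) {
    ++b;
  }
  return b;
}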
KeylessInfo GroupByAndAggregate::getKeylessInfo(
    const std::vector<Analyzer::Expr*>& target_expr_list,
    const bool is_group_by) const {
  bool keyless{true}, found{false}, shared_mem_support{false},
      shared_mem_valid_data_type{true};
  // ...
  int32_t num_agg_expr{0};
  // ...
  for (const auto target_expr : target_expr_list) {
    // ...
    shared_mem_valid_data_type =
        // ...
    if (agg_info.is_agg) {
      // ...
      const auto arg_expr = agg_arg(target_expr);
      // ...
      switch (agg_info.agg_kind) {
        // ... (kAVG)
          if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
            // ... expr_range_info.hasNulls()) { ... }
          }
        // ... (kCOUNT)
          if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
            // ... expr_range_info.hasNulls()) { ... }
          }
          // ...
          if (!agg_info.skip_null_val) {
            shared_mem_support = true;
          }
        // ... (kSUM)
          auto arg_ti = arg_expr->get_type_info();
          // ...
          arg_ti.set_notnull(true);
          // ...
          if (!arg_ti.get_notnull()) {
            // ... !expr_range_info.hasNulls()) { ... }
          }
          // ...
          switch (expr_range_info.getType()) {
            // ...
              if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
                // ...
              }
            // ...
              if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
                // ...
              }
          }
        // ... (kMAX)
          CHECK(agg_expr && agg_expr->get_arg());
          const auto& arg_ti = agg_expr->get_arg()->get_type_info();
          if (arg_ti.is_string() || arg_ti.is_array()) {
            // ...
          }
          auto expr_range_info =
              // ...
          const auto init_max =
              get_agg_initial_val(/* ... */,
                                  is_group_by || float_argument_input,
                                  float_argument_input ? sizeof(float) : 8);
          switch (expr_range_info.getType()) {
            // ...
              const auto double_max =
                  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
              if (expr_range_info.getFpMax() < double_max) {
                // ...
              }
            // ...
              if (expr_range_info.getIntMax() < init_max) {
                // ...
              }
          }
        // ... (kMIN)
          CHECK(agg_expr && agg_expr->get_arg());
          const auto& arg_ti = agg_expr->get_arg()->get_type_info();
          if (arg_ti.is_string() || arg_ti.is_array()) {
            // ...
          }
          auto expr_range_info =
              // ... expr_range_info.hasNulls()) { ... }
          const auto init_min =
              get_agg_initial_val(/* ... */,
                                  is_group_by || float_argument_input,
                                  float_argument_input ? sizeof(float) : 8);
          switch (expr_range_info.getType()) {
            // ...
              const auto double_min =
                  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
              if (expr_range_info.getFpMin() > double_min) {
                // ...
              }
            // ...
              if (expr_range_info.getIntMin() > init_min) {
                // ...
              }
          }
      }
    }
  }
  // ...
      ((num_agg_expr == 1) && (target_expr_list.size() <= 2))
          ? shared_mem_support && shared_mem_valid_data_type
          : /* ... */
  // ...
}
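// The keyless-hash tests above all reduce to one question: can the
// aggregate's initial value double as the "empty bin" marker? For SUM the
// checks `getFpMax() < 0 || getFpMin() > 0` ensure no real sum can collide
// with the init value 0; for MIN/MAX the init value is compared against the
// expression range. A generic sketch of that predicate:
#include <cstdint>

bool sketch_init_val_marks_empty(const int64_t init_val,
                                 const int64_t result_min,
                                 const int64_t result_max,
                                 const bool has_nulls) {
  // A bin still holding init_val is provably untouched iff no actual
  // aggregate result (and no null sentinel) can equal it.
  return !has_nulls && (init_val < result_min || init_val > result_max);
}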
bool GroupByAndAggregate::supportedTypeForGpuSharedMemUsage(
    const SQLTypeInfo& target_type_info) const {
  // ...
  switch (target_type_info.get_type()) {
    // ...
  }
}

// ...
if (dynamic_cast<Analyzer::UOper*>(expr) &&
    static_cast<Analyzer::UOper*>(expr)->get_optype() == kUNNEST) {
  // ...
}
bool GroupByAndAggregate::gpuCanHandleOrderEntries(
    const std::list<Analyzer::OrderEntry>& order_entries) {
  if (order_entries.size() > 1) {
    // ...
  }
  for (const auto order_entry : order_entries) {
    // ...
    if (!dynamic_cast<Analyzer::AggExpr*>(target_expr)) {
      // ...
    }
    // ...
    if (agg_expr->get_is_distinct() || agg_expr->get_aggtype() == kAVG ||
        agg_expr->get_aggtype() == kMIN || agg_expr->get_aggtype() == kMAX ||
        // ...
    if (agg_expr->get_arg()) {
      // ...
      if (arg_ti.is_fp()) {
        // ...
      }
      // ...
      if ((/* ... */
           expr_range_info.has_nulls) &&
          order_entry.is_desc == order_entry.nulls_first) {
        // ...
      }
    }
    const auto& target_ti = target_expr->get_type_info();
    CHECK(!target_ti.is_array());
    if (!target_ti.is_integer()) {
      // ...
    }
  }
GroupByAndAggregate::DiamondCodegen::DiamondCodegen(
    llvm::Value* cond,
    Executor* executor,
    const bool chain_to_next,
    const std::string& label_prefix,
    DiamondCodegen* parent,
    const bool share_false_edge_with_parent)
    : executor_(executor), chain_to_next_(chain_to_next), parent_(parent) {
  // ...
  if (share_false_edge_with_parent) {
    // ...
  }
}

void GroupByAndAggregate::DiamondCodegen::setChainToNext() {
  chain_to_next_ = true;
}

void GroupByAndAggregate::DiamondCodegen::setFalseTarget(llvm::BasicBlock* cond_false) {
  CHECK(!parent_ || orig_cond_false_ != parent_->cond_false_);
  cond_false_ = cond_false;
}

GroupByAndAggregate::DiamondCodegen::~DiamondCodegen() {
  if (parent_ && orig_cond_false_ != parent_->cond_false_) {
    // ...
  } else if (chain_to_next_) {
    // ...
  }
  if (!parent_ || (!chain_to_next_ && cond_false_ != parent_->cond_false_)) {
    // ...
  }
}
bool GroupByAndAggregate::codegen(llvm::Value* filter_result,
                                  llvm::BasicBlock* sc_false,
                                  const QueryMemoryDescriptor& query_mem_desc,
                                  const CompilationOptions& co) {
  CHECK(filter_result);
  // ...
  bool can_return_error = false;
  llvm::BasicBlock* filter_false{nullptr};
  // ...
  llvm::Value* old_total_matched_val{nullptr};
  // ...
  old_total_matched_val =
      LL_BUILDER.CreateAtomicRMW(llvm::AtomicRMWInst::Add,
                                 // ...
                                 llvm::AtomicOrdering::Monotonic);
  // ...
  old_total_matched_val = LL_BUILDER.CreateLoad(total_matched_ptr);
  // ...
  LL_BUILDER.CreateStore(old_total_matched_val, old_total_matched_ptr);
  // ...
  auto agg_out_ptr_w_idx = codegenGroupBy(query_mem_desc, co, filter_cfg);
  // ...
  filter_cfg.setChainToNext();
  // ...
  llvm::Value* nullcheck_cond{nullptr};
  // ...
  nullcheck_cond = LL_BUILDER.CreateICmpSGE(std::get<1>(agg_out_ptr_w_idx),
                                            // ...
  // ...
      std::get<0>(agg_out_ptr_w_idx),
      llvm::ConstantPointerNull::get(
          // ...
  DiamondCodegen nullcheck_cfg(nullcheck_cond,
                               executor_,
                               false,
                               "groupby_nullcheck",
                               &filter_cfg,
                               false);
  // ...
  can_return_error = true;
  // ...
      code_generator.posArg(nullptr),
  // ...
  std::stack<llvm::BasicBlock*> array_loops;
  // ...
  auto arg_it = ROW_FUNC->arg_begin();
  std::vector<llvm::Value*> agg_out_vec;
  // ...
  agg_out_vec.push_back(&*arg_it++);
  // ...
  } else if (sc_false) {
    const auto saved_insert_block = LL_BUILDER.GetInsertBlock();
    // ...
    LL_BUILDER.SetInsertPoint(saved_insert_block);
  }
  // ...
  return can_return_error;
}
llvm::Value* GroupByAndAggregate::codegenOutputSlot(
    llvm::Value* groups_buffer,
    const QueryMemoryDescriptor& query_mem_desc,
    const CompilationOptions& co,
    DiamondCodegen& diamond_codegen) {
  // ...
  const auto row_size_quad = query_mem_desc.didOutputColumnar()
                                 ? 0
                                 : query_mem_desc.getRowSize() / sizeof(int64_t);
  // ...
  CHECK_GE(only_order_entry.tle_no, int(1));
  const size_t target_idx = only_order_entry.tle_no - 1;
  // ...
  const auto chosen_bytes =
      // ...
  auto order_entry_lv = executor_->cgen_state_->castToTypeIn(
      code_generator.codegen(order_entry_expr, true, co).front(), chosen_bytes * 8);
  // ...
  std::string fname = "get_bin_from_k_heap";
  const auto& oe_ti = order_entry_expr->get_type_info();
  llvm::Value* null_key_lv = nullptr;
  if (oe_ti.is_integer() || oe_ti.is_decimal() || oe_ti.is_time()) {
    const size_t bit_width = order_entry_lv->getType()->getIntegerBitWidth();
    switch (bit_width) {
      // ...
    }
  } else {
    CHECK(oe_ti.is_fp());
    if (order_entry_lv->getType()->isDoubleTy()) {
      // ...
    }
    // ...
    fname += order_entry_lv->getType()->isDoubleTy() ? "_double" : "_float";
  }
  // ...
  const auto key_slot_idx =
      // ...
  // ... the call's arguments include:
      LL_BOOL(only_order_entry.is_desc),
      LL_BOOL(!order_entry_expr->get_type_info().get_notnull()),
      LL_BOOL(only_order_entry.nulls_first),
  // ...
  llvm::Value* output_buffer_entry_count_lv{nullptr};
  // ...
  output_buffer_entry_count_lv =
      // ...
  CHECK(output_buffer_entry_count_lv);
  // ...
  const auto group_expr_lv =
      // ...
  std::vector<llvm::Value*> args{
      // ...
      output_buffer_entry_count_lv
          ? output_buffer_entry_count_lv
          : // ...
      // ...
      code_generator.posArg(nullptr)};
  // ...
  const auto columnar_output_offset =
      // ...
  return columnar_output_offset;
}
std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenGroupBy(
    const QueryMemoryDescriptor& query_mem_desc,
    const CompilationOptions& co,
    DiamondCodegen& diamond_codegen) {
  auto arg_it = ROW_FUNC->arg_begin();
  // ...
  std::stack<llvm::BasicBlock*> array_loops;
  // ...
  return std::make_tuple(
      // ...
  // ...
  return std::make_tuple(
      // ...
  // ...
  const auto row_size_quad = query_mem_desc.didOutputColumnar()
                                 ? 0
                                 : query_mem_desc.getRowSize() / sizeof(int64_t);
  // ...
  llvm::Value* group_key = nullptr;
  llvm::Value* key_size_lv = nullptr;
  // ...
      col_width_size == sizeof(int32_t)
  // ...
  int32_t subkey_idx = 0;
  // ...
  const auto translated_null_value = static_cast<int64_t>(
      // ...
      (col_range_info.bucket ? col_range_info.bucket : 1));
  // ...
  const bool col_has_nulls =
      // ...
      : col_range_info.has_nulls)
  // ...
  const auto group_expr_lvs =
      executor_->groupByColumnCodegen(group_expr.get(),
                                      // ...
                                      translated_null_value,
                                      // ...
  const auto group_expr_lv = group_expr_lvs.translated_value;
  // ...
      group_expr_lvs.original_value,
  // ...
  return codegenMultiColumnPerfectHash(
      &*groups_buffer, group_key, key_size_lv, query_mem_desc, row_size_quad);
  // ...
  return std::make_tuple(nullptr, nullptr);
}
std::tuple<llvm::Value*, llvm::Value*>
GroupByAndAggregate::codegenSingleColumnPerfectHash(
    const QueryMemoryDescriptor& query_mem_desc,
    const CompilationOptions& co,
    llvm::Value* groups_buffer,
    llvm::Value* group_expr_lv_translated,
    llvm::Value* group_expr_lv_original,
    const int32_t row_size_quad) {
  // ...
  std::string get_group_fn_name{/* ... */
                                    ? "get_columnar_group_bin_offset"
                                    : "get_group_value_fast"};
  // ...
  get_group_fn_name += "_keyless";
  // ...
  get_group_fn_name += "_semiprivate";
  // ...
  std::vector<llvm::Value*> get_group_fn_args{&*groups_buffer,
                                              &*group_expr_lv_translated};
  if (group_expr_lv_original && get_group_fn_name == "get_group_value_fast" &&
      // ...
    get_group_fn_name += "_with_original_key";
    get_group_fn_args.push_back(group_expr_lv_original);
  }
  // ...
  get_group_fn_args.push_back(LL_INT(row_size_quad));
  // ...
  get_group_fn_args.push_back(LL_INT(row_size_quad));
  // ...
  get_group_fn_args.push_back(warp_idx);
  // ...
  if (get_group_fn_name == "get_columnar_group_bin_offset") {
    return std::make_tuple(&*groups_buffer,
                           emitCall(get_group_fn_name, get_group_fn_args));
  }
  return std::make_tuple(emitCall(get_group_fn_name, get_group_fn_args), nullptr);
}
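// Host-side sketch of what the emitted get_group_value_fast computes: the bin
// index is (key - min) / bucket and the row's slots start at
// groups_buffer + bin * row_size_quad (in int64 quadwords). Simplified: the
// real runtime function also materializes the key and has keyless,
// original-key and columnar variants.
#include <cstdint>

int64_t* sketch_group_value_fast(int64_t* groups_buffer,
                                 const int64_t key,
                                 const int64_t min_key,
                                 const int64_t bucket,
                                 const uint32_t row_size_quad) {
  int64_t key_diff = key - min_key;
  if (bucket) {
    key_diff /= bucket;
  }
  return groups_buffer + key_diff * row_size_quad;
}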
std::tuple<llvm::Value*, llvm::Value*>
GroupByAndAggregate::codegenMultiColumnPerfectHash(
    llvm::Value* groups_buffer,
    llvm::Value* group_key,
    llvm::Value* key_size_lv,
    const QueryMemoryDescriptor& query_mem_desc,
    const int32_t row_size_quad) {
  // ...
  auto hash_lv =
      LL_BUILDER.CreateCall(perfect_hash_func, std::vector<llvm::Value*>{group_key});
  // ...
  const std::string set_matching_func_name{
      "set_matching_group_value_perfect_hash_columnar"};
  const std::vector<llvm::Value*> set_matching_func_arg{
      // ...
  emitCall(set_matching_func_name, set_matching_func_arg);
  return std::make_tuple(groups_buffer, hash_lv);
  // ...
  return std::make_tuple(
      emitCall("get_matching_group_value_perfect_hash",
               // ...
      nullptr);
}
std::tuple<llvm::Value*, llvm::Value*>
GroupByAndAggregate::codegenMultiColumnBaselineHash(
    const CompilationOptions& co,
    llvm::Value* groups_buffer,
    llvm::Value* group_key,
    llvm::Value* key_size_lv,
    const QueryMemoryDescriptor& query_mem_desc,
    const size_t key_width,
    const int32_t row_size_quad) {
  auto arg_it = ROW_FUNC->arg_begin();
  // ...
  CHECK(arg_it->getName() == "agg_init_val");
  if (group_key->getType() != llvm::Type::getInt64PtrTy(LL_CONTEXT)) {
    CHECK(key_width == sizeof(int32_t));
    // ...
  }
  std::vector<llvm::Value*> func_args{
      // ...
      LL_INT(static_cast<int32_t>(key_width))};
  std::string func_name{"get_group_value"};
  // ...
  func_name += "_columnar_slot";
  // ...
  func_args.push_back(LL_INT(row_size_quad));
  func_args.push_back(&*arg_it);
  // ...
  func_name += "_with_watchdog";
  // ...
  return std::make_tuple(groups_buffer, emitCall(func_name, func_args));
  // ...
  return std::make_tuple(emitCall(func_name, func_args), nullptr);
}
llvm::Function* GroupByAndAggregate::codegenPerfectHashFunction() {
  // ...
  auto ft = llvm::FunctionType::get(
      // ...
  auto key_hash_func = llvm::Function::Create(ft,
                                              llvm::Function::ExternalLinkage,
                                              // ...
  executor_->cgen_state_->helper_functions_.push_back(key_hash_func);
  // ...
  auto& key_buff_arg = *key_hash_func->args().begin();
  llvm::Value* key_buff_lv = &key_buff_arg;
  auto bb = llvm::BasicBlock::Create(LL_CONTEXT, "entry", key_hash_func);
  llvm::IRBuilder<> key_hash_func_builder(bb);
  // ...
  std::vector<int64_t> cardinalities;
  // ...
  auto key_comp_lv = key_hash_func_builder.CreateLoad(
      key_hash_func_builder.CreateGEP(key_buff_lv, LL_INT(dim_idx)));
  // ...
  auto crt_term_lv =
      key_hash_func_builder.CreateSub(key_comp_lv, LL_INT(col_range_info.min));
  if (col_range_info.bucket) {
    crt_term_lv =
        key_hash_func_builder.CreateSDiv(crt_term_lv, LL_INT(col_range_info.bucket));
  }
  for (size_t prev_dim_idx = 0; prev_dim_idx < dim_idx; ++prev_dim_idx) {
    crt_term_lv = key_hash_func_builder.CreateMul(crt_term_lv,
                                                  LL_INT(cardinalities[prev_dim_idx]));
  }
  hash_lv = key_hash_func_builder.CreateAdd(hash_lv, crt_term_lv);
  // ...
  key_hash_func_builder.CreateRet(
      // ...
  return key_hash_func;
}
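// Host-side mirror of the IR emitted above: row-major indexing over the key
// space. Each component is shifted by its range minimum, divided by its
// bucket, then scaled by the product of the cardinalities of the preceding
// dimensions; the SketchRange fields mirror the ColRangeInfo fields used in
// the builder calls.
#include <cstdint>
#include <vector>

struct SketchRange {
  int64_t min;
  int64_t bucket;       // 0 means no bucketing
  int64_t cardinality;  // bucketed cardinality of the dimension
};

int64_t sketch_perfect_hash(const std::vector<int64_t>& key,
                            const std::vector<SketchRange>& dims) {
  int64_t hash{0};
  int64_t stride{1};
  for (size_t dim_idx = 0; dim_idx < key.size(); ++dim_idx) {
    int64_t crt_term = key[dim_idx] - dims[dim_idx].min;
    if (dims[dim_idx].bucket) {
      crt_term /= dims[dim_idx].bucket;
    }
    hash += crt_term * stride;            // same as multiplying by all previous
    stride *= dims[dim_idx].cardinality;  // cardinalities, as the loop above does
  }
  return hash;
}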
llvm::Value* GroupByAndAggregate::convertNullIfAny(const SQLTypeInfo& arg_type,
                                                   const TargetInfo& agg_info,
                                                   llvm::Value* target) {
  const auto& agg_type = agg_info.sql_type;
  const size_t chosen_bytes = agg_type.get_size();
  // ...
  bool need_conversion{false};
  llvm::Value* arg_null{nullptr};
  llvm::Value* agg_null{nullptr};
  llvm::Value* target_to_cast{target};
  if (arg_type.is_fp()) {
    arg_null = executor_->cgen_state_->inlineFpNull(arg_type);
    if (agg_type.is_fp()) {
      agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
      if (!static_cast<llvm::ConstantFP*>(arg_null)->isExactlyValue(
              static_cast<llvm::ConstantFP*>(agg_null)->getValueAPF())) {
        need_conversion = true;
      }
    }
    // ...
  } else {
    arg_null = executor_->cgen_state_->inlineIntNull(arg_type);
    if (agg_type.is_fp()) {
      agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
      need_conversion = true;
      target_to_cast = executor_->castToFP(target);
    } else {
      agg_null = executor_->cgen_state_->inlineIntNull(agg_type);
      if ((static_cast<llvm::ConstantInt*>(arg_null)->getBitWidth() !=
           static_cast<llvm::ConstantInt*>(agg_null)->getBitWidth()) ||
          (static_cast<llvm::ConstantInt*>(arg_null)->getValue() !=
           static_cast<llvm::ConstantInt*>(agg_null)->getValue())) {
        need_conversion = true;
      }
    }
  }
  if (need_conversion) {
    auto cmp = arg_type.is_fp() ? LL_BUILDER.CreateFCmpOEQ(target, arg_null)
                                : LL_BUILDER.CreateICmpEQ(target, arg_null);
    return LL_BUILDER.CreateSelect(
        cmp,
        agg_null,
        executor_->cgen_state_->castToTypeIn(target_to_cast, chosen_bytes << 3));
  }
  // ...
}
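// Scalar form of what convertNullIfAny guards against: the argument's null
// sentinel and the aggregate slot's null sentinel may differ (different bit
// width, or int vs. floating point), so a null argument must be rewritten to
// the slot's sentinel before aggregation. Sketch using the usual
// numeric-minimum sentinels as stand-ins for the inline nulls.
#include <cstdint>
#include <limits>

int64_t sketch_convert_null(const int32_t arg_val) {
  const int32_t arg_null = std::numeric_limits<int32_t>::min();  // arg sentinel
  const int64_t agg_null = std::numeric_limits<int64_t>::min();  // slot sentinel
  // Mirrors the compare-against-arg_null + select emitted above.
  return arg_val == arg_null ? agg_null : static_cast<int64_t>(arg_val);
}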
llvm::Value* GroupByAndAggregate::codegenWindowRowPointer(
    const Analyzer::WindowFunction* window_func,
    const QueryMemoryDescriptor& query_mem_desc,
    const CompilationOptions& co,
    DiamondCodegen& diamond_codegen) {
  const auto window_func_context =
      WindowProjectNodeContext::getActiveWindowFunctionContext();
  // ...
  const auto row_size_quad = query_mem_desc.didOutputColumnar()
                                 ? 0
                                 : query_mem_desc.getRowSize() / sizeof(int64_t);
  auto arg_it = ROW_FUNC->arg_begin();
  auto groups_buffer = arg_it++;
  // ...
  if (!window_func_context->getRowNumber()) {
    // ...
    window_func_context->setRowNumber(emitCall(
        "row_number_window_func",
        {LL_INT(reinterpret_cast<const int64_t>(window_func_context->output())),
         code_generator.posArg(nullptr)}));
  }
  const auto pos_in_window =
      LL_BUILDER.CreateTrunc(window_func_context->getRowNumber(),
                             // ...
  llvm::Value* entry_count_lv =
      // ...
  std::vector<llvm::Value*> args{
      // ...
  const auto columnar_output_offset =
      // ...
}

// ...
auto arg_it = ROW_FUNC->arg_begin();
auto groups_buffer = arg_it++;
bool GroupByAndAggregate::codegenAggCalls(
    const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx_in,
    const std::vector<llvm::Value*>& agg_out_vec,
    const QueryMemoryDescriptor& query_mem_desc,
    const CompilationOptions& co,
    DiamondCodegen& diamond_codegen) {
  auto agg_out_ptr_w_idx = agg_out_ptr_w_idx_in;
  // ...
  const bool is_group_by{std::get<0>(agg_out_ptr_w_idx)};
  bool can_return_error = false;
  if (is_group_by) {
    CHECK(agg_out_vec.empty());
  } else {
    CHECK(!agg_out_vec.empty());
  }
  // ...
  llvm::Value* output_buffer_byte_stream{nullptr};
  llvm::Value* out_row_idx{nullptr};
  // ...
  output_buffer_byte_stream = LL_BUILDER.CreateBitCast(
      std::get<0>(agg_out_ptr_w_idx),
      llvm::PointerType::get(llvm::Type::getInt8Ty(LL_CONTEXT), 0));
  output_buffer_byte_stream->setName("out_buff_b_stream");
  CHECK(std::get<1>(agg_out_ptr_w_idx));
  out_row_idx = LL_BUILDER.CreateZExt(std::get<1>(agg_out_ptr_w_idx),
                                      // ...
  out_row_idx->setName("out_row_idx");
  // ...
  target_builder(target_expr, executor_, co);
  // ...
      output_buffer_byte_stream,
  // ...
  executor_->plan_state_->isLazyFetchColumn(target_expr);
  // ...
  return can_return_error;
}
llvm::Value* GroupByAndAggregate::codegenAggColumnPtr(
    llvm::Value* output_buffer_byte_stream,
    llvm::Value* out_row_idx,
    const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx,
    const QueryMemoryDescriptor& query_mem_desc,
    const size_t chosen_bytes,
    const size_t agg_out_off,
    const size_t target_idx) {
  llvm::Value* agg_col_ptr{nullptr};
  if (query_mem_desc.didOutputColumnar()) {
    // ...
    CHECK(chosen_bytes == 1 || chosen_bytes == 2 || chosen_bytes == 4 ||
          chosen_bytes == 8);
    CHECK(output_buffer_byte_stream);
    // ...
    auto out_per_col_byte_idx =
        LL_BUILDER.CreateShl(out_row_idx, __builtin_ffs(chosen_bytes) - 1);
    auto byte_offset = LL_BUILDER.CreateAdd(out_per_col_byte_idx,
                                            LL_INT(static_cast<int64_t>(col_off)));
    byte_offset->setName("out_byte_off_target_" + std::to_string(target_idx));
    auto output_ptr = LL_BUILDER.CreateGEP(output_buffer_byte_stream, byte_offset);
    // ...
    agg_col_ptr->setName("out_ptr_target_" + std::to_string(target_idx));
  } else {
    // ...
    CHECK_EQ(size_t(0), col_off % chosen_bytes);
    col_off /= chosen_bytes;
    CHECK(std::get<1>(agg_out_ptr_w_idx));
    auto offset =
        LL_BUILDER.CreateAdd(std::get<1>(agg_out_ptr_w_idx), LL_INT(col_off));
    // ...
        std::get<0>(agg_out_ptr_w_idx),
    // ...
  }
  // ...
  CHECK_EQ(size_t(0), col_off % chosen_bytes);
  col_off /= chosen_bytes;
  // ...
      std::get<0>(agg_out_ptr_w_idx),
  // ...
  return agg_col_ptr;
}
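// Scalar form of the columnar pointer arithmetic above: the shift by
// __builtin_ffs(chosen_bytes) - 1 is a multiply by the slot width, so the
// slot for a target lives at base + col_off + row_idx * chosen_bytes.
#include <cstddef>
#include <cstdint>

int8_t* sketch_columnar_agg_ptr(int8_t* output_buffer_byte_stream,
                                const size_t out_row_idx,
                                const size_t col_off,         // column start, bytes
                                const size_t chosen_bytes) {  // 1, 2, 4 or 8
  const size_t out_per_col_byte_idx =
      out_row_idx << (__builtin_ffs(static_cast<int>(chosen_bytes)) - 1);
  return output_buffer_byte_stream + out_per_col_byte_idx + col_off;
}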
void GroupByAndAggregate::codegenEstimator(std::stack<llvm::BasicBlock*>& array_loops,
                                           GroupByAndAggregate::DiamondCodegen& diamond_codegen,
                                           const QueryMemoryDescriptor& query_mem_desc,
                                           const CompilationOptions& co) {
  // ...
  auto estimator_comp_count_lv = LL_INT(static_cast<int32_t>(estimator_arg.size()));
  // ...
      estimator_comp_count_lv);
  int32_t subkey_idx = 0;
  for (const auto estimator_arg_comp : estimator_arg) {
    const auto estimator_arg_comp_lvs =
        executor_->groupByColumnCodegen(estimator_arg_comp.get(),
                                        // ...
    CHECK(!estimator_arg_comp_lvs.original_value);
    const auto estimator_arg_comp_lv = estimator_arg_comp_lvs.translated_value;
    // ...
    LL_BUILDER.CreateStore(estimator_arg_comp_lv,
                           // ...
  }
  // ...
  const auto key_bytes = LL_BUILDER.CreateBitCast(estimator_key_lv, int8_ptr_ty);
  const auto estimator_comp_bytes_lv =
      LL_INT(static_cast<int32_t>(estimator_arg.size() * sizeof(int64_t)));
  const auto bitmap_size_lv =
      // ...
  emitCall(/* ... */,
           {bitmap, &*bitmap_size_lv, key_bytes, &*estimator_comp_bytes_lv});
}

extern "C" void agg_count_distinct(int64_t* agg, const int64_t val) {
  reinterpret_cast<std::set<int64_t>*>(*agg)->insert(val);
}

extern "C" void agg_count_distinct_skip_val(int64_t* agg,
                                            const int64_t val,
                                            const int64_t skip_val) {
  if (val != skip_val) {
    agg_count_distinct(agg, val);
  }
}
void GroupByAndAggregate::codegenCountDistinct(
    const size_t target_idx,
    const Analyzer::Expr* target_expr,
    std::vector<llvm::Value*>& agg_args,
    const QueryMemoryDescriptor& query_mem_desc,
    const ExecutorDeviceType device_type) {
  // ...
  const auto& arg_ti =
      // ...
  if (arg_ti.is_fp()) {
    agg_args.back() = executor_->cgen_state_->ir_builder_.CreateBitCast(
        // ...
  }
  const auto& count_distinct_descriptor =
      // ...
  agg_args.push_back(LL_INT(int32_t(count_distinct_descriptor.bitmap_sz_bits)));
  // ...
  agg_args.push_back(base_dev_addr);
  agg_args.push_back(base_host_addr);
  emitCall("agg_approximate_count_distinct_gpu", agg_args);
  // ...
  emitCall("agg_approximate_count_distinct", agg_args);
  // ...
  std::string agg_fname{"agg_count_distinct"};
  // ...
  agg_fname += "_bitmap";
  agg_args.push_back(LL_INT(static_cast<int64_t>(count_distinct_descriptor.min_val)));
  // ...
  if (agg_info.skip_null_val) {
    auto null_lv = executor_->cgen_state_->castToTypeIn(
        (arg_ti.is_fp()
             ? static_cast<llvm::Value*>(executor_->cgen_state_->inlineFpNull(arg_ti))
             : static_cast<llvm::Value*>(executor_->cgen_state_->inlineIntNull(arg_ti))),
        64);
    // ...
    null_lv = executor_->cgen_state_->ir_builder_.CreateBitCast(
        // ...
    agg_fname += "_skip_val";
    agg_args.push_back(null_lv);
  }
  // ...
  agg_fname += "_gpu";
  // ...
  agg_args.push_back(base_dev_addr);
  agg_args.push_back(base_host_addr);
  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.sub_bitmap_count)));
  CHECK_EQ(size_t(0),
           count_distinct_descriptor.bitmapPaddedSizeBytes() %
               count_distinct_descriptor.sub_bitmap_count);
  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.bitmapPaddedSizeBytes() /
                                    count_distinct_descriptor.sub_bitmap_count)));
  // ...
  executor_->cgen_state_->emitExternalCall(
      agg_fname, llvm::Type::getVoidTy(LL_CONTEXT), agg_args);
}
std::vector<llvm::Value*> GroupByAndAggregate::codegenAggArg(
    const Analyzer::Expr* target_expr,
    const CompilationOptions& co) {
  // ...
  if (target_ti.is_array() && !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
    const auto target_lvs =
        agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
                 : code_generator.codegen(
                       target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
    if (target_ti.isChunkIteratorPackaging()) {
      // ...
      CHECK_EQ(size_t(1), target_lvs.size());
      CHECK(!agg_expr || agg_expr->get_aggtype() == kSAMPLE);
      // ...
      const auto& elem_ti = target_ti.get_elem_type();
      // ...
          executor_->cgen_state_->emitExternalCall(
              // ...
              {target_lvs.front(), code_generator.posArg(target_expr)}),
          executor_->cgen_state_->emitExternalCall(
              // ...
              {target_lvs.front(),
               code_generator.posArg(target_expr),
               // ...
    } else if (target_ti.isStandardBufferPackaging()) {
      // ...
      throw std::runtime_error(
          "Using array[] operator as argument to an aggregate operator is not "
          "supported");
      // ...
      return {target_lvs[0], target_lvs[1]};
    }
  }
  if (target_ti.is_geometry() &&
      !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
    auto generate_coord_lvs =
        [&](auto* selected_target_expr,
            bool const fetch_columns) -> std::vector<llvm::Value*> {
      const auto target_lvs =
          code_generator.codegen(selected_target_expr, fetch_columns, co);
      // ...
      CHECK_EQ(2 * static_cast<size_t>(target_ti.get_physical_coord_cols()),
               // ...
      CHECK_EQ(static_cast<size_t>(target_ti.get_physical_coord_cols()),
               // ...
      std::vector<llvm::Value*> coords;
      // ...
      for (const auto& target_lv : target_lvs) {
        // ...
        const size_t elem_sz = ctr == 0 ? 1 : 4;
        // ...
        int32_t fixlen = -1;
        if (target_ti.get_type() == kPOINT) {
          // ...
          const auto coords_cd = executor_->getPhysicalColumnDescriptor(col_var, 1);
          if (coords_cd && coords_cd->columnType.get_type() == kARRAY) {
            fixlen = coords_cd->columnType.get_size();
          }
        }
        // ...
        coords.push_back(executor_->cgen_state_->emitExternalCall(
            "fast_fixlen_array_buff",
            // ...
            {target_lv, code_generator.posArg(selected_target_expr)}));
        coords.push_back(executor_->cgen_state_->llInt(int64_t(fixlen)));
        // ...
        coords.push_back(executor_->cgen_state_->emitExternalCall(
            // ...
            {target_lv, code_generator.posArg(selected_target_expr)}));
        coords.push_back(executor_->cgen_state_->emitExternalCall(
            // ...
            code_generator.posArg(selected_target_expr),
            // ...
      }
      // ...
    };
    // ...
    return generate_coord_lvs(agg_expr->get_arg(), true);
    // ...
    return generate_coord_lvs(target_expr,
                              !executor_->plan_state_->allow_lazy_fetch_);
  }
  return agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
                  : code_generator.codegen(
                        target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
}
llvm::Value* GroupByAndAggregate::emitCall(const std::string& fname,
                                           const std::vector<llvm::Value*>& args) {
  return executor_->cgen_state_->emitCall(fname, args);
}

size_t GroupByAndAggregate::shard_count_for_top_groups(
    const RelAlgExecutionUnit& ra_exe_unit,
    const Catalog_Namespace::Catalog& catalog) {
  // ...
  const auto grouped_col_expr =
      // ...
  if (!grouped_col_expr) {
    // ...
  }
  if (grouped_col_expr->get_table_id() <= 0) {
    // ...
  }
  // ...
  if (td->shardedColumnId == grouped_col_expr->get_column_id()) {
    // ...
  }
}