OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
anonymous_namespace{GroupByAndAggregate.cpp} Namespace Reference

Functions

int32_t get_agg_count (const std::vector< Analyzer::Expr * > &target_exprs)
 
bool expr_is_rowid (const Analyzer::Expr *expr)
 
bool has_count_distinct (const RelAlgExecutionUnit &ra_exe_unit)
 
bool is_column_range_too_big_for_perfect_hash (const ColRangeInfo &col_range_info, const int64_t max_entry_count)
 
bool cardinality_estimate_less_than_column_range (const int64_t cardinality_estimate, const ColRangeInfo &col_range_info)
 
ColRangeInfo get_expr_range_info (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const Analyzer::Expr *expr, Executor *executor)
 
int64_t get_bucketed_cardinality_without_nulls (const ColRangeInfo &col_range_info)
 
KeylessInfo get_keyless_info (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const bool is_group_by, Executor *executor)
 
CountDistinctDescriptors init_count_distinct_descriptors (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const ColRangeInfo &group_by_range_info, const ExecutorDeviceType device_type, Executor *executor)
 

Function Documentation

bool anonymous_namespace{GroupByAndAggregate.cpp}::cardinality_estimate_less_than_column_range ( const int64_t  cardinality_estimate,
const ColRangeInfo col_range_info 
)

Definition at line 158 of file GroupByAndAggregate.cpp.

References ColRangeInfo::max, and ColRangeInfo::min.

Referenced by GroupByAndAggregate::getColRangeInfo().

159  {
160  try {
161  // the cardinality estimate is the size of the baseline hash table. further penalize
162  // the baseline hash table by a factor of 2x due to overhead in computing baseline
163  // hash. This has the overall effect of penalizing baseline hash over perfect hash by
164  // 4x; i.e. if the cardinality of the filtered data is less than 25% of the entry
165  // count of the column, we use baseline hash on the filtered set
166  return checked_int64_t(cardinality_estimate) * 2 <
167  static_cast<int64_t>(checked_int64_t(col_range_info.max) -
168  checked_int64_t(col_range_info.min));
169  } catch (...) {
170  return false;
171  }
172 }
boost::multiprecision::number< boost::multiprecision::cpp_int_backend< 64, 64, boost::multiprecision::signed_magnitude, boost::multiprecision::checked, void >> checked_int64_t

+ Here is the caller graph for this function:

bool anonymous_namespace{GroupByAndAggregate.cpp}::expr_is_rowid ( const Analyzer::Expr expr)

Definition at line 125 of file GroupByAndAggregate.cpp.

References CHECK_EQ, and get_column_descriptor_maybe().

Referenced by GroupByAndAggregate::getColRangeInfo().

125  {
126  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
127  if (!col) {
128  return false;
129  }
130  const auto cd = get_column_descriptor_maybe(col->getColumnKey());
131  if (!cd || !cd->isVirtualCol) {
132  return false;
133  }
134  CHECK_EQ("rowid", cd->columnName);
135  return true;
136 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
const ColumnDescriptor * get_column_descriptor_maybe(const shared::ColumnKey &column_key)
Definition: Execute.h:241

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int32_t anonymous_namespace{GroupByAndAggregate.cpp}::get_agg_count ( const std::vector< Analyzer::Expr * > &  target_exprs)

Definition at line 100 of file GroupByAndAggregate.cpp.

References agg_count(), CHECK, Analyzer::Expr::get_type_info(), kAVG, and kSAMPLE.

Referenced by GroupByAndAggregate::codegen().

100  {
101  int32_t agg_count{0};
102  for (auto target_expr : target_exprs) {
103  CHECK(target_expr);
104  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
105  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
106  const auto& ti = target_expr->get_type_info();
107  if (ti.is_buffer()) {
108  agg_count += 2;
109  } else if (ti.is_geometry()) {
110  agg_count += ti.get_physical_coord_cols() * 2;
111  } else {
112  ++agg_count;
113  }
114  continue;
115  }
116  if (agg_expr && agg_expr->get_aggtype() == kAVG) {
117  agg_count += 2;
118  } else {
119  ++agg_count;
120  }
121  }
122  return agg_count;
123 }
const SQLTypeInfo & get_type_info() const
Definition: Analyzer.h:79
#define CHECK(condition)
Definition: Logger.h:291
RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count(uint64_t *agg, const int64_t)
Definition: sqldefs.h:74

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int64_t anonymous_namespace{GroupByAndAggregate.cpp}::get_bucketed_cardinality_without_nulls ( const ColRangeInfo col_range_info)

Definition at line 365 of file GroupByAndAggregate.cpp.

References ColRangeInfo::bucket, ColRangeInfo::max, and ColRangeInfo::min.

Referenced by init_count_distinct_descriptors().

365  {
366  if (col_range_info.min <= col_range_info.max) {
367  size_t size = col_range_info.max - col_range_info.min;
368  if (col_range_info.bucket) {
369  size /= col_range_info.bucket;
370  }
371  if (size >= static_cast<size_t>(std::numeric_limits<int64_t>::max())) {
372  // try to use unordered_set instead of crashing due to CHECK failure
373  // i.e., CHECK_LT(size, std::numeric_limits<int64_t>::max());
374  return 0;
375  }
376  return static_cast<int64_t>(size + 1);
377  } else {
378  return 0;
379  }
380 }

+ Here is the caller graph for this function:

ColRangeInfo anonymous_namespace{GroupByAndAggregate.cpp}::get_expr_range_info ( const RelAlgExecutionUnit ra_exe_unit,
const std::vector< InputTableInfo > &  query_infos,
const Analyzer::Expr expr,
Executor executor 
)

Definition at line 174 of file GroupByAndAggregate.cpp.

References CHECK, Double, Float, getExpressionRange(), GroupByBaselineHash, GroupByPerfectHash, Integer, Invalid, NonGroupedAggregate, Projection, and RelAlgExecutionUnit::simple_quals.

Referenced by GroupByAndAggregate::codegenGroupBy(), GroupByAndAggregate::codegenPerfectHashFunction(), GroupByAndAggregate::getColRangeInfo(), GroupByAndAggregate::gpuCanHandleOrderEntries(), and init_count_distinct_descriptors().

177  {
178  if (!expr) {
179  return {QueryDescriptionType::Projection, 0, 0, 0, false};
180  }
181 
182  const auto expr_range = getExpressionRange(
183  expr, query_infos, executor, boost::make_optional(ra_exe_unit.simple_quals));
184  switch (expr_range.getType()) {
185  case ExpressionRangeType::Integer: {
186  if (expr_range.getIntMin() > expr_range.getIntMax()) {
187  return {
188  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
189  }
190  return {QueryDescriptionType::GroupByPerfectHash,
191  expr_range.getIntMin(),
192  expr_range.getIntMax(),
193  expr_range.getBucket(),
194  expr_range.hasNulls()};
195  }
196  case ExpressionRangeType::Float:
197  case ExpressionRangeType::Double: {
198  if (expr_range.getFpMin() > expr_range.getFpMax()) {
199  return {
200  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
201  }
202  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
203  }
204  case ExpressionRangeType::Invalid:
205  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
206  default:
207  CHECK(false);
208  }
209  CHECK(false);
210  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
211 }
ExpressionRange getExpressionRange(const Analyzer::BinOper *expr, const std::vector< InputTableInfo > &query_infos, const Executor *, boost::optional< std::list< std::shared_ptr< Analyzer::Expr >>> simple_quals)
#define CHECK(condition)
Definition: Logger.h:291
std::list< std::shared_ptr< Analyzer::Expr > > simple_quals

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

KeylessInfo anonymous_namespace{GroupByAndAggregate.cpp}::get_keyless_info ( const RelAlgExecutionUnit ra_exe_unit,
const std::vector< InputTableInfo > &  query_infos,
const bool  is_group_by,
Executor executor 
)

This function goes through all target expressions and answers two questions:

  1. Is it possible to have keyless hash?
  2. If yes to 1, then what aggregate expression should be considered to represent the key's presence, if needed (e.g., in detecting empty entries in the result set).

NOTE: Keyless hash is only valid with single-column group by at the moment.

Definition at line 475 of file GroupByAndAggregate.cpp.

References agg_arg(), CHECK, constrained_not_null(), Double, Float, g_bigint_count, get_agg_initial_val(), get_compact_type(), get_target_info(), getExpressionRange(), Integer, Invalid, is_distinct_target(), kAVG, kCOUNT, kMAX, kMIN, kSUM, RelAlgExecutionUnit::quals, takes_float_argument(), and RelAlgExecutionUnit::target_exprs.

Referenced by GroupByAndAggregate::initQueryMemoryDescriptorImpl().

478  {
479  bool keyless{true}, found{false};
480  int32_t num_agg_expr{0};
481  int32_t index{0};
482  for (const auto target_expr : ra_exe_unit.target_exprs) {
483  const auto agg_info = get_target_info(target_expr, g_bigint_count);
484  const auto chosen_type = get_compact_type(agg_info);
485  if (agg_info.is_agg) {
486  num_agg_expr++;
487  }
488  if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
489  auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
490  CHECK(agg_expr);
491  const auto arg_expr = agg_arg(target_expr);
492  const bool float_argument_input = takes_float_argument(agg_info);
493  switch (agg_info.agg_kind) {
494  case kAVG:
495  ++index;
496  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
497  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
498  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
499  expr_range_info.hasNulls()) {
500  break;
501  }
502  }
503  found = true;
504  break;
505  case kCOUNT:
506  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
507  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
508  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
509  expr_range_info.hasNulls()) {
510  break;
511  }
512  }
513  found = true;
514  break;
515  case kSUM: {
516  auto arg_ti = arg_expr->get_type_info();
517  if (constrained_not_null(arg_expr, ra_exe_unit.quals)) {
518  arg_ti.set_notnull(true);
519  }
520  if (!arg_ti.get_notnull()) {
521  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
522  if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
523  !expr_range_info.hasNulls()) {
524  found = true;
525  }
526  } else {
527  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
528  switch (expr_range_info.getType()) {
529  case ExpressionRangeType::Float:
530  case ExpressionRangeType::Double:
531  if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
532  found = true;
533  }
534  break;
535  case ExpressionRangeType::Integer:
536  if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
537  found = true;
538  }
539  break;
540  default:
541  break;
542  }
543  }
544  break;
545  }
546  case kMIN: {
547  CHECK(agg_expr && agg_expr->get_arg());
548  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
549  if (arg_ti.is_string() || arg_ti.is_buffer()) {
550  break;
551  }
552  auto expr_range_info =
553  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
554  auto init_max = get_agg_initial_val(agg_info.agg_kind,
555  chosen_type,
556  is_group_by || float_argument_input,
557  float_argument_input ? sizeof(float) : 8);
558  switch (expr_range_info.getType()) {
559  case ExpressionRangeType::Float:
560  case ExpressionRangeType::Double: {
561  auto double_max =
562  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
563  if (expr_range_info.getFpMax() < double_max) {
564  found = true;
565  }
566  break;
567  }
568  case ExpressionRangeType::Integer:
569  if (expr_range_info.getIntMax() < init_max) {
570  found = true;
571  }
572  break;
573  default:
574  break;
575  }
576  break;
577  }
578  case kMAX: {
579  CHECK(agg_expr && agg_expr->get_arg());
580  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
581  if (arg_ti.is_string() || arg_ti.is_buffer()) {
582  break;
583  }
584  auto expr_range_info =
585  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
586  // NULL sentinel and init value for kMAX are identical, which results in
587  // ambiguity in detecting empty keys in presence of nulls.
588  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
589  expr_range_info.hasNulls()) {
590  break;
591  }
592  auto init_min = get_agg_initial_val(agg_info.agg_kind,
593  chosen_type,
594  is_group_by || float_argument_input,
595  float_argument_input ? sizeof(float) : 8);
596  switch (expr_range_info.getType()) {
597  case ExpressionRangeType::Float:
598  case ExpressionRangeType::Double: {
599  auto double_min =
600  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
601  if (expr_range_info.getFpMin() > double_min) {
602  found = true;
603  }
604  break;
605  }
606  case ExpressionRangeType::Integer:
607  if (expr_range_info.getIntMin() > init_min) {
608  found = true;
609  }
610  break;
611  default:
612  break;
613  }
614  break;
615  }
616  default:
617  keyless = false;
618  break;
619  }
620  }
621  if (!keyless) {
622  break;
623  }
624  if (!found) {
625  ++index;
626  }
627  }
628 
629  // shouldn't use keyless for projection only
630  return {
631  keyless && found,
632  index,
633  };
634 }
const Analyzer::Expr * agg_arg(const Analyzer::Expr *expr)
std::vector< Analyzer::Expr * > target_exprs
bool constrained_not_null(const Analyzer::Expr *expr, const std::list< std::shared_ptr< Analyzer::Expr >> &quals)
int64_t get_agg_initial_val(const SQLAgg agg, const SQLTypeInfo &ti, const bool enable_compaction, const unsigned min_byte_width_to_compact)
bool takes_float_argument(const TargetInfo &target_info)
Definition: TargetInfo.h:106
TargetInfo get_target_info(const Analyzer::Expr *target_expr, const bool bigint_count)
Definition: TargetInfo.h:92
Definition: sqldefs.h:75
const SQLTypeInfo get_compact_type(const TargetInfo &target)
bool g_bigint_count
Definition: sqldefs.h:77
bool is_distinct_target(const TargetInfo &target_info)
Definition: TargetInfo.h:102
ExpressionRange getExpressionRange(const Analyzer::BinOper *expr, const std::vector< InputTableInfo > &query_infos, const Executor *, boost::optional< std::list< std::shared_ptr< Analyzer::Expr >>> simple_quals)
Definition: sqldefs.h:78
std::list< std::shared_ptr< Analyzer::Expr > > quals
#define CHECK(condition)
Definition: Logger.h:291
Definition: sqldefs.h:76
Definition: sqldefs.h:74

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool anonymous_namespace{GroupByAndAggregate.cpp}::has_count_distinct ( const RelAlgExecutionUnit ra_exe_unit)

Definition at line 138 of file GroupByAndAggregate.cpp.

References g_bigint_count, get_target_info(), is_distinct_target(), and RelAlgExecutionUnit::target_exprs.

Referenced by GroupByAndAggregate::getColRangeInfo().

138  {
139  for (const auto& target_expr : ra_exe_unit.target_exprs) {
140  const auto agg_info = get_target_info(target_expr, g_bigint_count);
141  if (agg_info.is_agg && is_distinct_target(agg_info)) {
142  return true;
143  }
144  }
145  return false;
146 }
std::vector< Analyzer::Expr * > target_exprs
TargetInfo get_target_info(const Analyzer::Expr *target_expr, const bool bigint_count)
Definition: TargetInfo.h:92
bool g_bigint_count
bool is_distinct_target(const TargetInfo &target_info)
Definition: TargetInfo.h:102

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

CountDistinctDescriptors anonymous_namespace{GroupByAndAggregate.cpp}::init_count_distinct_descriptors ( const RelAlgExecutionUnit ra_exe_unit,
const std::vector< InputTableInfo > &  query_infos,
const ColRangeInfo group_by_range_info,
const ExecutorDeviceType  device_type,
Executor executor 
)

Definition at line 636 of file GroupByAndAggregate.cpp.

References align_to_int64(), Bitmap, ColRangeInfo::bucket, CHECK, CHECK_GE, Analyzer::ColumnVar::collect_column_var(), Analyzer::ColumnVar::colvar_comp(), g_bigint_count, g_bitmap_memory_limit, g_enable_watchdog, g_hll_precision_bits, Analyzer::AggExpr::get_arg(), get_bucketed_cardinality_without_nulls(), get_count_distinct_sub_bitmap_count(), DateConverters::get_epoch_days_from_seconds(), get_expr_range_info(), get_target_info(), Analyzer::Expr::get_type_info(), GPU, GroupByPerfectHash, hll_size_for_rate(), Invalid, is_distinct_target(), kAPPROX_COUNT_DISTINCT, kCOUNT, kDATE, kENCODING_DATE_IN_DAYS, kINT, ColRangeInfo::max, ColRangeInfo::min, Projection, RelAlgExecutionUnit::target_exprs, RelAlgExecutionUnit::target_exprs_original_type_infos, and UnorderedSet.

Referenced by GroupByAndAggregate::initQueryMemoryDescriptorImpl().

641  {
642  CountDistinctDescriptors count_distinct_descriptors;
643  auto compute_bytes_per_group =
644  [](size_t bitmap_sz, size_t sub_bitmap_count, ExecutorDeviceType device_type) {
645  size_t effective_size_bytes = (bitmap_sz + 7) / 8;
646  const auto padded_size =
647  (device_type == ExecutorDeviceType::GPU || sub_bitmap_count > 1)
648  ? align_to_int64(effective_size_bytes)
649  : effective_size_bytes;
650  return padded_size * sub_bitmap_count;
651  };
652  for (size_t i = 0; i < ra_exe_unit.target_exprs.size(); i++) {
653  const auto target_expr = ra_exe_unit.target_exprs[i];
654  auto agg_info = get_target_info(target_expr, g_bigint_count);
655  if (is_distinct_target(agg_info)) {
656  CHECK(agg_info.is_agg);
657  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
658  const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
659  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
660  if (arg_ti.is_text_encoding_none()) {
661  throw std::runtime_error(
662  "Strings must be dictionary-encoded for COUNT(DISTINCT).");
663  }
664  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_buffer()) {
665  throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
666  }
667  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
668  throw std::runtime_error(
669  "APPROX_COUNT_DISTINCT on geometry columns not supported");
670  }
671  if (agg_info.is_distinct && arg_ti.is_geometry()) {
672  throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
673  }
674  ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
675  auto arg_range_info =
676  arg_ti.is_fp() ? no_range_info
677  : get_expr_range_info(
678  ra_exe_unit, query_infos, agg_expr->get_arg(), executor);
679  const auto it = ra_exe_unit.target_exprs_original_type_infos.find(i);
680  if (it != ra_exe_unit.target_exprs_original_type_infos.end()) {
681  const auto& original_target_expr_ti = it->second;
682  if (arg_ti.is_integer() && original_target_expr_ti.get_type() == kDATE &&
683  original_target_expr_ti.get_compression() == kENCODING_DATE_IN_DAYS) {
684  // manually encode the col range of date col if necessary
685  // (see conditionally_change_arg_to_int_type function in RelAlgExecutor.cpp)
686  auto is_date_value_not_encoded = [&original_target_expr_ti](int64_t date_val) {
687  if (original_target_expr_ti.get_comp_param() == 16) {
688  return date_val < INT16_MIN || date_val > INT16_MAX;
689  } else {
690  return date_val < INT32_MIN || date_val > INT32_MAX;
691  }
692  };
693  if (is_date_value_not_encoded(arg_range_info.min)) {
694  // chunk metadata of the date column contains decoded value
695  // so we manually encode it again here to represent its column range correctly
696  arg_range_info.min =
697  DateConverters::get_epoch_days_from_seconds(arg_range_info.min);
698  }
699  if (is_date_value_not_encoded(arg_range_info.max)) {
700  arg_range_info.max =
701  DateConverters::get_epoch_days_from_seconds(arg_range_info.max);
702  }
703  // now we manually encode the value, so we need to invalidate bucket value
704  // i.e., 86000 -> 0, to correctly calculate the size of bitmap
705  arg_range_info.bucket = 0;
706  }
707  }
708 
709  CountDistinctImplType count_distinct_impl_type{CountDistinctImplType::UnorderedSet};
710  int64_t bitmap_sz_bits{0};
711  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
712  const auto error_rate_expr = agg_expr->get_arg1();
713  if (error_rate_expr) {
714  CHECK(error_rate_expr->get_type_info().get_type() == kINT);
715  auto const error_rate =
716  dynamic_cast<Analyzer::Constant const*>(error_rate_expr.get());
717  CHECK(error_rate);
718  CHECK_GE(error_rate->get_constval().intval, 1);
719  bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
720  } else {
721  bitmap_sz_bits = g_hll_precision_bits;
722  }
723  }
724  if (arg_range_info.isEmpty()) {
725  count_distinct_descriptors.emplace_back(
726  CountDistinctDescriptor{count_distinct_impl_type,
727  0,
728  arg_range_info.bucket,
729  64,
730  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
731  device_type,
732  1});
733  continue;
734  }
735  const auto sub_bitmap_count =
736  get_count_distinct_sub_bitmap_count(bitmap_sz_bits, ra_exe_unit, device_type);
737  size_t worst_case_num_groups{1};
738  if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
739  !(arg_ti.is_buffer() || arg_ti.is_geometry())) { // TODO(alex): allow bitmap
740  // implementation for arrays
741  count_distinct_impl_type = CountDistinctImplType::Bitmap;
742  if (shared::is_any<kCOUNT, kCOUNT_IF>(agg_info.agg_kind)) {
743  bitmap_sz_bits = get_bucketed_cardinality_without_nulls(arg_range_info);
744  if (bitmap_sz_bits <= 0 || g_bitmap_memory_limit <= bitmap_sz_bits) {
745  count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
746  }
747  // check a potential OOM when using bitmap-based approach
748  const auto total_bytes_per_entry =
749  compute_bytes_per_group(bitmap_sz_bits, sub_bitmap_count, device_type);
750  const auto range_bucket = std::max(group_by_range_info.bucket, (int64_t)1);
751  const auto maximum_num_groups =
752  (group_by_range_info.max - group_by_range_info.min + 1) / range_bucket;
753  const auto total_bitmap_bytes_for_groups =
754  total_bytes_per_entry * maximum_num_groups;
755  // we can estimate a potential OOM of bitmap-based count-distinct operator
756  // by using the logic "check_total_bitmap_memory"
757  if (total_bitmap_bytes_for_groups >=
758  static_cast<size_t>(g_bitmap_memory_limit)) {
759  const auto agg_expr_max_entry_count =
760  arg_range_info.max - arg_range_info.min + 1;
761  int64_t max_agg_expr_table_cardinality{1};
762  std::set<const Analyzer::ColumnVar*,
763  bool (*)(const Analyzer::ColumnVar*, const Analyzer::ColumnVar*)>
764  colvar_set(Analyzer::ColumnVar::colvar_comp);
765  agg_expr->collect_column_var(colvar_set, true);
766  for (const auto cv : colvar_set) {
767  auto it =
768  std::find_if(query_infos.begin(),
769  query_infos.end(),
770  [&](const auto& input_table_info) {
771  return input_table_info.table_key == cv->getTableKey();
772  });
773  int64_t cur_table_cardinality =
774  it != query_infos.end()
775  ? static_cast<int64_t>(it->info.getNumTuplesUpperBound())
776  : -1;
777  max_agg_expr_table_cardinality =
778  std::max(max_agg_expr_table_cardinality, cur_table_cardinality);
779  worst_case_num_groups *= cur_table_cardinality;
780  }
781  auto has_valid_stat = [agg_expr_max_entry_count, maximum_num_groups]() {
782  return agg_expr_max_entry_count > 0 && maximum_num_groups > 0;
783  };
784  // if we have valid stats regarding input expr, we can try to relax the OOM
785  if (has_valid_stat()) {
786  // a threshold related to a ratio of a range of agg expr (let's say R)
787  // and table cardinality (C), i.e., use unordered_set if the # bits to build
788  // a bitmap based on R is four times larger than that of C
789  const size_t unordered_set_threshold{2};
790  // When we detect OOM of bitmap-based approach we selectively switch it to
791  // hash set-based processing logic if one of the followings is satisfied:
792  // 1) the column range is too wide compared with the table cardinality, or
793  // 2) the column range is too wide compared with the avg of # unique values
794  // per group by entry
795  const auto bits_for_agg_entry = std::ceil(log(agg_expr_max_entry_count));
796  const auto bits_for_agg_table =
797  std::ceil(log(max_agg_expr_table_cardinality));
798  const auto avg_num_unique_entries_per_group =
799  std::ceil(max_agg_expr_table_cardinality / maximum_num_groups);
800  // case a) given a range of entry count of agg_expr and the maximum
801  // cardinality among source tables of the agg_expr , we try to detect the
802  // misleading case of too sparse column range , i.e., agg_expr has 1M column
803  // range but only has two tuples {1 and 1M} / case b) check whether
804  // using bitmap is really beneficial when considering uniform distribution
805  // of (unique) keys.
806  if ((bits_for_agg_entry - bits_for_agg_table) >= unordered_set_threshold ||
807  agg_expr_max_entry_count >= avg_num_unique_entries_per_group) {
808  count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
809  } else {
810  throw std::runtime_error(
811  "Consider using approx_count_distinct operator instead of "
812  "count_distinct operator to lower the memory "
813  "requirements");
814  }
815  }
816  }
817  }
818  }
819  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
820  count_distinct_impl_type == CountDistinctImplType::UnorderedSet &&
821  !(arg_ti.is_array() || arg_ti.is_geometry())) {
822  count_distinct_impl_type = CountDistinctImplType::Bitmap;
823  }
824  const size_t too_many_entries{100000000};
825  if (g_enable_watchdog && !(arg_range_info.isEmpty()) &&
826  worst_case_num_groups > too_many_entries &&
827  count_distinct_impl_type == CountDistinctImplType::UnorderedSet) {
828  throw WatchdogException(
829  "Detect too many input entries for set-based count distinct operator under "
830  "the watchdog");
831  }
832  count_distinct_descriptors.emplace_back(
833  CountDistinctDescriptor{count_distinct_impl_type,
834  arg_range_info.min,
835  arg_range_info.bucket,
836  bitmap_sz_bits,
837  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
838  device_type,
839  sub_bitmap_count});
840  } else {
841  count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
842  CountDistinctImplType::Invalid, 0, 0, 0, false, device_type, 0});
843  }
844  }
845  return count_distinct_descriptors;
846 }
std::vector< Analyzer::Expr * > target_exprs
ColRangeInfo get_expr_range_info(const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const Analyzer::Expr *expr, Executor *executor)
static bool colvar_comp(const ColumnVar *l, const ColumnVar *r)
Definition: Analyzer.h:215
int hll_size_for_rate(const int err_percent)
Definition: HyperLogLog.h:113
void collect_column_var(std::set< const ColumnVar *, bool(*)(const ColumnVar *, const ColumnVar *)> &colvar_set, bool include_agg) const override
Definition: Analyzer.h:222
#define CHECK_GE(x, y)
Definition: Logger.h:306
Expr * get_arg() const
Definition: Analyzer.h:1330
int g_hll_precision_bits
TargetInfo get_target_info(const Analyzer::Expr *target_expr, const bool bigint_count)
Definition: TargetInfo.h:92
ExecutorDeviceType
size_t get_count_distinct_sub_bitmap_count(const size_t bitmap_sz_bits, const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType device_type)
std::vector< CountDistinctDescriptor > CountDistinctDescriptors
Definition: CountDistinct.h:34
bool g_bigint_count
bool g_enable_watchdog
int64_t g_bitmap_memory_limit
bool is_distinct_target(const TargetInfo &target_info)
Definition: TargetInfo.h:102
const SQLTypeInfo & get_type_info() const
Definition: Analyzer.h:79
Definition: sqltypes.h:80
int64_t get_bucketed_cardinality_without_nulls(const ColRangeInfo &col_range_info)
std::unordered_map< size_t, SQLTypeInfo > target_exprs_original_type_infos
Definition: sqldefs.h:78
CountDistinctImplType
#define CHECK(condition)
Definition: Logger.h:291
int64_t get_epoch_days_from_seconds(const int64_t seconds)
Definition: sqltypes.h:72
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool anonymous_namespace{GroupByAndAggregate.cpp}::is_column_range_too_big_for_perfect_hash ( const ColRangeInfo col_range_info,
const int64_t  max_entry_count 
)

Definition at line 148 of file GroupByAndAggregate.cpp.

References ColRangeInfo::max, and ColRangeInfo::min.

Referenced by GroupByAndAggregate::getColRangeInfo().

149  {
150  try {
151  return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
152  checked_int64_t(col_range_info.min)) >= max_entry_count;
153  } catch (...) {
154  return true;
155  }
156 }
boost::multiprecision::number< boost::multiprecision::cpp_int_backend< 64, 64, boost::multiprecision::signed_magnitude, boost::multiprecision::checked, void >> checked_int64_t

+ Here is the caller graph for this function: