OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
anonymous_namespace{GroupByAndAggregate.cpp} Namespace Reference

Functions

int32_t get_agg_count (const std::vector< Analyzer::Expr * > &target_exprs)
 
bool expr_is_rowid (const Analyzer::Expr *expr)
 
bool has_count_distinct (const RelAlgExecutionUnit &ra_exe_unit)
 
bool is_column_range_too_big_for_perfect_hash (const ColRangeInfo &col_range_info, const int64_t max_entry_count)
 
bool cardinality_estimate_less_than_column_range (const int64_t cardinality_estimate, const ColRangeInfo &col_range_info)
 
ColRangeInfo get_expr_range_info (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const Analyzer::Expr *expr, Executor *executor)
 
int64_t get_bucketed_cardinality_without_nulls (const ColRangeInfo &col_range_info)
 
KeylessInfo get_keyless_info (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const bool is_group_by, Executor *executor)
 
CountDistinctDescriptors init_count_distinct_descriptors (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const ColRangeInfo &group_by_range_info, const ExecutorDeviceType device_type, Executor *executor)
 

Function Documentation

bool anonymous_namespace{GroupByAndAggregate.cpp}::cardinality_estimate_less_than_column_range ( const int64_t  cardinality_estimate,
const ColRangeInfo col_range_info 
)

Definition at line 158 of file GroupByAndAggregate.cpp.

References ColRangeInfo::max, and ColRangeInfo::min.

Referenced by GroupByAndAggregate::getColRangeInfo().

159  {
160  try {
161  // the cardinality estimate is the size of the baseline hash table. further penalize
162  // the baseline hash table by a factor of 2x due to overhead in computing baseline
163  // hash. This has the overall effect of penalizing baseline hash over perfect hash by
164  // 4x; i.e. if the cardinality of the filtered data is less than 25% of the entry
165  // count of the column, we use baseline hash on the filtered set
166  return checked_int64_t(cardinality_estimate) * 2 <
167  static_cast<int64_t>(checked_int64_t(col_range_info.max) -
168  checked_int64_t(col_range_info.min));
169  } catch (...) {
170  return false;
171  }
172 }
boost::multiprecision::number< boost::multiprecision::cpp_int_backend< 64, 64, boost::multiprecision::signed_magnitude, boost::multiprecision::checked, void >> checked_int64_t

+ Here is the caller graph for this function:

bool anonymous_namespace{GroupByAndAggregate.cpp}::expr_is_rowid ( const Analyzer::Expr expr)

Definition at line 125 of file GroupByAndAggregate.cpp.

References CHECK_EQ, and get_column_descriptor_maybe().

Referenced by GroupByAndAggregate::getColRangeInfo().

125  {
126  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
127  if (!col) {
128  return false;
129  }
130  const auto cd = get_column_descriptor_maybe(col->getColumnKey());
131  if (!cd || !cd->isVirtualCol) {
132  return false;
133  }
134  CHECK_EQ("rowid", cd->columnName);
135  return true;
136 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
const ColumnDescriptor * get_column_descriptor_maybe(const shared::ColumnKey &column_key)
Definition: Execute.h:220

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int32_t anonymous_namespace{GroupByAndAggregate.cpp}::get_agg_count ( const std::vector< Analyzer::Expr * > &  target_exprs)

Definition at line 100 of file GroupByAndAggregate.cpp.

References agg_count(), CHECK, Analyzer::Expr::get_type_info(), kAVG, and kSAMPLE.

Referenced by GroupByAndAggregate::codegen().

100  {
101  int32_t agg_count{0};
102  for (auto target_expr : target_exprs) {
103  CHECK(target_expr);
104  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
105  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
106  const auto& ti = target_expr->get_type_info();
107  if (ti.is_buffer()) {
108  agg_count += 2;
109  } else if (ti.is_geometry()) {
110  agg_count += ti.get_physical_coord_cols() * 2;
111  } else {
112  ++agg_count;
113  }
114  continue;
115  }
116  if (agg_expr && agg_expr->get_aggtype() == kAVG) {
117  agg_count += 2;
118  } else {
119  ++agg_count;
120  }
121  }
122  return agg_count;
123 }
const SQLTypeInfo & get_type_info() const
Definition: Analyzer.h:79
#define CHECK(condition)
Definition: Logger.h:291
RUNTIME_EXPORT ALWAYS_INLINE uint64_t agg_count(uint64_t *agg, const int64_t)
Definition: sqldefs.h:74

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int64_t anonymous_namespace{GroupByAndAggregate.cpp}::get_bucketed_cardinality_without_nulls ( const ColRangeInfo col_range_info)

Definition at line 346 of file GroupByAndAggregate.cpp.

References ColRangeInfo::bucket, ColRangeInfo::max, and ColRangeInfo::min.

Referenced by init_count_distinct_descriptors().

346  {
347  if (col_range_info.min <= col_range_info.max) {
348  size_t size = col_range_info.max - col_range_info.min;
349  if (col_range_info.bucket) {
350  size /= col_range_info.bucket;
351  }
352  if (size >= static_cast<size_t>(std::numeric_limits<int64_t>::max())) {
353  // try to use unordered_set instead of crashing due to CHECK failure
354  // i.e., CHECK_LT(size, std::numeric_limits<int64_t>::max());
355  return 0;
356  }
357  return static_cast<int64_t>(size + 1);
358  } else {
359  return 0;
360  }
361 }

+ Here is the caller graph for this function:

ColRangeInfo anonymous_namespace{GroupByAndAggregate.cpp}::get_expr_range_info ( const RelAlgExecutionUnit ra_exe_unit,
const std::vector< InputTableInfo > &  query_infos,
const Analyzer::Expr expr,
Executor executor 
)

Definition at line 174 of file GroupByAndAggregate.cpp.

References CHECK, Double, Float, getExpressionRange(), GroupByBaselineHash, GroupByPerfectHash, Integer, Invalid, NonGroupedAggregate, Projection, and RelAlgExecutionUnit::simple_quals.

Referenced by GroupByAndAggregate::codegenGroupBy(), GroupByAndAggregate::codegenPerfectHashFunction(), GroupByAndAggregate::getColRangeInfo(), GroupByAndAggregate::gpuCanHandleOrderEntries(), and init_count_distinct_descriptors().

177  {
178  if (!expr) {
179  return {QueryDescriptionType::Projection, 0, 0, 0, false};
180  }
181 
182  const auto expr_range = getExpressionRange(
183  expr, query_infos, executor, boost::make_optional(ra_exe_unit.simple_quals));
184  switch (expr_range.getType()) {
186  if (expr_range.getIntMin() > expr_range.getIntMax()) {
187  return {
188  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
189  }
191  expr_range.getIntMin(),
192  expr_range.getIntMax(),
193  expr_range.getBucket(),
194  expr_range.hasNulls()};
195  }
198  if (expr_range.getFpMin() > expr_range.getFpMax()) {
199  return {
200  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
201  }
202  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
203  }
205  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
206  default:
207  CHECK(false);
208  }
209  CHECK(false);
210  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
211 }
ExpressionRange getExpressionRange(const Analyzer::BinOper *expr, const std::vector< InputTableInfo > &query_infos, const Executor *, boost::optional< std::list< std::shared_ptr< Analyzer::Expr >>> simple_quals)
#define CHECK(condition)
Definition: Logger.h:291
std::list< std::shared_ptr< Analyzer::Expr > > simple_quals

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

KeylessInfo anonymous_namespace{GroupByAndAggregate.cpp}::get_keyless_info ( const RelAlgExecutionUnit ra_exe_unit,
const std::vector< InputTableInfo > &  query_infos,
const bool  is_group_by,
Executor executor 
)

This function goes through all target expressions and answers two questions:

  1. Is it possible to have keyless hash?
  2. If yes to 1, then what aggregate expression should be considered to represent the key's presence, if needed (e.g., in detecting empty entries in the result set).

NOTE: Keyless hash is only valid with single-column group by at the moment.

Definition at line 456 of file GroupByAndAggregate.cpp.

References agg_arg(), CHECK, constrained_not_null(), Double, Float, g_bigint_count, get_agg_initial_val(), get_compact_type(), get_target_info(), getExpressionRange(), Integer, Invalid, is_distinct_target(), kAVG, kCOUNT, kMAX, kMIN, kSUM, RelAlgExecutionUnit::quals, takes_float_argument(), and RelAlgExecutionUnit::target_exprs.

Referenced by GroupByAndAggregate::initQueryMemoryDescriptorImpl().

459  {
460  bool keyless{true}, found{false};
461  int32_t num_agg_expr{0};
462  int32_t index{0};
463  for (const auto target_expr : ra_exe_unit.target_exprs) {
464  const auto agg_info = get_target_info(target_expr, g_bigint_count);
465  const auto chosen_type = get_compact_type(agg_info);
466  if (agg_info.is_agg) {
467  num_agg_expr++;
468  }
469  if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
470  auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
471  CHECK(agg_expr);
472  const auto arg_expr = agg_arg(target_expr);
473  const bool float_argument_input = takes_float_argument(agg_info);
474  switch (agg_info.agg_kind) {
475  case kAVG:
476  ++index;
477  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
478  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
479  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
480  expr_range_info.hasNulls()) {
481  break;
482  }
483  }
484  found = true;
485  break;
486  case kCOUNT:
487  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
488  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
489  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
490  expr_range_info.hasNulls()) {
491  break;
492  }
493  }
494  found = true;
495  break;
496  case kSUM: {
497  auto arg_ti = arg_expr->get_type_info();
498  if (constrained_not_null(arg_expr, ra_exe_unit.quals)) {
499  arg_ti.set_notnull(true);
500  }
501  if (!arg_ti.get_notnull()) {
502  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
503  if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
504  !expr_range_info.hasNulls()) {
505  found = true;
506  }
507  } else {
508  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
509  switch (expr_range_info.getType()) {
512  if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
513  found = true;
514  }
515  break;
517  if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
518  found = true;
519  }
520  break;
521  default:
522  break;
523  }
524  }
525  break;
526  }
527  case kMIN: {
528  CHECK(agg_expr && agg_expr->get_arg());
529  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
530  if (arg_ti.is_string() || arg_ti.is_buffer()) {
531  break;
532  }
533  auto expr_range_info =
534  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
535  auto init_max = get_agg_initial_val(agg_info.agg_kind,
536  chosen_type,
537  is_group_by || float_argument_input,
538  float_argument_input ? sizeof(float) : 8);
539  switch (expr_range_info.getType()) {
542  auto double_max =
543  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
544  if (expr_range_info.getFpMax() < double_max) {
545  found = true;
546  }
547  break;
548  }
550  if (expr_range_info.getIntMax() < init_max) {
551  found = true;
552  }
553  break;
554  default:
555  break;
556  }
557  break;
558  }
559  case kMAX: {
560  CHECK(agg_expr && agg_expr->get_arg());
561  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
562  if (arg_ti.is_string() || arg_ti.is_buffer()) {
563  break;
564  }
565  auto expr_range_info =
566  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
567  // NULL sentinel and init value for kMAX are identical, which results in
568  // ambiguity in detecting empty keys in presence of nulls.
569  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
570  expr_range_info.hasNulls()) {
571  break;
572  }
573  auto init_min = get_agg_initial_val(agg_info.agg_kind,
574  chosen_type,
575  is_group_by || float_argument_input,
576  float_argument_input ? sizeof(float) : 8);
577  switch (expr_range_info.getType()) {
580  auto double_min =
581  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
582  if (expr_range_info.getFpMin() > double_min) {
583  found = true;
584  }
585  break;
586  }
588  if (expr_range_info.getIntMin() > init_min) {
589  found = true;
590  }
591  break;
592  default:
593  break;
594  }
595  break;
596  }
597  default:
598  keyless = false;
599  break;
600  }
601  }
602  if (!keyless) {
603  break;
604  }
605  if (!found) {
606  ++index;
607  }
608  }
609 
610  // shouldn't use keyless for projection only
611  return {
612  keyless && found,
613  index,
614  };
615 }
const Analyzer::Expr * agg_arg(const Analyzer::Expr *expr)
std::vector< Analyzer::Expr * > target_exprs
bool constrained_not_null(const Analyzer::Expr *expr, const std::list< std::shared_ptr< Analyzer::Expr >> &quals)
int64_t get_agg_initial_val(const SQLAgg agg, const SQLTypeInfo &ti, const bool enable_compaction, const unsigned min_byte_width_to_compact)
bool takes_float_argument(const TargetInfo &target_info)
Definition: TargetInfo.h:102
TargetInfo get_target_info(const Analyzer::Expr *target_expr, const bool bigint_count)
Definition: TargetInfo.h:88
Definition: sqldefs.h:75
const SQLTypeInfo get_compact_type(const TargetInfo &target)
bool g_bigint_count
Definition: sqldefs.h:77
bool is_distinct_target(const TargetInfo &target_info)
Definition: TargetInfo.h:98
ExpressionRange getExpressionRange(const Analyzer::BinOper *expr, const std::vector< InputTableInfo > &query_infos, const Executor *, boost::optional< std::list< std::shared_ptr< Analyzer::Expr >>> simple_quals)
Definition: sqldefs.h:78
std::list< std::shared_ptr< Analyzer::Expr > > quals
#define CHECK(condition)
Definition: Logger.h:291
Definition: sqldefs.h:76
Definition: sqldefs.h:74

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool anonymous_namespace{GroupByAndAggregate.cpp}::has_count_distinct ( const RelAlgExecutionUnit ra_exe_unit)

Definition at line 138 of file GroupByAndAggregate.cpp.

References g_bigint_count, get_target_info(), is_distinct_target(), and RelAlgExecutionUnit::target_exprs.

Referenced by GroupByAndAggregate::getColRangeInfo().

138  {
139  for (const auto& target_expr : ra_exe_unit.target_exprs) {
140  const auto agg_info = get_target_info(target_expr, g_bigint_count);
141  if (agg_info.is_agg && is_distinct_target(agg_info)) {
142  return true;
143  }
144  }
145  return false;
146 }
std::vector< Analyzer::Expr * > target_exprs
TargetInfo get_target_info(const Analyzer::Expr *target_expr, const bool bigint_count)
Definition: TargetInfo.h:88
bool g_bigint_count
bool is_distinct_target(const TargetInfo &target_info)
Definition: TargetInfo.h:98

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

CountDistinctDescriptors anonymous_namespace{GroupByAndAggregate.cpp}::init_count_distinct_descriptors ( const RelAlgExecutionUnit ra_exe_unit,
const std::vector< InputTableInfo > &  query_infos,
const ColRangeInfo group_by_range_info,
const ExecutorDeviceType  device_type,
Executor executor 
)

Definition at line 617 of file GroupByAndAggregate.cpp.

References align_to_int64(), Bitmap, ColRangeInfo::bucket, CHECK, CHECK_GE, Analyzer::ColumnVar::collect_column_var(), Analyzer::ColumnVar::colvar_comp(), g_bigint_count, g_bitmap_memory_limit, g_enable_watchdog, g_hll_precision_bits, Analyzer::AggExpr::get_arg(), get_bucketed_cardinality_without_nulls(), get_count_distinct_sub_bitmap_count(), DateConverters::get_epoch_days_from_seconds(), get_expr_range_info(), get_target_info(), Analyzer::Expr::get_type_info(), GPU, GroupByPerfectHash, hll_size_for_rate(), Invalid, is_distinct_target(), kAPPROX_COUNT_DISTINCT, kCOUNT, kDATE, kENCODING_DATE_IN_DAYS, kINT, ColRangeInfo::max, ColRangeInfo::min, Projection, RelAlgExecutionUnit::target_exprs, RelAlgExecutionUnit::target_exprs_original_type_infos, and UnorderedSet.

Referenced by GroupByAndAggregate::initQueryMemoryDescriptorImpl().

622  {
623  CountDistinctDescriptors count_distinct_descriptors;
624  auto compute_bytes_per_group =
625  [](size_t bitmap_sz, size_t sub_bitmap_count, ExecutorDeviceType device_type) {
626  size_t effective_size_bytes = (bitmap_sz + 7) / 8;
627  const auto padded_size =
628  (device_type == ExecutorDeviceType::GPU || sub_bitmap_count > 1)
629  ? align_to_int64(effective_size_bytes)
630  : effective_size_bytes;
631  return padded_size * sub_bitmap_count;
632  };
633  for (size_t i = 0; i < ra_exe_unit.target_exprs.size(); i++) {
634  const auto target_expr = ra_exe_unit.target_exprs[i];
635  auto agg_info = get_target_info(target_expr, g_bigint_count);
636  if (is_distinct_target(agg_info)) {
637  CHECK(agg_info.is_agg);
638  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
639  const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
640  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
641  if (arg_ti.is_bytes()) {
642  throw std::runtime_error(
643  "Strings must be dictionary-encoded for COUNT(DISTINCT).");
644  }
645  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_buffer()) {
646  throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
647  }
648  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
649  throw std::runtime_error(
650  "APPROX_COUNT_DISTINCT on geometry columns not supported");
651  }
652  if (agg_info.is_distinct && arg_ti.is_geometry()) {
653  throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
654  }
655  ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
656  auto arg_range_info =
657  arg_ti.is_fp() ? no_range_info
659  ra_exe_unit, query_infos, agg_expr->get_arg(), executor);
660  const auto it = ra_exe_unit.target_exprs_original_type_infos.find(i);
661  if (it != ra_exe_unit.target_exprs_original_type_infos.end()) {
662  const auto& original_target_expr_ti = it->second;
663  if (arg_ti.is_integer() && original_target_expr_ti.get_type() == kDATE &&
664  original_target_expr_ti.get_compression() == kENCODING_DATE_IN_DAYS) {
665  // manually encode the col range of date col if necessary
666  // (see conditionally_change_arg_to_int_type function in RelAlgExecutor.cpp)
667  auto is_date_value_not_encoded = [&original_target_expr_ti](int64_t date_val) {
668  if (original_target_expr_ti.get_comp_param() == 16) {
669  return date_val < INT16_MIN || date_val > INT16_MAX;
670  } else {
671  return date_val < INT32_MIN || date_val > INT32_MIN;
672  }
673  };
674  if (is_date_value_not_encoded(arg_range_info.min)) {
675  // chunk metadata of the date column contains decoded value
676  // so we manually encode it again here to represent its column range correctly
677  arg_range_info.min =
679  }
680  if (is_date_value_not_encoded(arg_range_info.max)) {
681  arg_range_info.max =
683  }
684  // now we manually encode the value, so we need to invalidate bucket value
685  // i.e., 86000 -> 0, to correctly calculate the size of bitmap
686  arg_range_info.bucket = 0;
687  }
688  }
689 
691  int64_t bitmap_sz_bits{0};
692  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
693  const auto error_rate_expr = agg_expr->get_arg1();
694  if (error_rate_expr) {
695  CHECK(error_rate_expr->get_type_info().get_type() == kINT);
696  auto const error_rate =
697  dynamic_cast<Analyzer::Constant const*>(error_rate_expr.get());
698  CHECK(error_rate);
699  CHECK_GE(error_rate->get_constval().intval, 1);
700  bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
701  } else {
702  bitmap_sz_bits = g_hll_precision_bits;
703  }
704  }
705  if (arg_range_info.isEmpty()) {
706  count_distinct_descriptors.emplace_back(
708  0,
709  64,
710  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
711  device_type,
712  1});
713  continue;
714  }
715  const auto sub_bitmap_count =
716  get_count_distinct_sub_bitmap_count(bitmap_sz_bits, ra_exe_unit, device_type);
717  size_t worst_case_num_groups{1};
718  if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
719  !(arg_ti.is_buffer() || arg_ti.is_geometry())) { // TODO(alex): allow bitmap
720  // implementation for arrays
721  count_distinct_impl_type = CountDistinctImplType::Bitmap;
722  if (shared::is_any<kCOUNT, kCOUNT_IF>(agg_info.agg_kind)) {
723  bitmap_sz_bits = get_bucketed_cardinality_without_nulls(arg_range_info);
724  if (bitmap_sz_bits <= 0 || g_bitmap_memory_limit <= bitmap_sz_bits) {
725  count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
726  }
727  // check a potential OOM when using bitmap-based approach
728  const auto total_bytes_per_entry =
729  compute_bytes_per_group(bitmap_sz_bits, sub_bitmap_count, device_type);
730  const auto range_bucket = std::max(group_by_range_info.bucket, (int64_t)1);
731  const auto maximum_num_groups =
732  (group_by_range_info.max - group_by_range_info.min + 1) / range_bucket;
733  const auto total_bitmap_bytes_for_groups =
734  total_bytes_per_entry * maximum_num_groups;
735  // we can estimate a potential OOM of bitmap-based count-distinct operator
736  // by using the logic "check_total_bitmap_memory"
737  if (total_bitmap_bytes_for_groups >=
738  static_cast<size_t>(g_bitmap_memory_limit)) {
739  const auto agg_expr_max_entry_count =
740  arg_range_info.max - arg_range_info.min + 1;
741  int64_t max_agg_expr_table_cardinality{1};
742  std::set<const Analyzer::ColumnVar*,
743  bool (*)(const Analyzer::ColumnVar*, const Analyzer::ColumnVar*)>
745  agg_expr->collect_column_var(colvar_set, true);
746  for (const auto cv : colvar_set) {
747  auto it =
748  std::find_if(query_infos.begin(),
749  query_infos.end(),
750  [&](const auto& input_table_info) {
751  return input_table_info.table_key == cv->getTableKey();
752  });
753  int64_t cur_table_cardinality =
754  it != query_infos.end()
755  ? static_cast<int64_t>(it->info.getNumTuplesUpperBound())
756  : -1;
757  max_agg_expr_table_cardinality =
758  std::max(max_agg_expr_table_cardinality, cur_table_cardinality);
759  worst_case_num_groups *= cur_table_cardinality;
760  }
761  auto has_valid_stat = [agg_expr_max_entry_count, maximum_num_groups]() {
762  return agg_expr_max_entry_count > 0 && maximum_num_groups > 0;
763  };
764  // if we have valid stats regarding input expr, we can try to relax the OOM
765  if (has_valid_stat()) {
766  // a threshold related to a ratio of a range of agg expr (let's say R)
767  // and table cardinality (C), i.e., use unordered_set if the # bits to build
768  // a bitmap based on R is four times larger than that of C
769  const size_t unordered_set_threshold{2};
770  // When we detect OOM of bitmap-based approach we selectively switch it to
771  // hash set-based processing logic if one of the followings is satisfied:
772  // 1) the column range is too wide compared with the table cardinality, or
773  // 2) the column range is too wide compared with the avg of # unique values
774  // per group by entry
775  const auto bits_for_agg_entry = std::ceil(log(agg_expr_max_entry_count));
776  const auto bits_for_agg_table =
777  std::ceil(log(max_agg_expr_table_cardinality));
778  const auto avg_num_unique_entries_per_group =
779  std::ceil(max_agg_expr_table_cardinality / maximum_num_groups);
780  // case a) given a range of entry count of agg_expr and the maximum
781  // cardinality among source tables of the agg_expr , we try to detect the
782  // misleading case of too sparse column range , i.e., agg_expr has 1M column
783  // range but only has two tuples {1 and 1M} / case b) check whether
784  // using bitmap is really beneficial when considering uniform distribution
785  // of (unique) keys.
786  if ((bits_for_agg_entry - bits_for_agg_table) >= unordered_set_threshold ||
787  agg_expr_max_entry_count >= avg_num_unique_entries_per_group) {
788  count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
789  } else {
790  throw std::runtime_error(
791  "Consider using approx_count_distinct operator instead of "
792  "count_distinct operator to lower the memory "
793  "requirements");
794  }
795  }
796  }
797  }
798  }
799  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
800  count_distinct_impl_type == CountDistinctImplType::UnorderedSet &&
801  !(arg_ti.is_array() || arg_ti.is_geometry())) {
802  count_distinct_impl_type = CountDistinctImplType::Bitmap;
803  }
804  const size_t too_many_entries{100000000};
805  if (g_enable_watchdog && !(arg_range_info.isEmpty()) &&
806  worst_case_num_groups > too_many_entries &&
807  count_distinct_impl_type == CountDistinctImplType::UnorderedSet) {
808  throw WatchdogException(
809  "Detect too many input entries for set-based count distinct operator under "
810  "the watchdog");
811  }
812  count_distinct_descriptors.emplace_back(
813  CountDistinctDescriptor{count_distinct_impl_type,
814  arg_range_info.min,
815  bitmap_sz_bits,
816  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
817  device_type,
818  sub_bitmap_count});
819  } else {
820  count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
821  CountDistinctImplType::Invalid, 0, 0, false, device_type, 0});
822  }
823  }
824  return count_distinct_descriptors;
825 }
std::vector< Analyzer::Expr * > target_exprs
ColRangeInfo get_expr_range_info(const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const Analyzer::Expr *expr, Executor *executor)
static bool colvar_comp(const ColumnVar *l, const ColumnVar *r)
Definition: Analyzer.h:215
int hll_size_for_rate(const int err_percent)
Definition: HyperLogLog.h:113
void collect_column_var(std::set< const ColumnVar *, bool(*)(const ColumnVar *, const ColumnVar *)> &colvar_set, bool include_agg) const override
Definition: Analyzer.h:222
ExecutorDeviceType
#define CHECK_GE(x, y)
Definition: Logger.h:306
Expr * get_arg() const
Definition: Analyzer.h:1208
int g_hll_precision_bits
TargetInfo get_target_info(const Analyzer::Expr *target_expr, const bool bigint_count)
Definition: TargetInfo.h:88
size_t get_count_distinct_sub_bitmap_count(const size_t bitmap_sz_bits, const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType device_type)
std::vector< CountDistinctDescriptor > CountDistinctDescriptors
Definition: CountDistinct.h:34
bool g_bigint_count
bool g_enable_watchdog
int64_t g_bitmap_memory_limit
bool is_distinct_target(const TargetInfo &target_info)
Definition: TargetInfo.h:98
const SQLTypeInfo & get_type_info() const
Definition: Analyzer.h:79
Definition: sqltypes.h:70
int64_t get_bucketed_cardinality_without_nulls(const ColRangeInfo &col_range_info)
std::unordered_map< size_t, SQLTypeInfo > target_exprs_original_type_infos
Definition: sqldefs.h:78
CountDistinctImplType
#define CHECK(condition)
Definition: Logger.h:291
int64_t get_epoch_days_from_seconds(const int64_t seconds)
Definition: sqltypes.h:62
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool anonymous_namespace{GroupByAndAggregate.cpp}::is_column_range_too_big_for_perfect_hash ( const ColRangeInfo col_range_info,
const int64_t  max_entry_count 
)

Definition at line 148 of file GroupByAndAggregate.cpp.

References ColRangeInfo::max, and ColRangeInfo::min.

Referenced by GroupByAndAggregate::getColRangeInfo().

149  {
150  try {
151  return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
152  checked_int64_t(col_range_info.min)) >= max_entry_count;
153  } catch (...) {
154  return true;
155  }
156 }
boost::multiprecision::number< boost::multiprecision::cpp_int_backend< 64, 64, boost::multiprecision::signed_magnitude, boost::multiprecision::checked, void >> checked_int64_t

+ Here is the caller graph for this function: