OmniSciDB  a987f07e93
anonymous_namespace{GroupByAndAggregate.cpp} Namespace Reference

Functions

int32_t get_agg_count (const std::vector< Analyzer::Expr * > &target_exprs)
 
bool expr_is_rowid (const Analyzer::Expr *expr, const Catalog_Namespace::Catalog &cat)
 
bool has_count_distinct (const RelAlgExecutionUnit &ra_exe_unit)
 
bool is_column_range_too_big_for_perfect_hash (const ColRangeInfo &col_range_info, const int64_t max_entry_count)
 
bool cardinality_estimate_less_than_column_range (const int64_t cardinality_estimate, const ColRangeInfo &col_range_info)
 
ColRangeInfo get_expr_range_info (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const Analyzer::Expr *expr, Executor *executor)
 
int64_t get_bucketed_cardinality_without_nulls (const ColRangeInfo &col_range_info)
 
KeylessInfo get_keyless_info (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const bool is_group_by, Executor *executor)
 
CountDistinctDescriptors init_count_distinct_descriptors (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const ColRangeInfo &group_by_range_info, const ExecutorDeviceType device_type, Executor *executor)
 

Function Documentation

bool anonymous_namespace{GroupByAndAggregate.cpp}::cardinality_estimate_less_than_column_range(const int64_t cardinality_estimate, const ColRangeInfo& col_range_info)

Definition at line 159 of file GroupByAndAggregate.cpp.

References ColRangeInfo::max, and ColRangeInfo::min.

Referenced by GroupByAndAggregate::getColRangeInfo().

{
  try {
    // the cardinality estimate is the size of the baseline hash table. further penalize
    // the baseline hash table by a factor of 2x due to overhead in computing baseline
    // hash. This has the overall effect of penalizing baseline hash over perfect hash by
    // 4x; i.e. if the cardinality of the filtered data is less than 25% of the entry
    // count of the column, we use baseline hash on the filtered set
    return checked_int64_t(cardinality_estimate) * 2 <
           static_cast<int64_t>(checked_int64_t(col_range_info.max) -
                                checked_int64_t(col_range_info.min));
  } catch (...) {
    return false;
  }
}
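As a standalone illustration with made-up numbers (the real function uses checked_int64_t to guard against overflow, as shown above):

#include <cstdint>
#include <iostream>

// Simplified stand-in for the check above (overflow checking omitted):
// the estimate is doubled to penalize baseline hash before comparing it
// against the width of the column range.
bool estimate_less_than_range(int64_t estimate, int64_t min, int64_t max) {
  return estimate * 2 < max - min;
}

int main() {
  // Range [0, 1000000): an estimate of 200000 passes (400000 < 1000000),
  // while 600000 does not (1200000 >= 1000000).
  std::cout << estimate_less_than_range(200000, 0, 1000000) << '\n';  // 1
  std::cout << estimate_less_than_range(600000, 0, 1000000) << '\n';  // 0
}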


bool anonymous_namespace{GroupByAndAggregate.cpp}::expr_is_rowid(const Analyzer::Expr* expr, const Catalog_Namespace::Catalog& cat)

Definition at line 125 of file GroupByAndAggregate.cpp.

References cat(), CHECK_EQ, and get_column_descriptor_maybe().

Referenced by GroupByAndAggregate::getColRangeInfo().

{
  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
  if (!col) {
    return false;
  }
  const auto cd =
      get_column_descriptor_maybe(col->get_column_id(), col->get_table_id(), cat);
  if (!cd || !cd->isVirtualCol) {
    return false;
  }
  CHECK_EQ("rowid", cd->columnName);
  return true;
}


int32_t anonymous_namespace{GroupByAndAggregate.cpp}::get_agg_count(const std::vector<Analyzer::Expr*>& target_exprs)

Definition at line 100 of file GroupByAndAggregate.cpp.

References agg_count(), CHECK, Analyzer::Expr::get_type_info(), kAVG, and kSAMPLE.

Referenced by GroupByAndAggregate::codegen().

{
  int32_t agg_count{0};
  for (auto target_expr : target_exprs) {
    CHECK(target_expr);
    const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
    if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
      const auto& ti = target_expr->get_type_info();
      if (ti.is_buffer()) {
        agg_count += 2;
      } else if (ti.is_geometry()) {
        agg_count += ti.get_physical_coord_cols() * 2;
      } else {
        ++agg_count;
      }
      continue;
    }
    if (agg_expr && agg_expr->get_aggtype() == kAVG) {
      agg_count += 2;
    } else {
      ++agg_count;
    }
  }
  return agg_count;
}
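The slot-count rules above can be summarized with a small standalone sketch (hypothetical Target struct, not the Analyzer API):

#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical, simplified target descriptor mirroring the rules above.
struct Target {
  bool is_agg;         // aggregate expression other than SAMPLE
  bool is_avg;         // AVG needs two slots (sum and count)
  bool is_buffer;      // arrays/strings need two slots (pointer and length)
  int geo_coord_cols;  // > 0 for geometry projections: 2 slots per coord column
};

int32_t slot_count(const std::vector<Target>& targets) {
  int32_t n{0};
  for (const auto& t : targets) {
    if (!t.is_agg) {
      if (t.is_buffer) {
        n += 2;
      } else if (t.geo_coord_cols > 0) {
        n += t.geo_coord_cols * 2;
      } else {
        ++n;
      }
    } else {
      n += t.is_avg ? 2 : 1;
    }
  }
  return n;
}

int main() {
  // COUNT(*), AVG(x), and a projected array column -> 1 + 2 + 2 = 5 slots
  std::vector<Target> targets{{true, false, false, 0},
                              {true, true, false, 0},
                              {false, false, true, 0}};
  std::cout << slot_count(targets) << '\n';  // 5
}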


int64_t anonymous_namespace{GroupByAndAggregate.cpp}::get_bucketed_cardinality_without_nulls(const ColRangeInfo& col_range_info)

Definition at line 348 of file GroupByAndAggregate.cpp.

References ColRangeInfo::bucket, ColRangeInfo::max, and ColRangeInfo::min.

Referenced by init_count_distinct_descriptors().

{
  if (col_range_info.min <= col_range_info.max) {
    size_t size = col_range_info.max - col_range_info.min;
    if (col_range_info.bucket) {
      size /= col_range_info.bucket;
    }
    if (size >= static_cast<size_t>(std::numeric_limits<int64_t>::max())) {
      // try to use unordered_set instead of crashing due to CHECK failure
      // i.e., CHECK_LT(size, std::numeric_limits<int64_t>::max());
      return 0;
    }
    return static_cast<int64_t>(size + 1);
  } else {
    return 0;
  }
}
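For example (worked numbers, not from the source), a column range [100, 1000] with a bucket size of 10 yields (1000 - 100) / 10 + 1 = 91 entries:

#include <cstdint>
#include <iostream>

// Same arithmetic as above: bucketed cardinality for range [100, 1000], bucket 10.
int main() {
  int64_t min = 100, max = 1000, bucket = 10;
  size_t size = static_cast<size_t>(max - min);
  if (bucket) {
    size /= bucket;
  }
  std::cout << size + 1 << '\n';  // 91
}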


ColRangeInfo anonymous_namespace{GroupByAndAggregate.cpp}::get_expr_range_info(const RelAlgExecutionUnit& ra_exe_unit, const std::vector<InputTableInfo>& query_infos, const Analyzer::Expr* expr, Executor* executor)

Definition at line 175 of file GroupByAndAggregate.cpp.

References CHECK, Double, Float, getExpressionRange(), GroupByBaselineHash, GroupByPerfectHash, Integer, Invalid, NonGroupedAggregate, Projection, and RelAlgExecutionUnit::simple_quals.

Referenced by GroupByAndAggregate::codegenGroupBy(), GroupByAndAggregate::codegenPerfectHashFunction(), GroupByAndAggregate::getColRangeInfo(), GroupByAndAggregate::gpuCanHandleOrderEntries(), and init_count_distinct_descriptors().

{
  if (!expr) {
    return {QueryDescriptionType::Projection, 0, 0, 0, false};
  }

  const auto expr_range = getExpressionRange(
      expr, query_infos, executor, boost::make_optional(ra_exe_unit.simple_quals));
  switch (expr_range.getType()) {
    case ExpressionRangeType::Integer: {
      if (expr_range.getIntMin() > expr_range.getIntMax()) {
        return {
            QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
      }
      return {QueryDescriptionType::GroupByPerfectHash,
              expr_range.getIntMin(),
              expr_range.getIntMax(),
              expr_range.getBucket(),
              expr_range.hasNulls()};
    }
    case ExpressionRangeType::Float:
    case ExpressionRangeType::Double: {
      if (expr_range.getFpMin() > expr_range.getFpMax()) {
        return {
            QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
      }
      return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
    }
    case ExpressionRangeType::Invalid:
      return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
    default:
      CHECK(false);
  }
  CHECK(false);
  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
}
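A compact restatement of the dispatch above, using minimal hypothetical enums rather than the real ExpressionRangeType/QueryDescriptionType definitions:

#include <iostream>

enum class RangeType { Integer, Float, Double, Invalid };
enum class HashType { GroupByPerfectHash, GroupByBaselineHash };

HashType choose_hash(RangeType type, bool inverted_range) {
  switch (type) {
    case RangeType::Integer:
      // A valid integer range admits perfect hash; an inverted (empty)
      // range falls back to baseline hash.
      return inverted_range ? HashType::GroupByBaselineHash
                            : HashType::GroupByPerfectHash;
    default:
      // Floating-point and unknown ranges always use baseline hash.
      return HashType::GroupByBaselineHash;
  }
}

int main() {
  std::cout << (choose_hash(RangeType::Integer, false) ==
                HashType::GroupByPerfectHash)
            << '\n';  // 1
  std::cout << (choose_hash(RangeType::Double, false) ==
                HashType::GroupByBaselineHash)
            << '\n';  // 1
}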


KeylessInfo anonymous_namespace{GroupByAndAggregate.cpp}::get_keyless_info(const RelAlgExecutionUnit& ra_exe_unit, const std::vector<InputTableInfo>& query_infos, const bool is_group_by, Executor* executor)

This function goes through all target expressions and answers two questions:

  1. Is it possible to have keyless hash?
  2. If so, which aggregate expression should represent the key's presence, if needed (e.g., in detecting empty entries in the result set)?

NOTE: Keyless hash is only valid with single-column group by at the moment.

Definition at line 458 of file GroupByAndAggregate.cpp.

References agg_arg(), CHECK, constrained_not_null(), Double, Float, g_bigint_count, get_agg_initial_val(), get_compact_type(), get_target_info(), getExpressionRange(), Integer, Invalid, is_distinct_target(), kAVG, kCOUNT, kMAX, kMIN, kSUM, RelAlgExecutionUnit::quals, takes_float_argument(), and RelAlgExecutionUnit::target_exprs.

Referenced by GroupByAndAggregate::initQueryMemoryDescriptorImpl().

{
  bool keyless{true}, found{false};
  int32_t num_agg_expr{0};
  int32_t index{0};
  for (const auto target_expr : ra_exe_unit.target_exprs) {
    const auto agg_info = get_target_info(target_expr, g_bigint_count);
    const auto chosen_type = get_compact_type(agg_info);
    if (agg_info.is_agg) {
      num_agg_expr++;
    }
    if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
      auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
      CHECK(agg_expr);
      const auto arg_expr = agg_arg(target_expr);
      const bool float_argument_input = takes_float_argument(agg_info);
      switch (agg_info.agg_kind) {
        case kAVG:
          ++index;
          if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
            auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
            if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
                expr_range_info.hasNulls()) {
              break;
            }
          }
          found = true;
          break;
        case kCOUNT:
          if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
            auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
            if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
                expr_range_info.hasNulls()) {
              break;
            }
          }
          found = true;
          break;
        case kSUM: {
          auto arg_ti = arg_expr->get_type_info();
          if (constrained_not_null(arg_expr, ra_exe_unit.quals)) {
            arg_ti.set_notnull(true);
          }
          if (!arg_ti.get_notnull()) {
            auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
            if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
                !expr_range_info.hasNulls()) {
              found = true;
            }
          } else {
            auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
            switch (expr_range_info.getType()) {
              case ExpressionRangeType::Float:
              case ExpressionRangeType::Double:
                if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
                  found = true;
                }
                break;
              case ExpressionRangeType::Integer:
                if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
                  found = true;
                }
                break;
              default:
                break;
            }
          }
          break;
        }
        case kMIN: {
          CHECK(agg_expr && agg_expr->get_arg());
          const auto& arg_ti = agg_expr->get_arg()->get_type_info();
          if (arg_ti.is_string() || arg_ti.is_buffer()) {
            break;
          }
          auto expr_range_info =
              getExpressionRange(agg_expr->get_arg(), query_infos, executor);
          auto init_max = get_agg_initial_val(agg_info.agg_kind,
                                              chosen_type,
                                              is_group_by || float_argument_input,
                                              float_argument_input ? sizeof(float) : 8);
          switch (expr_range_info.getType()) {
            case ExpressionRangeType::Float:
            case ExpressionRangeType::Double: {
              auto double_max =
                  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
              if (expr_range_info.getFpMax() < double_max) {
                found = true;
              }
              break;
            }
            case ExpressionRangeType::Integer:
              if (expr_range_info.getIntMax() < init_max) {
                found = true;
              }
              break;
            default:
              break;
          }
          break;
        }
        case kMAX: {
          CHECK(agg_expr && agg_expr->get_arg());
          const auto& arg_ti = agg_expr->get_arg()->get_type_info();
          if (arg_ti.is_string() || arg_ti.is_buffer()) {
            break;
          }
          auto expr_range_info =
              getExpressionRange(agg_expr->get_arg(), query_infos, executor);
          // NULL sentinel and init value for kMAX are identical, which results in
          // ambiguity in detecting empty keys in presence of nulls.
          if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
              expr_range_info.hasNulls()) {
            break;
          }
          auto init_min = get_agg_initial_val(agg_info.agg_kind,
                                              chosen_type,
                                              is_group_by || float_argument_input,
                                              float_argument_input ? sizeof(float) : 8);
          switch (expr_range_info.getType()) {
            case ExpressionRangeType::Float:
            case ExpressionRangeType::Double: {
              auto double_min =
                  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
              if (expr_range_info.getFpMin() > double_min) {
                found = true;
              }
              break;
            }
            case ExpressionRangeType::Integer:
              if (expr_range_info.getIntMin() > init_min) {
                found = true;
              }
              break;
            default:
              break;
          }
          break;
        }
        default:
          keyless = false;
          break;
      }
    }
    if (!keyless) {
      break;
    }
    if (!found) {
      ++index;
    }
  }

  // shouldn't use keyless for projection only
  return {
      keyless && found,
      index,
  };
}
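To make the note above concrete, here is a minimal sketch (hypothetical query and values, not the engine's actual buffer layout): for a query like SELECT x, COUNT(*) FROM t GROUP BY x, the COUNT slot's initial value (0) can never be produced by a group that received a row, so the slot itself marks empty entries and no separate key column is needed.

#include <cstdint>
#include <iostream>

int main() {
  // One entry of a hypothetical keyless hash table: only the aggregate
  // slot is stored; the group key is implied by the entry's position.
  int64_t count_slot = 0;  // initial value for COUNT
  std::cout << (count_slot == 0) << '\n';  // 1: entry is still empty
  ++count_slot;                            // a row falls into this group
  std::cout << (count_slot == 0) << '\n';  // 0: entry provably has data
}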


bool anonymous_namespace{GroupByAndAggregate.cpp}::has_count_distinct(const RelAlgExecutionUnit& ra_exe_unit)

Definition at line 139 of file GroupByAndAggregate.cpp.

References g_bigint_count, get_target_info(), is_distinct_target(), and RelAlgExecutionUnit::target_exprs.

Referenced by GroupByAndAggregate::getColRangeInfo().

{
  for (const auto& target_expr : ra_exe_unit.target_exprs) {
    const auto agg_info = get_target_info(target_expr, g_bigint_count);
    if (agg_info.is_agg && is_distinct_target(agg_info)) {
      return true;
    }
  }
  return false;
}


CountDistinctDescriptors anonymous_namespace{GroupByAndAggregate.cpp}::init_count_distinct_descriptors(const RelAlgExecutionUnit& ra_exe_unit, const std::vector<InputTableInfo>& query_infos, const ColRangeInfo& group_by_range_info, const ExecutorDeviceType device_type, Executor* executor)

Definition at line 619 of file GroupByAndAggregate.cpp.

References align_to_int64(), Bitmap, ColRangeInfo::bucket, CHECK, CHECK_GE, Analyzer::ColumnVar::collect_column_var(), Analyzer::ColumnVar::colvar_comp(), g_bigint_count, g_bitmap_memory_limit, g_enable_watchdog, g_hll_precision_bits, Analyzer::AggExpr::get_arg(), get_bucketed_cardinality_without_nulls(), get_count_distinct_sub_bitmap_count(), DateConverters::get_epoch_days_from_seconds(), get_expr_range_info(), get_target_info(), Analyzer::Expr::get_type_info(), GPU, GroupByPerfectHash, hll_size_for_rate(), Invalid, is_distinct_target(), kAPPROX_COUNT_DISTINCT, kCOUNT, kDATE, kENCODING_DATE_IN_DAYS, kINT, ColRangeInfo::max, ColRangeInfo::min, Projection, RelAlgExecutionUnit::target_exprs, RelAlgExecutionUnit::target_exprs_original_type_infos, and UnorderedSet.

Referenced by GroupByAndAggregate::initQueryMemoryDescriptorImpl().

{
  CountDistinctDescriptors count_distinct_descriptors;
  auto compute_bytes_per_group =
      [](size_t bitmap_sz, size_t sub_bitmap_count, ExecutorDeviceType device_type) {
        size_t effective_size_bytes = (bitmap_sz + 7) / 8;
        const auto padded_size =
            (device_type == ExecutorDeviceType::GPU || sub_bitmap_count > 1)
                ? align_to_int64(effective_size_bytes)
                : effective_size_bytes;
        return padded_size * sub_bitmap_count;
      };
  for (size_t i = 0; i < ra_exe_unit.target_exprs.size(); i++) {
    const auto target_expr = ra_exe_unit.target_exprs[i];
    auto agg_info = get_target_info(target_expr, g_bigint_count);
    if (is_distinct_target(agg_info)) {
      CHECK(agg_info.is_agg);
      CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
      const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
      const auto& arg_ti = agg_expr->get_arg()->get_type_info();
      if (arg_ti.is_bytes()) {
        throw std::runtime_error(
            "Strings must be dictionary-encoded for COUNT(DISTINCT).");
      }
      if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_buffer()) {
        throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
      }
      if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
        throw std::runtime_error(
            "APPROX_COUNT_DISTINCT on geometry columns not supported");
      }
      if (agg_info.is_distinct && arg_ti.is_geometry()) {
        throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
      }
      ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
      auto arg_range_info =
          arg_ti.is_fp() ? no_range_info
                         : get_expr_range_info(
                               ra_exe_unit, query_infos, agg_expr->get_arg(), executor);
      const auto it = ra_exe_unit.target_exprs_original_type_infos.find(i);
      if (it != ra_exe_unit.target_exprs_original_type_infos.end()) {
        const auto& original_target_expr_ti = it->second;
        if (arg_ti.is_integer() && original_target_expr_ti.get_type() == kDATE &&
            original_target_expr_ti.get_compression() == kENCODING_DATE_IN_DAYS) {
          // manually encode the col range of date col if necessary
          // (see conditionally_change_arg_to_int_type function in RelAlgExecutor.cpp)
          auto is_date_value_not_encoded = [&original_target_expr_ti](int64_t date_val) {
            if (original_target_expr_ti.get_comp_param() == 16) {
              return date_val < INT16_MIN || date_val > INT16_MAX;
            } else {
              return date_val < INT32_MIN || date_val > INT32_MAX;
            }
          };
          if (is_date_value_not_encoded(arg_range_info.min)) {
            // chunk metadata of the date column contains decoded value
            // so we manually encode it again here to represent its column range correctly
            arg_range_info.min =
                DateConverters::get_epoch_days_from_seconds(arg_range_info.min);
          }
          if (is_date_value_not_encoded(arg_range_info.max)) {
            arg_range_info.max =
                DateConverters::get_epoch_days_from_seconds(arg_range_info.max);
          }
          // now we manually encode the value, so we need to invalidate bucket value
          // i.e., 86000 -> 0, to correctly calculate the size of bitmap
          arg_range_info.bucket = 0;
        }
      }

      CountDistinctImplType count_distinct_impl_type{CountDistinctImplType::UnorderedSet};
      int64_t bitmap_sz_bits{0};
      if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
        const auto error_rate_expr = agg_expr->get_arg1();
        if (error_rate_expr) {
          CHECK(error_rate_expr->get_type_info().get_type() == kINT);
          auto const error_rate =
              dynamic_cast<Analyzer::Constant const*>(error_rate_expr.get());
          CHECK(error_rate);
          CHECK_GE(error_rate->get_constval().intval, 1);
          bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
        } else {
          bitmap_sz_bits = g_hll_precision_bits;
        }
      }
      if (arg_range_info.isEmpty()) {
        count_distinct_descriptors.emplace_back(
            CountDistinctDescriptor{CountDistinctImplType::Bitmap,
                                    0,
                                    64,
                                    agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
                                    device_type,
                                    1});
        continue;
      }
      const auto sub_bitmap_count =
          get_count_distinct_sub_bitmap_count(bitmap_sz_bits, ra_exe_unit, device_type);
      size_t worst_case_num_groups{1};
      if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
          !(arg_ti.is_buffer() || arg_ti.is_geometry())) {  // TODO(alex): allow bitmap
                                                            // implementation for arrays
        count_distinct_impl_type = CountDistinctImplType::Bitmap;
        if (shared::is_any<kCOUNT, kCOUNT_IF>(agg_info.agg_kind)) {
          bitmap_sz_bits = get_bucketed_cardinality_without_nulls(arg_range_info);
          if (bitmap_sz_bits <= 0 || g_bitmap_memory_limit <= bitmap_sz_bits) {
            count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
          }
          // check a potential OOM when using bitmap-based approach
          const auto total_bytes_per_entry =
              compute_bytes_per_group(bitmap_sz_bits, sub_bitmap_count, device_type);
          const auto range_bucket = std::max(group_by_range_info.bucket, (int64_t)1);
          const auto maximum_num_groups =
              (group_by_range_info.max - group_by_range_info.min + 1) / range_bucket;
          const auto total_bitmap_bytes_for_groups =
              total_bytes_per_entry * maximum_num_groups;
          // we can estimate a potential OOM of bitmap-based count-distinct operator
          // by using the logic "check_total_bitmap_memory"
          if (total_bitmap_bytes_for_groups >=
              static_cast<size_t>(g_bitmap_memory_limit)) {
            const auto agg_expr_max_entry_count =
                arg_range_info.max - arg_range_info.min + 1;
            int64_t max_agg_expr_table_cardinality{1};
            std::set<const Analyzer::ColumnVar*,
                     bool (*)(const Analyzer::ColumnVar*, const Analyzer::ColumnVar*)>
                colvar_set(Analyzer::ColumnVar::colvar_comp);
            agg_expr->collect_column_var(colvar_set, true);
            for (const auto cv : colvar_set) {
              auto it =
                  std::find_if(query_infos.begin(),
                               query_infos.end(),
                               [&](const auto& input_table_info) {
                                 return input_table_info.table_id == cv->get_table_id();
                               });
              int64_t cur_table_cardinality =
                  it != query_infos.end()
                      ? static_cast<int64_t>(it->info.getNumTuplesUpperBound())
                      : -1;
              max_agg_expr_table_cardinality =
                  std::max(max_agg_expr_table_cardinality, cur_table_cardinality);
              worst_case_num_groups *= cur_table_cardinality;
            }
            auto has_valid_stat = [agg_expr_max_entry_count, maximum_num_groups]() {
              return agg_expr_max_entry_count > 0 && maximum_num_groups > 0;
            };
            // if we have valid stats regarding input expr, we can try to relax the OOM
            if (has_valid_stat()) {
              // a threshold related to a ratio of a range of agg expr (let's say R)
              // and table cardinality (C), i.e., use unordered_set if the # bits to build
              // a bitmap based on R is four times larger than that of C
              const size_t unordered_set_threshold{2};
              // When we detect OOM of bitmap-based approach we selectively switch it to
              // hash set-based processing logic if one of the following is satisfied:
              // 1) the column range is too wide compared with the table cardinality, or
              // 2) the column range is too wide compared with the avg of # unique values
              // per group by entry
              const auto bits_for_agg_entry = std::ceil(log(agg_expr_max_entry_count));
              const auto bits_for_agg_table =
                  std::ceil(log(max_agg_expr_table_cardinality));
              const auto avg_num_unique_entries_per_group =
                  std::ceil(max_agg_expr_table_cardinality / maximum_num_groups);
              // case a) given a range of entry count of agg_expr and the maximum
              // cardinality among source tables of the agg_expr, we try to detect the
              // misleading case of too sparse column range, i.e., agg_expr has 1M column
              // range but only has two tuples {1 and 1M} / case b) check whether
              // using bitmap is really beneficial when considering uniform distribution
              // of (unique) keys
              if ((bits_for_agg_entry - bits_for_agg_table) >= unordered_set_threshold ||
                  agg_expr_max_entry_count >= avg_num_unique_entries_per_group) {
                count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
              } else {
                throw std::runtime_error(
                    "Consider using approx_count_distinct operator instead of "
                    "count_distinct operator to lower the memory "
                    "requirements");
              }
            }
          }
        }
      }
      if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
          count_distinct_impl_type == CountDistinctImplType::UnorderedSet &&
          !(arg_ti.is_array() || arg_ti.is_geometry())) {
        count_distinct_impl_type = CountDistinctImplType::Bitmap;
      }
      const size_t too_many_entries{100000000};
      if (g_enable_watchdog && !(arg_range_info.isEmpty()) &&
          worst_case_num_groups > too_many_entries &&
          count_distinct_impl_type == CountDistinctImplType::UnorderedSet) {
        throw WatchdogException(
            "Detect too many input entries for set-based count distinct operator under "
            "the watchdog");
      }
      count_distinct_descriptors.emplace_back(
          CountDistinctDescriptor{count_distinct_impl_type,
                                  arg_range_info.min,
                                  bitmap_sz_bits,
                                  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
                                  device_type,
                                  sub_bitmap_count});
    } else {
      count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
          CountDistinctImplType::Invalid, 0, 0, false, device_type, 0});
    }
  }
  return count_distinct_descriptors;
}
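The per-group memory estimate above can be checked with a small standalone sketch mirroring compute_bytes_per_group (worked numbers; align_to_int64 reimplemented here for illustration):

#include <cstddef>
#include <iostream>

// Round up to the next multiple of 8 bytes, as the real align_to_int64 does.
size_t align_to_int64(size_t n) {
  return (n + 7) & ~size_t(7);
}

// Same arithmetic as compute_bytes_per_group above: round the bitmap up to
// whole bytes, then pad to an 8-byte boundary on GPU or when sub-bitmaps are
// in use.
size_t bytes_per_group(size_t bitmap_sz_bits, size_t sub_bitmap_count, bool gpu) {
  const size_t effective = (bitmap_sz_bits + 7) / 8;
  const size_t padded =
      (gpu || sub_bitmap_count > 1) ? align_to_int64(effective) : effective;
  return padded * sub_bitmap_count;
}

int main() {
  // A COUNT(DISTINCT) over a column range of 1000 values needs a 1000-bit
  // bitmap: 125 bytes on CPU, padded to 128 bytes per sub-bitmap on GPU.
  std::cout << bytes_per_group(1000, 1, /*gpu=*/false) << '\n';  // 125
  std::cout << bytes_per_group(1000, 4, /*gpu=*/true) << '\n';   // 512
}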


bool anonymous_namespace{GroupByAndAggregate.cpp}::is_column_range_too_big_for_perfect_hash(const ColRangeInfo& col_range_info, const int64_t max_entry_count)

Definition at line 149 of file GroupByAndAggregate.cpp.

References ColRangeInfo::max, and ColRangeInfo::min.

Referenced by GroupByAndAggregate::getColRangeInfo().

{
  try {
    return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
                                checked_int64_t(col_range_info.min)) >= max_entry_count;
  } catch (...) {
    return true;
  }
}
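A standalone restatement with made-up numbers (checked arithmetic omitted): a perfect-hash table needs one entry per value in [min, max], so a range wider than the allowed entry count rules perfect hash out.

#include <cstdint>
#include <iostream>

bool range_too_big(int64_t min, int64_t max, int64_t max_entry_count) {
  return max - min >= max_entry_count;
}

int main() {
  std::cout << range_too_big(0, 1000, 2000) << '\n';  // 0: perfect hash fits
  std::cout << range_too_big(0, 5000, 2000) << '\n';  // 1: range too wide
}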
