OmniSciDB  c0231cc57d
GroupByAndAggregate.cpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "GroupByAndAggregate.h"
18 #include "AggregateUtils.h"
19 
20 #include "CardinalityEstimator.h"
21 #include "CodeGenerator.h"
23 #include "ExpressionRange.h"
24 #include "ExpressionRewrite.h"
25 #include "GpuInitGroups.h"
26 #include "InPlaceSort.h"
28 #include "MaxwellCodegenPatch.h"
30 #include "TargetExprBuilder.h"
31 
32 #include "../CudaMgr/CudaMgr.h"
33 #include "../Shared/checked_alloc.h"
34 #include "../Shared/funcannotations.h"
35 #include "../Utils/ChunkIter.h"
37 #include "Execute.h"
38 #include "QueryTemplateGenerator.h"
39 #include "RuntimeFunctions.h"
40 #include "StreamingTopN.h"
41 #include "TopKSort.h"
42 #include "WindowContext.h"
43 
44 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
45 
46 #include <cstring> // strcat()
47 #include <limits>
48 #include <numeric>
49 #include <string_view>
50 #include <thread>
51 
52 bool g_cluster{false};
53 bool g_bigint_count{false};
56 extern int64_t g_bitmap_memory_limit;
57 extern size_t g_leaf_count;
58 
59 bool ColRangeInfo::isEmpty() const {
60  return min == 0 && max == -1;
61 }
62 
63 std::ostream& operator<<(std::ostream& out, const ColRangeInfo& info) {
64  out << "Hash Type = " << info.hash_type_ << " min = " << info.min
65  << " max = " << info.max << " bucket = " << info.bucket
66  << " has_nulls = " << info.has_nulls << "\n";
67  return out;
68 }
69 
70 std::ostream& operator<<(std::ostream& out, const CountDistinctImplType& type) {
71  switch (type) {
72  case CountDistinctImplType::Invalid:
73  out << "Invalid";
74  break;
75  case CountDistinctImplType::Bitmap:
76  out << "Bitmap";
77  break;
78  case CountDistinctImplType::UnorderedSet:
79  out << "UnorderedSet";
80  break;
81  default:
82  out << "<Unknown Type>";
83  break;
84  }
85  return out;
86 }
87 
88 std::ostream& operator<<(std::ostream& out, const CountDistinctDescriptor& desc) {
89  out << "Type = " << desc.impl_type_ << " min val = " << desc.min_val
90  << " bitmap_sz_bits = " << desc.bitmap_sz_bits
91  << " bool approximate = " << desc.approximate
92  << " device_type = " << desc.device_type
93  << " sub_bitmap_count = " << desc.sub_bitmap_count;
94  return out;
95 }
96 
97 namespace {
98 
99 int32_t get_agg_count(const std::vector<Analyzer::Expr*>& target_exprs) {
100  int32_t agg_count{0};
101  for (auto target_expr : target_exprs) {
102  CHECK(target_expr);
103  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
104  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
105  const auto& ti = target_expr->get_type_info();
106  if (ti.is_buffer()) {
107  agg_count += 2;
108  } else if (ti.is_geometry()) {
109  agg_count += ti.get_physical_coord_cols() * 2;
110  } else {
111  ++agg_count;
112  }
113  continue;
114  }
115  if (agg_expr && agg_expr->get_aggtype() == kAVG) {
116  agg_count += 2;
117  } else {
118  ++agg_count;
119  }
120  }
121  return agg_count;
122 }
123 
124 bool expr_is_rowid(const Analyzer::Expr* expr, const Catalog_Namespace::Catalog& cat) {
125  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
126  if (!col) {
127  return false;
128  }
129  const auto cd =
130  get_column_descriptor_maybe(col->get_column_id(), col->get_table_id(), cat);
131  if (!cd || !cd->isVirtualCol) {
132  return false;
133  }
134  CHECK_EQ("rowid", cd->columnName);
135  return true;
136 }
137 
138 bool has_count_distinct(const RelAlgExecutionUnit& ra_exe_unit) {
139  for (const auto& target_expr : ra_exe_unit.target_exprs) {
140  const auto agg_info = get_target_info(target_expr, g_bigint_count);
141  if (agg_info.is_agg && is_distinct_target(agg_info)) {
142  return true;
143  }
144  }
145  return false;
146 }
147 
148 bool is_column_range_too_big_for_perfect_hash(const ColRangeInfo& col_range_info,
149  const int64_t max_entry_count) {
150  try {
151  return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
152  checked_int64_t(col_range_info.min)) >= max_entry_count;
153  } catch (...) {
154  return true;
155  }
156 }
157 
158 bool cardinality_estimate_less_than_column_range(const int64_t cardinality_estimate,
159  const ColRangeInfo& col_range_info) {
160  try {
161  // the cardinality estimate is the size of the baseline hash table. further penalize
162  // the baseline hash table by a factor of 2x due to overhead in computing baseline
163  // hash. This has the overall effect of penalizing baseline hash over perfect hash by
164  // 4x; i.e. if the cardinality of the filtered data is less than 25% of the entry
165  // count of the column, we use baseline hash on the filtered set
166  return checked_int64_t(cardinality_estimate) * 2 <
167  static_cast<int64_t>(checked_int64_t(col_range_info.max) -
168  checked_int64_t(col_range_info.min));
169  } catch (...) {
170  return false;
171  }
172 }
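A worked example of the 2x penalty above, as a hypothetical standalone sketch (the helper name and numbers are illustrative, not part of this file): for a column range of width 1000, a filtered cardinality estimate below 500 selects baseline hash, anything larger keeps perfect hash.

#include <cassert>
#include <cstdint>

// Mirrors cardinality_estimate_less_than_column_range() for plain int64_t values.
bool prefers_baseline_hash(int64_t cardinality_estimate, int64_t min, int64_t max) {
  return cardinality_estimate * 2 < max - min;  // the estimate is penalized by 2x
}

int main() {
  assert(prefers_baseline_hash(400, 0, 1000));   // 800 < 1000: use baseline hash
  assert(!prefers_baseline_hash(600, 0, 1000));  // 1200 >= 1000: keep perfect hash
  return 0;
}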
173 
174 ColRangeInfo get_expr_range_info(const RelAlgExecutionUnit& ra_exe_unit,
175  const std::vector<InputTableInfo>& query_infos,
176  const Analyzer::Expr* expr,
177  Executor* executor) {
178  if (!expr) {
179  return {QueryDescriptionType::Projection, 0, 0, 0, false};
180  }
181 
182  const auto expr_range = getExpressionRange(
183  expr, query_infos, executor, boost::make_optional(ra_exe_unit.simple_quals));
184  switch (expr_range.getType()) {
185  case ExpressionRangeType::Integer: {
186  if (expr_range.getIntMin() > expr_range.getIntMax()) {
187  return {
188  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
189  }
190  return {QueryDescriptionType::GroupByPerfectHash,
191  expr_range.getIntMin(),
192  expr_range.getIntMax(),
193  expr_range.getBucket(),
194  expr_range.hasNulls()};
195  }
196  case ExpressionRangeType::Float:
197  case ExpressionRangeType::Double: {
198  if (expr_range.getFpMin() > expr_range.getFpMax()) {
199  return {
200  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
201  }
202  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
203  }
204  case ExpressionRangeType::Invalid:
205  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
206  default:
207  CHECK(false);
208  }
209  CHECK(false);
210  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
211 }
212 
213 } // namespace
214 
215 ColRangeInfo GroupByAndAggregate::getColRangeInfo() {
216  // Use baseline layout more eagerly on the GPU if the query uses count distinct,
217  // because our HyperLogLog implementation is 4x less memory efficient on GPU.
218  // Technically, this only applies to APPROX_COUNT_DISTINCT, but in practice we
219  // can expect this to be true anyway for grouped queries since the precise version
220  // uses significantly more memory.
221  const int64_t baseline_threshold =
226  if (ra_exe_unit_.groupby_exprs.size() != 1) {
227  try {
228  checked_int64_t cardinality{1};
229  bool has_nulls{false};
230  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
231  auto col_range_info = get_expr_range_info(
232  ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
233  if (col_range_info.hash_type_ != QueryDescriptionType::GroupByPerfectHash) {
234  // going through baseline hash if a non-integer type is encountered
235  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
236  }
237  auto crt_col_cardinality = getBucketedCardinality(col_range_info);
238  CHECK_GE(crt_col_cardinality, 0);
239  cardinality *= crt_col_cardinality;
240  if (col_range_info.has_nulls) {
241  has_nulls = true;
242  }
243  }
244  // For zero or high cardinalities, use baseline layout.
245  if (!cardinality || cardinality > baseline_threshold) {
246  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
247  }
248  return {QueryDescriptionType::GroupByPerfectHash,
249  0,
250  int64_t(cardinality),
251  0,
252  has_nulls};
253  } catch (...) { // overflow when computing cardinality
254  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
255  }
256  }
257  // For single-column group by on high-precision timestamps, force baseline hash due to wide ranges
258  // we are likely to encounter when applying quals to the expression range
259  // TODO: consider allowing TIMESTAMP(9) (nanoseconds) with quals to use perfect hash if
260  // the range is small enough
261  if (ra_exe_unit_.groupby_exprs.front() &&
262  ra_exe_unit_.groupby_exprs.front()->get_type_info().is_high_precision_timestamp() &&
263  ra_exe_unit_.simple_quals.size() > 0) {
264  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
265  }
266  const auto col_range_info = get_expr_range_info(
267  ra_exe_unit_, query_infos_, ra_exe_unit_.groupby_exprs.front().get(), executor_);
268  if (!ra_exe_unit_.groupby_exprs.front()) {
269  return col_range_info;
270  }
271  static const int64_t MAX_BUFFER_SIZE = 1 << 30;
272  const int64_t col_count =
273  ra_exe_unit_.groupby_exprs.size() + ra_exe_unit_.target_exprs.size();
274  int64_t max_entry_count = MAX_BUFFER_SIZE / (col_count * sizeof(int64_t));
275  if (has_count_distinct(ra_exe_unit_)) {
276  max_entry_count = std::min(max_entry_count, baseline_threshold);
277  }
278  const auto& groupby_expr_ti = ra_exe_unit_.groupby_exprs.front()->get_type_info();
279  if (groupby_expr_ti.is_string() && !col_range_info.bucket) {
280  CHECK(groupby_expr_ti.get_compression() == kENCODING_DICT);
281 
282  const bool has_filters =
283  !ra_exe_unit_.quals.empty() || !ra_exe_unit_.simple_quals.empty();
284  if (has_filters &&
285  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
286  // if filters are present, we can use the filter to narrow the cardinality of the
287  // group by in the case of ranges too big for perfect hash. Otherwise, we are better
288  // off attempting perfect hash (since we know the range will be made of
289  // monotonically increasing numbers from min to max for dictionary encoded strings)
290  // and failing later due to excessive memory use.
291  // Check the conditions where baseline hash can provide a performance increase and
292  // return baseline hash (potentially forcing an estimator query) as the range type.
293  // Otherwise, return col_range_info which will likely be perfect hash, though could
294  // be baseline from a previous call of this function prior to the estimator query.
295  if (!ra_exe_unit_.sort_info.order_entries.empty()) {
296  // TODO(adb): allow some sorts to pass through this block by centralizing sort
297  // algorithm decision making
298  if (has_count_distinct(ra_exe_unit_) &&
299  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
300  // always use baseline hash for column range too big for perfect hash with count
301  // distinct descriptors. We will need 8GB of CPU memory minimum for the perfect
302  // hash group by in this case.
303  return {QueryDescriptionType::GroupByBaselineHash,
304  col_range_info.min,
305  col_range_info.max,
306  0,
307  col_range_info.has_nulls};
308  } else {
309  // use original col range for sort
310  return col_range_info;
311  }
312  }
313  // if filters are present and the filtered range is less than the cardinality of
314  // the column, consider baseline hash
315  if (group_cardinality_estimation_ &&
316  cardinality_estimate_less_than_column_range(*group_cardinality_estimation_,
317  col_range_info)) {
318  return {QueryDescriptionType::GroupByBaselineHash,
319  col_range_info.min,
320  col_range_info.max,
321  0,
322  col_range_info.has_nulls};
323  }
324  }
325  } else if ((!expr_is_rowid(ra_exe_unit_.groupby_exprs.front().get(),
326  *executor_->catalog_)) &&
327  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count) &&
328  !col_range_info.bucket) {
329  return {QueryDescriptionType::GroupByBaselineHash,
330  col_range_info.min,
331  col_range_info.max,
332  0,
333  col_range_info.has_nulls};
334  }
335  return col_range_info;
336 }
337 
338 int64_t GroupByAndAggregate::getBucketedCardinality(const ColRangeInfo& col_range_info) {
339  checked_int64_t crt_col_cardinality =
340  checked_int64_t(col_range_info.max) - checked_int64_t(col_range_info.min);
341  if (col_range_info.bucket) {
342  crt_col_cardinality /= col_range_info.bucket;
343  }
344  return static_cast<int64_t>(crt_col_cardinality +
345  (1 + (col_range_info.has_nulls ? 1 : 0)));
346 }
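The arithmetic above, as a hypothetical standalone sketch (RangeSketch and bucketed_cardinality are illustrative stand-ins for ColRangeInfo and getBucketedCardinality):

#include <cassert>
#include <cstdint>

struct RangeSketch {
  int64_t min, max, bucket;
  bool has_nulls;
};

int64_t bucketed_cardinality(const RangeSketch& info) {
  int64_t card = info.max - info.min;
  if (info.bucket) {
    card /= info.bucket;  // one entry per bucket, not per raw value
  }
  return card + 1 + (info.has_nulls ? 1 : 0);  // +1 inclusive range, +1 null slot
}

int main() {
  assert(bucketed_cardinality({100, 199, 10, true}) == 11);  // 99/10 = 9, +1, +1
  assert(bucketed_cardinality({0, 6, 0, false}) == 7);       // values 0..6 inclusive
  return 0;
}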
347 
348 namespace {
349 // Like getBucketedCardinality() without counting nulls.
350 int64_t get_bucketed_cardinality_without_nulls(const ColRangeInfo& col_range_info) {
351  if (col_range_info.min <= col_range_info.max) {
352  size_t size = col_range_info.max - col_range_info.min;
353  if (col_range_info.bucket) {
354  size /= col_range_info.bucket;
355  }
356  CHECK_LT(size, std::numeric_limits<int64_t>::max());
357  return static_cast<int64_t>(size + 1);
358  } else {
359  return 0;
360  }
361 }
362 } // namespace
363 
364 #define LL_CONTEXT executor_->cgen_state_->context_
365 #define LL_BUILDER executor_->cgen_state_->ir_builder_
366 #define LL_BOOL(v) executor_->cgen_state_->llBool(v)
367 #define LL_INT(v) executor_->cgen_state_->llInt(v)
368 #define LL_FP(v) executor_->cgen_state_->llFp(v)
369 #define ROW_FUNC executor_->cgen_state_->row_func_
370 #define CUR_FUNC executor_->cgen_state_->current_func_
371 
372 GroupByAndAggregate::GroupByAndAggregate(
373  Executor* executor,
374  const ExecutorDeviceType device_type,
375  const RelAlgExecutionUnit& ra_exe_unit,
376  const std::vector<InputTableInfo>& query_infos,
377  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
378  const std::optional<int64_t>& group_cardinality_estimation)
379  : executor_(executor)
380  , ra_exe_unit_(ra_exe_unit)
381  , query_infos_(query_infos)
382  , row_set_mem_owner_(row_set_mem_owner)
383  , device_type_(device_type)
384  , group_cardinality_estimation_(group_cardinality_estimation) {
385  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
386  if (!groupby_expr) {
387  continue;
388  }
389  const auto& groupby_ti = groupby_expr->get_type_info();
390  if (groupby_ti.is_bytes()) {
391  throw std::runtime_error(
392  "Cannot group by string columns which are not dictionary encoded.");
393  }
394  if (groupby_ti.is_buffer()) {
395  throw std::runtime_error("Group by buffer not supported");
396  }
397  if (groupby_ti.is_geometry()) {
398  throw std::runtime_error("Group by geometry not supported");
399  }
400  }
401 }
402 
403 int64_t GroupByAndAggregate::getShardedTopBucket(const ColRangeInfo& col_range_info,
404  const size_t shard_count) const {
405  size_t device_count{0};
406  if (device_type_ == ExecutorDeviceType::GPU) {
407  device_count = executor_->cudaMgr()->getDeviceCount();
408  CHECK_GT(device_count, 0u);
409  }
410 
411  int64_t bucket{col_range_info.bucket};
412 
413  if (shard_count) {
414  CHECK(!col_range_info.bucket);
415  /*
416  when a node has fewer devices than shard count,
417  a) In a distributed setup, the minimum distance between two keys would be
418  device_count because shards are stored consecutively across the physical tables,
419  i.e if a shard column has values 0 to 9, and 3 shards on each leaf, then node 1
420  would have values: 0,1,2,6,7,8 and node 2 would have values: 3,4,5,9. If each leaf
421  node has only 1 device, in this case, all the keys from each node are loaded on
422  the device each.
423 
424  b) In a single node setup, the distance would be minimum of device_count or
425  difference of shard_count - device_count. For example: If a single node server
426  running on 3 devices a shard column has values 0 to 9 in a table with 4 shards,
427  device to fragment keys mapping would be: device 1 - 4,8,3,7 device 2 - 1,5,9
428  device 3 - 2, 6 The bucket value would be 4(shards) - 3(devices) = 1 i.e. minimum
429  of device_count or difference.
430 
431  When a node has device count equal to or more than shard count then the
432  minimum distance is always at least shard_count * no of leaf nodes.
433  */
434  if (device_count < shard_count) {
435  bucket = g_leaf_count ? std::max(device_count, static_cast<size_t>(1))
436  : std::min(device_count, shard_count - device_count);
437  } else {
438  bucket = shard_count * std::max(g_leaf_count, static_cast<size_t>(1));
439  }
440  }
441 
442  return bucket;
443 }
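A hedged sketch of the bucket selection above, with g_leaf_count passed explicitly (leaf_count == 0 models a single-node setup); the helper name is illustrative, not part of this file.

#include <algorithm>
#include <cassert>
#include <cstddef>

size_t sharded_top_bucket(size_t device_count, size_t shard_count, size_t leaf_count) {
  if (device_count < shard_count) {
    return leaf_count ? std::max(device_count, size_t(1))
                      : std::min(device_count, shard_count - device_count);
  }
  return shard_count * std::max(leaf_count, size_t(1));
}

int main() {
  assert(sharded_top_bucket(3, 4, 0) == 1);  // single node, 3 GPUs, 4 shards: min(3, 1)
  assert(sharded_top_bucket(4, 4, 0) == 4);  // devices >= shards: stride is shard_count
  assert(sharded_top_bucket(1, 3, 2) == 1);  // distributed, fewer devices than shards
  return 0;
}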
444 
445 namespace {
446 
456 KeylessInfo get_keyless_info(const RelAlgExecutionUnit& ra_exe_unit,
457  const std::vector<InputTableInfo>& query_infos,
458  const bool is_group_by,
459  Executor* executor) {
460  bool keyless{true}, found{false};
461  int32_t num_agg_expr{0};
462  int32_t index{0};
463  for (const auto target_expr : ra_exe_unit.target_exprs) {
464  const auto agg_info = get_target_info(target_expr, g_bigint_count);
465  const auto chosen_type = get_compact_type(agg_info);
466  if (agg_info.is_agg) {
467  num_agg_expr++;
468  }
469  if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
470  auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
471  CHECK(agg_expr);
472  const auto arg_expr = agg_arg(target_expr);
473  const bool float_argument_input = takes_float_argument(agg_info);
474  switch (agg_info.agg_kind) {
475  case kAVG:
476  ++index;
477  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
478  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
479  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
480  expr_range_info.hasNulls()) {
481  break;
482  }
483  }
484  found = true;
485  break;
486  case kCOUNT:
487  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
488  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
489  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
490  expr_range_info.hasNulls()) {
491  break;
492  }
493  }
494  found = true;
495  break;
496  case kSUM: {
497  auto arg_ti = arg_expr->get_type_info();
498  if (constrained_not_null(arg_expr, ra_exe_unit.quals)) {
499  arg_ti.set_notnull(true);
500  }
501  if (!arg_ti.get_notnull()) {
502  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
503  if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
504  !expr_range_info.hasNulls()) {
505  found = true;
506  }
507  } else {
508  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
509  switch (expr_range_info.getType()) {
510  case ExpressionRangeType::Float:
511  case ExpressionRangeType::Double:
512  if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
513  found = true;
514  }
515  break;
516  case ExpressionRangeType::Integer:
517  if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
518  found = true;
519  }
520  break;
521  default:
522  break;
523  }
524  }
525  break;
526  }
527  case kMIN: {
528  CHECK(agg_expr && agg_expr->get_arg());
529  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
530  if (arg_ti.is_string() || arg_ti.is_buffer()) {
531  break;
532  }
533  auto expr_range_info =
534  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
535  auto init_max = get_agg_initial_val(agg_info.agg_kind,
536  chosen_type,
537  is_group_by || float_argument_input,
538  float_argument_input ? sizeof(float) : 8);
539  switch (expr_range_info.getType()) {
540  case ExpressionRangeType::Float:
541  case ExpressionRangeType::Double: {
542  auto double_max =
543  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
544  if (expr_range_info.getFpMax() < double_max) {
545  found = true;
546  }
547  break;
548  }
549  case ExpressionRangeType::Integer:
550  if (expr_range_info.getIntMax() < init_max) {
551  found = true;
552  }
553  break;
554  default:
555  break;
556  }
557  break;
558  }
559  case kMAX: {
560  CHECK(agg_expr && agg_expr->get_arg());
561  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
562  if (arg_ti.is_string() || arg_ti.is_buffer()) {
563  break;
564  }
565  auto expr_range_info =
566  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
567  // NULL sentinel and init value for kMAX are identical, which results in
568  // ambiguity in detecting empty keys in presence of nulls.
569  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
570  expr_range_info.hasNulls()) {
571  break;
572  }
573  auto init_min = get_agg_initial_val(agg_info.agg_kind,
574  chosen_type,
575  is_group_by || float_argument_input,
576  float_argument_input ? sizeof(float) : 8);
577  switch (expr_range_info.getType()) {
578  case ExpressionRangeType::Float:
579  case ExpressionRangeType::Double: {
580  auto double_min =
581  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
582  if (expr_range_info.getFpMin() > double_min) {
583  found = true;
584  }
585  break;
586  }
587  case ExpressionRangeType::Integer:
588  if (expr_range_info.getIntMin() > init_min) {
589  found = true;
590  }
591  break;
592  default:
593  break;
594  }
595  break;
596  }
597  default:
598  keyless = false;
599  break;
600  }
601  }
602  if (!keyless) {
603  break;
604  }
605  if (!found) {
606  ++index;
607  }
608  }
609 
610  // shouldn't use keyless for projection only
611  return {
612  keyless && found,
613  index,
614  };
615 }
616 
617 CountDistinctDescriptors init_count_distinct_descriptors(
618  const RelAlgExecutionUnit& ra_exe_unit,
619  const std::vector<InputTableInfo>& query_infos,
620  const ExecutorDeviceType device_type,
621  Executor* executor) {
622  CountDistinctDescriptors count_distinct_descriptors;
623  for (size_t i = 0; i < ra_exe_unit.target_exprs.size(); i++) {
624  const auto target_expr = ra_exe_unit.target_exprs[i];
625  auto agg_info = get_target_info(target_expr, g_bigint_count);
626  if (is_distinct_target(agg_info)) {
627  CHECK(agg_info.is_agg);
628  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
629  const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
630  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
631  if (arg_ti.is_bytes()) {
632  throw std::runtime_error(
633  "Strings must be dictionary-encoded for COUNT(DISTINCT).");
634  }
635  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_buffer()) {
636  throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
637  }
638  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
639  throw std::runtime_error(
640  "APPROX_COUNT_DISTINCT on geometry columns not supported");
641  }
642  if (agg_info.is_distinct && arg_ti.is_geometry()) {
643  throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
644  }
645  ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
646  auto arg_range_info =
647  arg_ti.is_fp() ? no_range_info
648  : get_expr_range_info(
649  ra_exe_unit, query_infos, agg_expr->get_arg(), executor);
650  const auto it = ra_exe_unit.target_exprs_original_type_infos.find(i);
651  if (it != ra_exe_unit.target_exprs_original_type_infos.end()) {
652  const auto& original_target_expr_ti = it->second;
653  if (arg_ti.is_integer() && original_target_expr_ti.get_type() == kDATE &&
654  original_target_expr_ti.get_compression() == kENCODING_DATE_IN_DAYS) {
655  // manually encode the col range of date col if necessary
656  // (see conditionally_change_arg_to_int_type function in RelAlgExecutor.cpp)
657  auto is_date_value_not_encoded = [&original_target_expr_ti](int64_t date_val) {
658  if (original_target_expr_ti.get_comp_param() == 16) {
659  return date_val < INT16_MIN || date_val > INT16_MAX;
660  } else {
661  return date_val < INT32_MIN || date_val > INT32_MAX;
662  }
663  };
664  if (is_date_value_not_encoded(arg_range_info.min)) {
665  // chunk metadata of the date column contains decoded value
666  // so we manually encode it again here to represent its column range correctly
667  arg_range_info.min =
669  }
670  if (is_date_value_not_encoded(arg_range_info.max)) {
671  arg_range_info.max =
673  }
674  // now we manually encode the value, so we need to invalidate bucket value
675  // i.e., 86000 -> 0, to correctly calculate the size of bitmap
676  arg_range_info.bucket = 0;
677  }
678  }
679 
680  auto count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
681  int64_t bitmap_sz_bits{0};
682  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
683  const auto error_rate = agg_expr->get_arg1();
684  if (error_rate) {
685  CHECK(error_rate->get_type_info().get_type() == kINT);
686  CHECK_GE(error_rate->get_constval().intval, 1);
687  bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
688  } else {
689  bitmap_sz_bits = g_hll_precision_bits;
690  }
691  }
692  if (arg_range_info.isEmpty()) {
693  count_distinct_descriptors.emplace_back(
694  CountDistinctDescriptor{CountDistinctImplType::Bitmap,
695  0,
696  64,
697  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
698  device_type,
699  1});
700  continue;
701  }
702  if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
703  !(arg_ti.is_buffer() || arg_ti.is_geometry())) { // TODO(alex): allow bitmap
704  // implementation for arrays
705  count_distinct_impl_type = CountDistinctImplType::Bitmap;
706  if (agg_info.agg_kind == kCOUNT) {
707  bitmap_sz_bits = get_bucketed_cardinality_without_nulls(arg_range_info);
708  if (bitmap_sz_bits <= 0 || g_bitmap_memory_limit <= bitmap_sz_bits) {
709  count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
710  }
711  }
712  }
713  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
714  count_distinct_impl_type == CountDistinctImplType::UnorderedSet &&
715  !(arg_ti.is_array() || arg_ti.is_geometry())) {
716  count_distinct_impl_type = CountDistinctImplType::Bitmap;
717  }
718 
719  if (g_enable_watchdog && !(arg_range_info.isEmpty()) &&
720  count_distinct_impl_type == CountDistinctImplType::UnorderedSet) {
721  throw WatchdogException("Cannot use a fast path for COUNT distinct");
722  }
723  const auto sub_bitmap_count =
724  get_count_distinct_sub_bitmap_count(bitmap_sz_bits, ra_exe_unit, device_type);
725  count_distinct_descriptors.emplace_back(
726  CountDistinctDescriptor{count_distinct_impl_type,
727  arg_range_info.min,
728  bitmap_sz_bits,
729  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
730  device_type,
731  sub_bitmap_count});
732  } else {
733  count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
734  CountDistinctImplType::Invalid, 0, 0, false, device_type, 0});
735  }
736  }
737  return count_distinct_descriptors;
738 }
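A simplified sketch of the Bitmap-versus-UnorderedSet decision above for exact COUNT(DISTINCT): roughly one bit per possible bucketed value, with a fallback to a per-group set when the range is unknown or the bitmap would exceed the memory limit. Names and the limit constant are illustrative, not the actual g_bitmap_memory_limit default.

#include <cassert>
#include <cstdint>

enum class ImplSketch { Bitmap, UnorderedSet };

ImplSketch choose_count_distinct_impl(int64_t range_min,
                                      int64_t range_max,
                                      int64_t bitmap_memory_limit_bits) {
  const int64_t bitmap_sz_bits = range_max - range_min + 1;
  if (bitmap_sz_bits <= 0 || bitmap_memory_limit_bits <= bitmap_sz_bits) {
    return ImplSketch::UnorderedSet;  // range unknown or bitmap would be too large
  }
  return ImplSketch::Bitmap;
}

int main() {
  // A range of one million values -> ~122 KB of bitmap per group, well under an 8-gigabit cap.
  assert(choose_count_distinct_impl(0, 999999, int64_t(8) << 30) == ImplSketch::Bitmap);
  // A range wider than the cap falls back to the per-group set.
  assert(choose_count_distinct_impl(0, int64_t(1) << 40, int64_t(8) << 30) ==
         ImplSketch::UnorderedSet);
  return 0;
}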
739 
740 } // namespace
741 
742 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptor(
743  const bool allow_multifrag,
744  const size_t max_groups_buffer_entry_count,
745  const int8_t crt_min_byte_width,
746  RenderInfo* render_info,
747  const bool output_columnar_hint) {
748  const auto shard_count =
751  : 0;
752  bool sort_on_gpu_hint =
753  device_type_ == ExecutorDeviceType::GPU && allow_multifrag &&
756  // must_use_baseline_sort is true iff we'd sort on GPU with the old algorithm
757  // but the total output buffer size would be too big or it's a sharded top query.
758  // For the sake of managing risk, use the new result set way very selectively for
759  // this case only (alongside the baseline layout we've enabled for a while now).
760  bool must_use_baseline_sort = shard_count;
761  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
762  while (true) {
763  query_mem_desc = initQueryMemoryDescriptorImpl(allow_multifrag,
764  max_groups_buffer_entry_count,
765  crt_min_byte_width,
766  sort_on_gpu_hint,
767  render_info,
768  must_use_baseline_sort,
769  output_columnar_hint);
770  CHECK(query_mem_desc);
771  if (query_mem_desc->sortOnGpu() &&
772  (query_mem_desc->getBufferSizeBytes(device_type_) +
773  align_to_int64(query_mem_desc->getEntryCount() * sizeof(int32_t))) >
774  2 * 1024 * 1024 * 1024LL) {
775  must_use_baseline_sort = true;
776  sort_on_gpu_hint = false;
777  } else {
778  break;
779  }
780  }
781  return query_mem_desc;
782 }
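The retry loop above can be summarized by the following hypothetical predicate (names are illustrative): the GPU sort hint is abandoned once the output buffer plus an int32 index per entry would cross the 2 GB mark.

#include <cassert>
#include <cstdint>

bool gpu_sort_buffer_too_big(uint64_t buffer_size_bytes, uint64_t entry_count) {
  // One int32 per entry, padded to an 8-byte boundary, mirroring align_to_int64().
  const uint64_t index_bytes = ((entry_count * sizeof(int32_t)) + 7) & ~uint64_t(7);
  return buffer_size_bytes + index_bytes > 2ULL * 1024 * 1024 * 1024;
}

int main() {
  assert(!gpu_sort_buffer_too_big(1ULL << 30, 1 << 20));  // ~1 GB buffer: sort on GPU
  assert(gpu_sort_buffer_too_big(2ULL << 30, 1 << 20));   // > 2 GB: retry with baseline sort
  return 0;
}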
783 
784 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptorImpl(
785  const bool allow_multifrag,
786  const size_t max_groups_buffer_entry_count,
787  const int8_t crt_min_byte_width,
788  const bool sort_on_gpu_hint,
789  RenderInfo* render_info,
790  const bool must_use_baseline_sort,
791  const bool output_columnar_hint) {
792  const auto count_distinct_descriptors = init_count_distinct_descriptors(
793  ra_exe_unit_, query_infos_, device_type_, executor_);
794 
795  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
796 
797  auto col_range_info_nosharding = getColRangeInfo();
798 
799  const auto shard_count =
802  : 0;
803 
804  const auto col_range_info =
805  ColRangeInfo{col_range_info_nosharding.hash_type_,
806  col_range_info_nosharding.min,
807  col_range_info_nosharding.max,
808  getShardedTopBucket(col_range_info_nosharding, shard_count),
809  col_range_info_nosharding.has_nulls};
810 
811  // Non-grouped aggregates do not support accessing aggregated ranges
812  // Keyless hash is currently only supported with single-column perfect hash
813  const auto keyless_info =
814  !(is_group_by &&
815  col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash)
816  ? KeylessInfo{false, -1}
817  : get_keyless_info(ra_exe_unit_, query_infos_, is_group_by, executor_);
818 
819  if (g_enable_watchdog &&
820  ((col_range_info.hash_type_ == QueryDescriptionType::GroupByBaselineHash &&
821  max_groups_buffer_entry_count > g_watchdog_baseline_max_groups) ||
822  (col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
823  ra_exe_unit_.groupby_exprs.size() == 1 &&
824  (col_range_info.max - col_range_info.min) /
825  std::max(col_range_info.bucket, int64_t(1)) >
826  130000000))) {
827  throw WatchdogException("Query would use too much memory");
828  }
829  try {
830  return QueryMemoryDescriptor::init(executor_,
831  ra_exe_unit_,
832  query_infos_,
833  col_range_info,
834  keyless_info,
835  allow_multifrag,
836  device_type_,
837  crt_min_byte_width,
838  sort_on_gpu_hint,
839  shard_count,
840  max_groups_buffer_entry_count,
841  render_info,
842  count_distinct_descriptors,
843  must_use_baseline_sort,
844  output_columnar_hint,
845  /*streaming_top_n_hint=*/true);
846  } catch (const StreamingTopNOOM& e) {
847  LOG(WARNING) << e.what() << " Disabling Streaming Top N.";
848  return QueryMemoryDescriptor::init(executor_,
849  ra_exe_unit_,
850  query_infos_,
851  col_range_info,
852  keyless_info,
853  allow_multifrag,
854  device_type_,
855  crt_min_byte_width,
856  sort_on_gpu_hint,
857  shard_count,
858  max_groups_buffer_entry_count,
859  render_info,
860  count_distinct_descriptors,
861  must_use_baseline_sort,
862  output_columnar_hint,
863  /*streaming_top_n_hint=*/false);
864  }
865 }
866 
867 bool GroupByAndAggregate::gpuCanHandleOrderEntries(
868  const std::list<Analyzer::OrderEntry>& order_entries) {
869  if (order_entries.size() > 1) { // TODO(alex): lift this restriction
870  return false;
871  }
872  for (const auto& order_entry : order_entries) {
873  CHECK_GE(order_entry.tle_no, 1);
874  CHECK_LE(static_cast<size_t>(order_entry.tle_no), ra_exe_unit_.target_exprs.size());
875  const auto target_expr = ra_exe_unit_.target_exprs[order_entry.tle_no - 1];
876  if (!dynamic_cast<Analyzer::AggExpr*>(target_expr)) {
877  return false;
878  }
879  // TODO(alex): relax the restrictions
880  auto agg_expr = static_cast<Analyzer::AggExpr*>(target_expr);
881  if (agg_expr->get_is_distinct() || agg_expr->get_aggtype() == kAVG ||
882  agg_expr->get_aggtype() == kMIN || agg_expr->get_aggtype() == kMAX ||
883  agg_expr->get_aggtype() == kAPPROX_COUNT_DISTINCT) {
884  return false;
885  }
886  if (agg_expr->get_arg()) {
887  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
888  if (arg_ti.is_fp()) {
889  return false;
890  }
891  auto expr_range_info =
892  get_expr_range_info(ra_exe_unit_, query_infos_, agg_expr->get_arg(), executor_);
893  // TODO(adb): QMD not actually initialized here?
894  if ((!(expr_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
895  /* query_mem_desc.getGroupbyColCount() == 1 */ false) ||
896  expr_range_info.has_nulls) &&
897  order_entry.is_desc == order_entry.nulls_first) {
898  return false;
899  }
900  }
901  const auto& target_ti = target_expr->get_type_info();
902  CHECK(!target_ti.is_buffer());
903  if (!target_ti.is_integer()) {
904  return false;
905  }
906  }
907  return true;
908 }
909 
910 bool GroupByAndAggregate::codegen(llvm::Value* filter_result,
911  llvm::BasicBlock* sc_false,
912  QueryMemoryDescriptor& query_mem_desc,
913  const CompilationOptions& co,
914  const GpuSharedMemoryContext& gpu_smem_context) {
915  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
916  CHECK(filter_result);
917 
918  bool can_return_error = false;
919  llvm::BasicBlock* filter_false{nullptr};
920 
921  {
922  const bool is_group_by = !ra_exe_unit_.groupby_exprs.empty();
923 
924  if (executor_->isArchMaxwell(co.device_type)) {
925  prependForceSync();
926  }
927  DiamondCodegen filter_cfg(filter_result,
928  executor_,
929  !is_group_by || query_mem_desc.usesGetGroupValueFast(),
930  "filter", // filter_true and filter_false basic blocks
931  nullptr,
932  false);
933  filter_false = filter_cfg.cond_false_;
934 
935  if (is_group_by) {
936  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection &&
937  !query_mem_desc.useStreamingTopN()) {
938  const auto crt_matched = get_arg_by_name(ROW_FUNC, "crt_matched");
939  LL_BUILDER.CreateStore(LL_INT(int32_t(1)), crt_matched);
940  auto total_matched_ptr = get_arg_by_name(ROW_FUNC, "total_matched");
941  llvm::Value* old_total_matched_val{nullptr};
942  if (query_mem_desc.threadsShareMemory()) {
943  old_total_matched_val =
944  LL_BUILDER.CreateAtomicRMW(llvm::AtomicRMWInst::Add,
945  total_matched_ptr,
946  LL_INT(int32_t(1)),
947 #if LLVM_VERSION_MAJOR > 12
948  LLVM_ALIGN(8),
949 #endif
950  llvm::AtomicOrdering::Monotonic);
951  } else {
952  old_total_matched_val = LL_BUILDER.CreateLoad(
953  total_matched_ptr->getType()->getPointerElementType(), total_matched_ptr);
954  LL_BUILDER.CreateStore(
955  LL_BUILDER.CreateAdd(old_total_matched_val, LL_INT(int32_t(1))),
956  total_matched_ptr);
957  }
958  auto old_total_matched_ptr = get_arg_by_name(ROW_FUNC, "old_total_matched");
959  LL_BUILDER.CreateStore(old_total_matched_val, old_total_matched_ptr);
960  }
961 
962  auto agg_out_ptr_w_idx = codegenGroupBy(query_mem_desc, co, filter_cfg);
963  auto varlen_output_buffer = codegenVarlenOutputBuffer(query_mem_desc);
964  if (query_mem_desc.usesGetGroupValueFast() ||
965  query_mem_desc.getQueryDescriptionType() ==
966  QueryDescriptionType::GroupByPerfectHash) {
967  if (query_mem_desc.getGroupbyColCount() > 1) {
968  filter_cfg.setChainToNext();
969  }
970  // Don't generate null checks if the group slot is guaranteed to be non-null,
971  // as it's the case for get_group_value_fast* family.
972  can_return_error = codegenAggCalls(agg_out_ptr_w_idx,
973  varlen_output_buffer,
974  {},
975  query_mem_desc,
976  co,
977  gpu_smem_context,
978  filter_cfg);
979  } else {
980  {
981  llvm::Value* nullcheck_cond{nullptr};
982  if (query_mem_desc.didOutputColumnar()) {
983  nullcheck_cond = LL_BUILDER.CreateICmpSGE(std::get<1>(agg_out_ptr_w_idx),
984  LL_INT(int32_t(0)));
985  } else {
986  nullcheck_cond = LL_BUILDER.CreateICmpNE(
987  std::get<0>(agg_out_ptr_w_idx),
988  llvm::ConstantPointerNull::get(
989  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)));
990  }
991  DiamondCodegen nullcheck_cfg(
992  nullcheck_cond, executor_, false, "groupby_nullcheck", &filter_cfg, false);
993  codegenAggCalls(agg_out_ptr_w_idx,
994  varlen_output_buffer,
995  {},
996  query_mem_desc,
997  co,
998  gpu_smem_context,
999  filter_cfg);
1000  }
1001  can_return_error = true;
1002  if (query_mem_desc.getQueryDescriptionType() ==
1003  QueryDescriptionType::Projection &&
1004  query_mem_desc.useStreamingTopN()) {
1005  // Ignore rejection on pushing current row to top-K heap.
1006  LL_BUILDER.CreateRet(LL_INT(int32_t(0)));
1007  } else {
1008  CodeGenerator code_generator(executor_);
1009  LL_BUILDER.CreateRet(LL_BUILDER.CreateNeg(LL_BUILDER.CreateTrunc(
1010  // TODO(alex): remove the trunc once pos is converted to 32 bits
1011  code_generator.posArg(nullptr),
1012  get_int_type(32, LL_CONTEXT))));
1013  }
1014  }
1015  } else {
1016  if (ra_exe_unit_.estimator) {
1017  std::stack<llvm::BasicBlock*> array_loops;
1018  codegenEstimator(array_loops, filter_cfg, query_mem_desc, co);
1019  } else {
1020  auto arg_it = ROW_FUNC->arg_begin();
1021  std::vector<llvm::Value*> agg_out_vec;
1022  for (int32_t i = 0; i < get_agg_count(ra_exe_unit_.target_exprs); ++i) {
1023  agg_out_vec.push_back(&*arg_it++);
1024  }
1025  can_return_error = codegenAggCalls(std::make_tuple(nullptr, nullptr),
1026  /*varlen_output_buffer=*/nullptr,
1027  agg_out_vec,
1028  query_mem_desc,
1029  co,
1030  gpu_smem_context,
1031  filter_cfg);
1032  }
1033  }
1034  }
1035 
1036  if (ra_exe_unit_.join_quals.empty()) {
1037  executor_->cgen_state_->ir_builder_.CreateRet(LL_INT(int32_t(0)));
1038  } else if (sc_false) {
1039  const auto saved_insert_block = LL_BUILDER.GetInsertBlock();
1040  LL_BUILDER.SetInsertPoint(sc_false);
1041  LL_BUILDER.CreateBr(filter_false);
1042  LL_BUILDER.SetInsertPoint(saved_insert_block);
1043  }
1044 
1045  return can_return_error;
1046 }
1047 
1048 llvm::Value* GroupByAndAggregate::codegenOutputSlot(
1049  llvm::Value* groups_buffer,
1050  const QueryMemoryDescriptor& query_mem_desc,
1051  const CompilationOptions& co,
1052  DiamondCodegen& diamond_codegen) {
1053  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1055  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1056  const auto group_expr = ra_exe_unit_.groupby_exprs.front();
1057  CHECK(!group_expr);
1058  if (!query_mem_desc.didOutputColumnar()) {
1059  CHECK_EQ(size_t(0), query_mem_desc.getRowSize() % sizeof(int64_t));
1060  }
1061  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1062  ? 0
1063  : query_mem_desc.getRowSize() / sizeof(int64_t);
1064  CodeGenerator code_generator(executor_);
1065  if (query_mem_desc.useStreamingTopN()) {
1066  const auto& only_order_entry = ra_exe_unit_.sort_info.order_entries.front();
1067  CHECK_GE(only_order_entry.tle_no, int(1));
1068  const size_t target_idx = only_order_entry.tle_no - 1;
1069  CHECK_LT(target_idx, ra_exe_unit_.target_exprs.size());
1070  const auto order_entry_expr = ra_exe_unit_.target_exprs[target_idx];
1071  const auto chosen_bytes =
1072  static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(target_idx));
1073  auto order_entry_lv = executor_->cgen_state_->castToTypeIn(
1074  code_generator.codegen(order_entry_expr, true, co).front(), chosen_bytes * 8);
1076  std::string fname = "get_bin_from_k_heap";
1077  const auto& oe_ti = order_entry_expr->get_type_info();
1078  llvm::Value* null_key_lv = nullptr;
1079  if (oe_ti.is_integer() || oe_ti.is_decimal() || oe_ti.is_time()) {
1080  const size_t bit_width = order_entry_lv->getType()->getIntegerBitWidth();
1081  switch (bit_width) {
1082  case 32:
1083  null_key_lv = LL_INT(static_cast<int32_t>(inline_int_null_val(oe_ti)));
1084  break;
1085  case 64:
1086  null_key_lv = LL_INT(static_cast<int64_t>(inline_int_null_val(oe_ti)));
1087  break;
1088  default:
1089  CHECK(false);
1090  }
1091  fname += "_int" + std::to_string(bit_width) + "_t";
1092  } else {
1093  CHECK(oe_ti.is_fp());
1094  if (order_entry_lv->getType()->isDoubleTy()) {
1095  null_key_lv = LL_FP(static_cast<double>(inline_fp_null_val(oe_ti)));
1096  } else {
1097  null_key_lv = LL_FP(static_cast<float>(inline_fp_null_val(oe_ti)));
1098  }
1099  fname += order_entry_lv->getType()->isDoubleTy() ? "_double" : "_float";
1100  }
1101  const auto key_slot_idx =
1103  return emitCall(
1104  fname,
1105  {groups_buffer,
1106  LL_INT(n),
1107  LL_INT(row_size_quad),
1108  LL_INT(static_cast<uint32_t>(query_mem_desc.getColOffInBytes(key_slot_idx))),
1109  LL_BOOL(only_order_entry.is_desc),
1110  LL_BOOL(!order_entry_expr->get_type_info().get_notnull()),
1111  LL_BOOL(only_order_entry.nulls_first),
1112  null_key_lv,
1113  order_entry_lv});
1114  } else {
1115  auto* arg = get_arg_by_name(ROW_FUNC, "max_matched");
1116  const auto output_buffer_entry_count_lv =
1117  LL_BUILDER.CreateLoad(arg->getType()->getPointerElementType(), arg);
1118  arg = get_arg_by_name(ROW_FUNC, "old_total_matched");
1119  const auto group_expr_lv =
1120  LL_BUILDER.CreateLoad(arg->getType()->getPointerElementType(), arg);
1121  std::vector<llvm::Value*> args{groups_buffer,
1122  output_buffer_entry_count_lv,
1123  group_expr_lv,
1124  code_generator.posArg(nullptr)};
1125  if (query_mem_desc.didOutputColumnar()) {
1126  const auto columnar_output_offset =
1127  emitCall("get_columnar_scan_output_offset", args);
1128  return columnar_output_offset;
1129  }
1130  args.push_back(LL_INT(row_size_quad));
1131  return emitCall("get_scan_output_slot", args);
1132  }
1133 }
1134 
1135 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenGroupBy(
1137  const CompilationOptions& co,
1138  DiamondCodegen& diamond_codegen) {
1139  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1140  auto arg_it = ROW_FUNC->arg_begin();
1141  auto groups_buffer = arg_it++;
1142 
1143  std::stack<llvm::BasicBlock*> array_loops;
1144 
1145  // TODO(Saman): move this logic outside of this function.
1146  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1147  if (query_mem_desc.didOutputColumnar()) {
1148  return std::make_tuple(
1149  &*groups_buffer,
1150  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen));
1151  } else {
1152  return std::make_tuple(
1153  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen),
1154  nullptr);
1155  }
1156  }
1157 
1158  CHECK(query_mem_desc.getQueryDescriptionType() ==
1159  QueryDescriptionType::GroupByBaselineHash ||
1160  query_mem_desc.getQueryDescriptionType() ==
1161  QueryDescriptionType::GroupByPerfectHash);
1162 
1163  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1164  ? 0
1165  : query_mem_desc.getRowSize() / sizeof(int64_t);
1166 
1167  const auto col_width_size = query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1168  ? sizeof(int64_t)
1169  : query_mem_desc.getEffectiveKeyWidth();
1170  // for multi-column group by
1171  llvm::Value* group_key = nullptr;
1172  llvm::Value* key_size_lv = nullptr;
1173 
1174  if (!query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1175  key_size_lv = LL_INT(static_cast<int32_t>(query_mem_desc.getGroupbyColCount()));
1176  if (query_mem_desc.getQueryDescriptionType() ==
1177  QueryDescriptionType::GroupByPerfectHash) {
1178  group_key =
1179  LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1180  } else if (query_mem_desc.getQueryDescriptionType() ==
1181  QueryDescriptionType::GroupByBaselineHash) {
1182  group_key =
1183  col_width_size == sizeof(int32_t)
1184  ? LL_BUILDER.CreateAlloca(llvm::Type::getInt32Ty(LL_CONTEXT), key_size_lv)
1185  : LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1186  }
1187  CHECK(group_key);
1188  CHECK(key_size_lv);
1189  }
1190 
1191  int32_t subkey_idx = 0;
1192  CHECK(query_mem_desc.getGroupbyColCount() == ra_exe_unit_.groupby_exprs.size());
1193  for (const auto& group_expr : ra_exe_unit_.groupby_exprs) {
1194  const auto col_range_info =
1195  get_expr_range_info(ra_exe_unit_, query_infos_, group_expr.get(), executor_);
1196  const auto translated_null_value = static_cast<int64_t>(
1197  query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1198  ? checked_int64_t(query_mem_desc.getMaxVal()) +
1199  (query_mem_desc.getBucket() ? query_mem_desc.getBucket() : 1)
1200  : checked_int64_t(col_range_info.max) +
1201  (col_range_info.bucket ? col_range_info.bucket : 1));
1202 
1203  const bool col_has_nulls =
1204  query_mem_desc.getQueryDescriptionType() ==
1205  QueryDescriptionType::GroupByPerfectHash
1206  ? (query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1207  ? query_mem_desc.hasNulls()
1208  : col_range_info.has_nulls)
1209  : false;
1210 
1211  const auto group_expr_lvs =
1212  executor_->groupByColumnCodegen(group_expr.get(),
1213  col_width_size,
1214  co,
1215  col_has_nulls,
1216  translated_null_value,
1217  diamond_codegen,
1218  array_loops,
1219  query_mem_desc.threadsShareMemory());
1220  const auto group_expr_lv = group_expr_lvs.translated_value;
1221  if (query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1222  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1223  return codegenSingleColumnPerfectHash(query_mem_desc,
1224  co,
1225  &*groups_buffer,
1226  group_expr_lv,
1227  group_expr_lvs.original_value,
1228  row_size_quad);
1229  } else {
1230  // store the sub-key to the buffer
1231  LL_BUILDER.CreateStore(
1232  group_expr_lv,
1233  LL_BUILDER.CreateGEP(
1234  group_key->getType()->getScalarType()->getPointerElementType(),
1235  group_key,
1236  LL_INT(subkey_idx++)));
1237  }
1238  }
1239  if (query_mem_desc.getQueryDescriptionType() ==
1240  QueryDescriptionType::GroupByPerfectHash) {
1241  CHECK(ra_exe_unit_.groupby_exprs.size() != 1);
1242  return codegenMultiColumnPerfectHash(
1243  &*groups_buffer, group_key, key_size_lv, query_mem_desc, row_size_quad);
1244  } else if (query_mem_desc.getQueryDescriptionType() ==
1245  QueryDescriptionType::GroupByBaselineHash) {
1246  return codegenMultiColumnBaselineHash(co,
1247  &*groups_buffer,
1248  group_key,
1249  key_size_lv,
1250  query_mem_desc,
1251  col_width_size,
1252  row_size_quad);
1253  }
1254  CHECK(false);
1255  return std::make_tuple(nullptr, nullptr);
1256 }
1257 
1258 llvm::Value* GroupByAndAggregate::codegenVarlenOutputBuffer(
1259  const QueryMemoryDescriptor& query_mem_desc) {
1260  if (!query_mem_desc.hasVarlenOutput()) {
1261  return nullptr;
1262  }
1263 
1264  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1265  auto arg_it = ROW_FUNC->arg_begin();
1266  arg_it++; /* groups_buffer */
1267  auto varlen_output_buffer = arg_it++;
1268  CHECK(varlen_output_buffer->getType() == llvm::Type::getInt64PtrTy(LL_CONTEXT));
1269  return varlen_output_buffer;
1270 }
1271 
1272 std::tuple<llvm::Value*, llvm::Value*>
1273 GroupByAndAggregate::codegenSingleColumnPerfectHash(
1274  const QueryMemoryDescriptor& query_mem_desc,
1275  const CompilationOptions& co,
1276  llvm::Value* groups_buffer,
1277  llvm::Value* group_expr_lv_translated,
1278  llvm::Value* group_expr_lv_original,
1279  const int32_t row_size_quad) {
1280  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1281  CHECK(query_mem_desc.usesGetGroupValueFast());
1282  std::string get_group_fn_name{query_mem_desc.didOutputColumnar()
1283  ? "get_columnar_group_bin_offset"
1284  : "get_group_value_fast"};
1285  if (!query_mem_desc.didOutputColumnar() && query_mem_desc.hasKeylessHash()) {
1286  get_group_fn_name += "_keyless";
1287  }
1288  if (query_mem_desc.interleavedBins(co.device_type)) {
1289  CHECK(!query_mem_desc.didOutputColumnar());
1290  CHECK(query_mem_desc.hasKeylessHash());
1291  get_group_fn_name += "_semiprivate";
1292  }
1293  std::vector<llvm::Value*> get_group_fn_args{&*groups_buffer,
1294  &*group_expr_lv_translated};
1295  if (group_expr_lv_original && get_group_fn_name == "get_group_value_fast" &&
1296  query_mem_desc.mustUseBaselineSort()) {
1297  get_group_fn_name += "_with_original_key";
1298  get_group_fn_args.push_back(group_expr_lv_original);
1299  }
1300  get_group_fn_args.push_back(LL_INT(query_mem_desc.getMinVal()));
1301  get_group_fn_args.push_back(LL_INT(query_mem_desc.getBucket()));
1302  if (!query_mem_desc.hasKeylessHash()) {
1303  if (!query_mem_desc.didOutputColumnar()) {
1304  get_group_fn_args.push_back(LL_INT(row_size_quad));
1305  }
1306  } else {
1307  if (!query_mem_desc.didOutputColumnar()) {
1308  get_group_fn_args.push_back(LL_INT(row_size_quad));
1309  }
1310  if (query_mem_desc.interleavedBins(co.device_type)) {
1311  auto warp_idx = emitCall("thread_warp_idx", {LL_INT(executor_->warpSize())});
1312  get_group_fn_args.push_back(warp_idx);
1313  get_group_fn_args.push_back(LL_INT(executor_->warpSize()));
1314  }
1315  }
1316  if (get_group_fn_name == "get_columnar_group_bin_offset") {
1317  return std::make_tuple(&*groups_buffer,
1318  emitCall(get_group_fn_name, get_group_fn_args));
1319  }
1320  return std::make_tuple(emitCall(get_group_fn_name, get_group_fn_args), nullptr);
1321 }
1322 
1323 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenMultiColumnPerfectHash(
1324  llvm::Value* groups_buffer,
1325  llvm::Value* group_key,
1326  llvm::Value* key_size_lv,
1327  const QueryMemoryDescriptor& query_mem_desc,
1328  const int32_t row_size_quad) {
1329  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1330  CHECK(query_mem_desc.getQueryDescriptionType() ==
1331  QueryDescriptionType::GroupByPerfectHash);
1332  // compute the index (perfect hash)
1333  auto perfect_hash_func = codegenPerfectHashFunction();
1334  auto hash_lv =
1335  LL_BUILDER.CreateCall(perfect_hash_func, std::vector<llvm::Value*>{group_key});
1336 
1337  if (query_mem_desc.didOutputColumnar()) {
1338  if (!query_mem_desc.hasKeylessHash()) {
1339  const std::string set_matching_func_name{
1340  "set_matching_group_value_perfect_hash_columnar"};
1341  const std::vector<llvm::Value*> set_matching_func_arg{
1342  groups_buffer,
1343  hash_lv,
1344  group_key,
1345  key_size_lv,
1346  llvm::ConstantInt::get(get_int_type(32, LL_CONTEXT),
1347  query_mem_desc.getEntryCount())};
1348  emitCall(set_matching_func_name, set_matching_func_arg);
1349  }
1350  return std::make_tuple(groups_buffer, hash_lv);
1351  } else {
1352  if (query_mem_desc.hasKeylessHash()) {
1353  return std::make_tuple(emitCall("get_matching_group_value_perfect_hash_keyless",
1354  {groups_buffer, hash_lv, LL_INT(row_size_quad)}),
1355  nullptr);
1356  } else {
1357  return std::make_tuple(
1358  emitCall(
1359  "get_matching_group_value_perfect_hash",
1360  {groups_buffer, hash_lv, group_key, key_size_lv, LL_INT(row_size_quad)}),
1361  nullptr);
1362  }
1363  }
1364 }
1365 
1366 std::tuple<llvm::Value*, llvm::Value*>
1367 GroupByAndAggregate::codegenMultiColumnBaselineHash(
1368  const CompilationOptions& co,
1369  llvm::Value* groups_buffer,
1370  llvm::Value* group_key,
1371  llvm::Value* key_size_lv,
1372  const QueryMemoryDescriptor& query_mem_desc,
1373  const size_t key_width,
1374  const int32_t row_size_quad) {
1375  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1376  if (group_key->getType() != llvm::Type::getInt64PtrTy(LL_CONTEXT)) {
1377  CHECK(key_width == sizeof(int32_t));
1378  group_key =
1379  LL_BUILDER.CreatePointerCast(group_key, llvm::Type::getInt64PtrTy(LL_CONTEXT));
1380  }
1381  std::vector<llvm::Value*> func_args{
1382  groups_buffer,
1383  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1384  &*group_key,
1385  &*key_size_lv,
1386  LL_INT(static_cast<int32_t>(key_width))};
1387  std::string func_name{"get_group_value"};
1388  if (query_mem_desc.didOutputColumnar()) {
1389  func_name += "_columnar_slot";
1390  } else {
1391  func_args.push_back(LL_INT(row_size_quad));
1392  }
1393  if (co.with_dynamic_watchdog) {
1394  func_name += "_with_watchdog";
1395  }
1396  if (query_mem_desc.didOutputColumnar()) {
1397  return std::make_tuple(groups_buffer, emitCall(func_name, func_args));
1398  } else {
1399  return std::make_tuple(emitCall(func_name, func_args), nullptr);
1400  }
1401 }
1402 
1403 llvm::Function* GroupByAndAggregate::codegenPerfectHashFunction() {
1404  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1405  CHECK_GT(ra_exe_unit_.groupby_exprs.size(), size_t(1));
1406  auto ft = llvm::FunctionType::get(
1407  get_int_type(32, LL_CONTEXT),
1408  std::vector<llvm::Type*>{llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)},
1409  false);
1410  auto key_hash_func = llvm::Function::Create(ft,
1411  llvm::Function::ExternalLinkage,
1412  "perfect_key_hash",
1413  executor_->cgen_state_->module_);
1414  executor_->cgen_state_->helper_functions_.push_back(key_hash_func);
1415  mark_function_always_inline(key_hash_func);
1416  auto& key_buff_arg = *key_hash_func->args().begin();
1417  llvm::Value* key_buff_lv = &key_buff_arg;
1418  auto bb = llvm::BasicBlock::Create(LL_CONTEXT, "entry", key_hash_func);
1419  llvm::IRBuilder<> key_hash_func_builder(bb);
1420  llvm::Value* hash_lv{llvm::ConstantInt::get(get_int_type(64, LL_CONTEXT), 0)};
1421  std::vector<int64_t> cardinalities;
1422  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1423  auto col_range_info =
1424  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1425  CHECK(col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash);
1426  cardinalities.push_back(getBucketedCardinality(col_range_info));
1427  }
1428  size_t dim_idx = 0;
1429  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1430  auto* gep = key_hash_func_builder.CreateGEP(
1431  key_buff_lv->getType()->getScalarType()->getPointerElementType(),
1432  key_buff_lv,
1433  LL_INT(dim_idx));
1434  auto key_comp_lv =
1435  key_hash_func_builder.CreateLoad(gep->getType()->getPointerElementType(), gep);
1436  auto col_range_info =
1437  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1438  auto crt_term_lv =
1439  key_hash_func_builder.CreateSub(key_comp_lv, LL_INT(col_range_info.min));
1440  if (col_range_info.bucket) {
1441  crt_term_lv =
1442  key_hash_func_builder.CreateSDiv(crt_term_lv, LL_INT(col_range_info.bucket));
1443  }
1444  for (size_t prev_dim_idx = 0; prev_dim_idx < dim_idx; ++prev_dim_idx) {
1445  crt_term_lv = key_hash_func_builder.CreateMul(crt_term_lv,
1446  LL_INT(cardinalities[prev_dim_idx]));
1447  }
1448  hash_lv = key_hash_func_builder.CreateAdd(hash_lv, crt_term_lv);
1449  ++dim_idx;
1450  }
1451  key_hash_func_builder.CreateRet(
1452  key_hash_func_builder.CreateTrunc(hash_lv, get_int_type(32, LL_CONTEXT)));
1453  return key_hash_func;
1454 }
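The IR emitted above computes a mixed-radix index over the group-by columns: each key component contributes (key - min) / bucket, scaled by the product of the cardinalities of the columns before it. A hypothetical runtime equivalent (DimSketch and the helper name are illustrative):

#include <cassert>
#include <cstdint>
#include <vector>

struct DimSketch {
  int64_t min, bucket, cardinality;
};

int64_t perfect_key_hash(const std::vector<int64_t>& key,
                         const std::vector<DimSketch>& dims) {
  int64_t hash = 0;
  int64_t scale = 1;  // product of cardinalities of all previous dimensions
  for (size_t i = 0; i < key.size(); ++i) {
    int64_t term = key[i] - dims[i].min;
    if (dims[i].bucket) {
      term /= dims[i].bucket;
    }
    hash += term * scale;
    scale *= dims[i].cardinality;
  }
  return hash;
}

int main() {
  // Two group-by columns: first in [0, 9] (cardinality 10), second in [100, 104].
  std::vector<DimSketch> dims{{0, 0, 10}, {100, 0, 5}};
  assert(perfect_key_hash({3, 102}, dims) == 3 + 2 * 10);  // index 23
  return 0;
}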
1455 
1456 llvm::Value* GroupByAndAggregate::convertNullIfAny(const SQLTypeInfo& arg_type,
1457  const TargetInfo& agg_info,
1458  llvm::Value* target) {
1459  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1460  const auto& agg_type = agg_info.sql_type;
1461  const size_t chosen_bytes = agg_type.get_size();
1462 
1463  bool need_conversion{false};
1464  llvm::Value* arg_null{nullptr};
1465  llvm::Value* agg_null{nullptr};
1466  llvm::Value* target_to_cast{target};
1467  if (arg_type.is_fp()) {
1468  arg_null = executor_->cgen_state_->inlineFpNull(arg_type);
1469  if (agg_type.is_fp()) {
1470  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1471  if (!static_cast<llvm::ConstantFP*>(arg_null)->isExactlyValue(
1472  static_cast<llvm::ConstantFP*>(agg_null)->getValueAPF())) {
1473  need_conversion = true;
1474  }
1475  } else {
1476  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
1477  return target;
1478  }
1479  } else {
1480  arg_null = executor_->cgen_state_->inlineIntNull(arg_type);
1481  if (agg_type.is_fp()) {
1482  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1483  need_conversion = true;
1484  target_to_cast = executor_->castToFP(target, arg_type, agg_type);
1485  } else {
1486  agg_null = executor_->cgen_state_->inlineIntNull(agg_type);
1487  if ((static_cast<llvm::ConstantInt*>(arg_null)->getBitWidth() !=
1488  static_cast<llvm::ConstantInt*>(agg_null)->getBitWidth()) ||
1489  (static_cast<llvm::ConstantInt*>(arg_null)->getValue() !=
1490  static_cast<llvm::ConstantInt*>(agg_null)->getValue())) {
1491  need_conversion = true;
1492  }
1493  }
1494  }
1495  if (need_conversion) {
1496  auto cmp = arg_type.is_fp() ? LL_BUILDER.CreateFCmpOEQ(target, arg_null)
1497  : LL_BUILDER.CreateICmpEQ(target, arg_null);
1498  return LL_BUILDER.CreateSelect(
1499  cmp,
1500  agg_null,
1501  executor_->cgen_state_->castToTypeIn(target_to_cast, chosen_bytes << 3));
1502  } else {
1503  return target;
1504  }
1505 }
1506 
1507 llvm::Value* GroupByAndAggregate::codegenWindowRowPointer(
1508  const Analyzer::WindowFunction* window_func,
1509  const QueryMemoryDescriptor& query_mem_desc,
1510  const CompilationOptions& co,
1511  DiamondCodegen& diamond_codegen) {
1512  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1513  const auto window_func_context =
1515  if (window_func_context && window_function_is_aggregate(window_func->getKind())) {
1516  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1517  ? 0
1518  : query_mem_desc.getRowSize() / sizeof(int64_t);
1519  auto arg_it = ROW_FUNC->arg_begin();
1520  auto groups_buffer = arg_it++;
1521  CodeGenerator code_generator(executor_);
1522  auto window_pos_lv = code_generator.codegenWindowPosition(
1523  window_func_context, code_generator.posArg(nullptr));
1524  const auto pos_in_window =
1525  LL_BUILDER.CreateTrunc(window_pos_lv, get_int_type(32, LL_CONTEXT));
1526  llvm::Value* entry_count_lv =
1527  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount()));
1528  std::vector<llvm::Value*> args{
1529  &*groups_buffer, entry_count_lv, pos_in_window, code_generator.posArg(nullptr)};
1530  if (query_mem_desc.didOutputColumnar()) {
1531  const auto columnar_output_offset =
1532  emitCall("get_columnar_scan_output_offset", args);
1533  return LL_BUILDER.CreateSExt(columnar_output_offset, get_int_type(64, LL_CONTEXT));
1534  }
1535  args.push_back(LL_INT(row_size_quad));
1536  return emitCall("get_scan_output_slot", args);
1537  }
1538  auto arg_it = ROW_FUNC->arg_begin();
1539  auto groups_buffer = arg_it++;
1540  return codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen);
1541 }
1542 
1543 bool GroupByAndAggregate::codegenAggCalls(
1544  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx_in,
1545  llvm::Value* varlen_output_buffer,
1546  const std::vector<llvm::Value*>& agg_out_vec,
1547  QueryMemoryDescriptor& query_mem_desc,
1548  const CompilationOptions& co,
1549  const GpuSharedMemoryContext& gpu_smem_context,
1550  DiamondCodegen& diamond_codegen) {
1551  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1552  auto agg_out_ptr_w_idx = agg_out_ptr_w_idx_in;
1553  // TODO(alex): unify the two cases, the output for non-group by queries
1554  // should be a contiguous buffer
1555  const bool is_group_by = std::get<0>(agg_out_ptr_w_idx);
1556  bool can_return_error = false;
1557  if (is_group_by) {
1558  CHECK(agg_out_vec.empty());
1559  } else {
1560  CHECK(!agg_out_vec.empty());
1561  }
1562 
1563  // output buffer is casted into a byte stream to be able to handle data elements of
1564  // different sizes (only used when actual column width sizes are used)
1565  llvm::Value* output_buffer_byte_stream{nullptr};
1566  llvm::Value* out_row_idx{nullptr};
1567  if (query_mem_desc.didOutputColumnar() && !g_cluster &&
1568  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1569  output_buffer_byte_stream = LL_BUILDER.CreateBitCast(
1570  std::get<0>(agg_out_ptr_w_idx),
1571  llvm::PointerType::get(llvm::Type::getInt8Ty(LL_CONTEXT), 0));
1572  output_buffer_byte_stream->setName("out_buff_b_stream");
1573  CHECK(std::get<1>(agg_out_ptr_w_idx));
1574  out_row_idx = LL_BUILDER.CreateZExt(std::get<1>(agg_out_ptr_w_idx),
1575  llvm::Type::getInt64Ty(LL_CONTEXT));
1576  out_row_idx->setName("out_row_idx");
1577  }
1578 
1579  TargetExprCodegenBuilder target_builder(ra_exe_unit_, is_group_by);
1580  for (size_t target_idx = 0; target_idx < ra_exe_unit_.target_exprs.size();
1581  ++target_idx) {
1582  auto target_expr = ra_exe_unit_.target_exprs[target_idx];
1583  CHECK(target_expr);
1584 
1585  target_builder(target_expr, executor_, query_mem_desc, co);
1586  }
1587 
1588  target_builder.codegen(this,
1589  executor_,
1590  query_mem_desc,
1591  co,
1592  gpu_smem_context,
1593  agg_out_ptr_w_idx,
1594  agg_out_vec,
1595  output_buffer_byte_stream,
1596  out_row_idx,
1597  varlen_output_buffer,
1598  diamond_codegen);
1599 
1600  for (auto target_expr : ra_exe_unit_.target_exprs) {
1601  CHECK(target_expr);
1602  executor_->plan_state_->isLazyFetchColumn(target_expr);
1603  }
1604 
1605  return can_return_error;
1606 }
1607 
1608 /**
1609  * @brief: returns the pointer to where the aggregation should be stored.
1610  */
1611 llvm::Value* GroupByAndAggregate::codegenAggColumnPtr(
1612  llvm::Value* output_buffer_byte_stream,
1613  llvm::Value* out_row_idx,
1614  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx,
1615  const QueryMemoryDescriptor& query_mem_desc,
1616  const size_t chosen_bytes,
1617  const size_t agg_out_off,
1618  const size_t target_idx) {
1619  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1620  llvm::Value* agg_col_ptr{nullptr};
1621  if (query_mem_desc.didOutputColumnar()) {
1622  // TODO(Saman): remove the second columnar branch, and support all query description
1623  // types through the first branch. Then, input arguments should also be cleaned up
1624  if (!g_cluster &&
1625        query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1626  CHECK(chosen_bytes == 1 || chosen_bytes == 2 || chosen_bytes == 4 ||
1627  chosen_bytes == 8);
1628  CHECK(output_buffer_byte_stream);
1629  CHECK(out_row_idx);
1630  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1631  // multiplying by chosen_bytes, i.e., << log2(chosen_bytes)
1632  auto out_per_col_byte_idx =
1633 #ifdef _WIN32
1634  LL_BUILDER.CreateShl(out_row_idx, __lzcnt(chosen_bytes) - 1);
1635 #else
1636  LL_BUILDER.CreateShl(out_row_idx, __builtin_ffs(chosen_bytes) - 1);
1637 #endif
1638  auto byte_offset = LL_BUILDER.CreateAdd(out_per_col_byte_idx,
1639  LL_INT(static_cast<int64_t>(col_off)));
1640  byte_offset->setName("out_byte_off_target_" + std::to_string(target_idx));
1641  auto output_ptr = LL_BUILDER.CreateGEP(
1642  output_buffer_byte_stream->getType()->getScalarType()->getPointerElementType(),
1643  output_buffer_byte_stream,
1644  byte_offset);
1645  agg_col_ptr = LL_BUILDER.CreateBitCast(
1646  output_ptr,
1647  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1648  agg_col_ptr->setName("out_ptr_target_" + std::to_string(target_idx));
1649  } else {
1650  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1651  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1652  col_off /= chosen_bytes;
1653  CHECK(std::get<1>(agg_out_ptr_w_idx));
1654  auto offset = LL_BUILDER.CreateAdd(std::get<1>(agg_out_ptr_w_idx), LL_INT(col_off));
1655  auto* bit_cast = LL_BUILDER.CreateBitCast(
1656  std::get<0>(agg_out_ptr_w_idx),
1657  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1658  agg_col_ptr = LL_BUILDER.CreateGEP(
1659  bit_cast->getType()->getScalarType()->getPointerElementType(),
1660  bit_cast,
1661  offset);
1662  }
1663  } else {
1664  uint32_t col_off = query_mem_desc.getColOnlyOffInBytes(agg_out_off);
1665  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1666  col_off /= chosen_bytes;
1667  auto* bit_cast = LL_BUILDER.CreateBitCast(
1668  std::get<0>(agg_out_ptr_w_idx),
1669  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1670  agg_col_ptr = LL_BUILDER.CreateGEP(
1671  bit_cast->getType()->getScalarType()->getPointerElementType(),
1672  bit_cast,
1673  LL_INT(col_off));
1674  }
1675  CHECK(agg_col_ptr);
1676  return agg_col_ptr;
1677 }
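// Worked example of the columnar byte-offset arithmetic above (a minimal
// sketch; the concrete numbers are hypothetical). For a power-of-two width,
// __builtin_ffs(chosen_bytes) - 1 == log2(chosen_bytes), so with
// chosen_bytes == 4 the shift is by 2 and the emitted IR computes:
//
//   uint32_t col_off = 1024;                        // getColOffInBytes(agg_out_off)
//   uint64_t out_row_idx = 7;                       // zero-extended row index
//   uint64_t byte_off = (out_row_idx << 2) + col_off;   // 7 * 4 + 1024 == 1052
//   int32_t* slot = reinterpret_cast<int32_t*>(out_buff_b_stream + byte_off);
//
// i.e. the returned pointer addresses row 7 of this target's column, typed as
// a 32-bit slot (chosen_bytes << 3 == 32).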
1678 
1679 void GroupByAndAggregate::codegenEstimator(std::stack<llvm::BasicBlock*>& array_loops,
1680  DiamondCodegen& diamond_codegen,
1681  const QueryMemoryDescriptor& query_mem_desc,
1682  const CompilationOptions& co) {
1683  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1684  const auto& estimator_arg = ra_exe_unit_.estimator->getArgument();
1685  auto estimator_comp_count_lv = LL_INT(static_cast<int32_t>(estimator_arg.size()));
1686  auto estimator_key_lv = LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT),
1687  estimator_comp_count_lv);
1688  int32_t subkey_idx = 0;
1689  for (const auto& estimator_arg_comp : estimator_arg) {
1690  const auto estimator_arg_comp_lvs =
1691  executor_->groupByColumnCodegen(estimator_arg_comp.get(),
1692  query_mem_desc.getEffectiveKeyWidth(),
1693  co,
1694  false,
1695  0,
1696  diamond_codegen,
1697  array_loops,
1698  true);
1699  CHECK(!estimator_arg_comp_lvs.original_value);
1700  const auto estimator_arg_comp_lv = estimator_arg_comp_lvs.translated_value;
1701  // store the sub-key to the buffer
1702  LL_BUILDER.CreateStore(
1703  estimator_arg_comp_lv,
1704  LL_BUILDER.CreateGEP(
1705  estimator_key_lv->getType()->getScalarType()->getPointerElementType(),
1706  estimator_key_lv,
1707  LL_INT(subkey_idx++)));
1708  }
1709  const auto int8_ptr_ty = llvm::PointerType::get(get_int_type(8, LL_CONTEXT), 0);
1710  const auto bitmap = LL_BUILDER.CreateBitCast(&*ROW_FUNC->arg_begin(), int8_ptr_ty);
1711  const auto key_bytes = LL_BUILDER.CreateBitCast(estimator_key_lv, int8_ptr_ty);
1712  const auto estimator_comp_bytes_lv =
1713  LL_INT(static_cast<int32_t>(estimator_arg.size() * sizeof(int64_t)));
1714  const auto bitmap_size_lv =
1715  LL_INT(static_cast<uint32_t>(ra_exe_unit_.estimator->getBufferSize()));
1716  emitCall(ra_exe_unit_.estimator->getRuntimeFunctionName(),
1717  {bitmap, &*bitmap_size_lv, key_bytes, &*estimator_comp_bytes_lv});
1718 }
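// Shape of the call emitted above (a sketch; the runtime entry point's name
// comes from Estimator::getRuntimeFunctionName() and is not spelled out here):
//
//   <estimator_runtime_fn>(int8_t*  bitmap,          // row function's first argument
//                          uint32_t bitmap_bytes,    // estimator->getBufferSize()
//                          int8_t*  key_bytes,       // the packed int64 sub-keys
//                          int32_t  key_byte_count); // estimator_arg.size() * 8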
1719 
1720 extern "C" RUNTIME_EXPORT void agg_count_distinct(int64_t* agg, const int64_t val) {
1721  reinterpret_cast<CountDistinctSet*>(*agg)->insert(val);
1722 }
1723 
1724 extern "C" RUNTIME_EXPORT void agg_count_distinct_skip_val(int64_t* agg,
1725  const int64_t val,
1726  const int64_t skip_val) {
1727  if (val != skip_val) {
1728  agg_count_distinct(agg, val);
1729  }
1730 }
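// Usage sketch for the two runtime helpers above (hypothetical values; the
// null sentinel is assumed to be the BIGINT inline null,
// std::numeric_limits<int64_t>::min()):
//
//   CountDistinctSet vals;
//   int64_t slot = reinterpret_cast<int64_t>(&vals);
//   const int64_t null_sentinel = std::numeric_limits<int64_t>::min();
//   agg_count_distinct_skip_val(&slot, 42, null_sentinel);             // inserts 42
//   agg_count_distinct_skip_val(&slot, null_sentinel, null_sentinel);  // NULL input: no-op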
1731 
1732 extern "C" RUNTIME_EXPORT void agg_approx_quantile(int64_t* agg, const double val) {
1733  auto* t_digest = reinterpret_cast<quantile::TDigest*>(*agg);
1734  t_digest->allocate();
1735  t_digest->add(val);
1736 }
1737 
1738 void GroupByAndAggregate::codegenCountDistinct(
1739  const size_t target_idx,
1740  const Analyzer::Expr* target_expr,
1741  std::vector<llvm::Value*>& agg_args,
1742  const QueryMemoryDescriptor& query_mem_desc,
1743  const ExecutorDeviceType device_type) {
1744  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1745  const auto agg_info = get_target_info(target_expr, g_bigint_count);
1746  const auto& arg_ti =
1747  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1748  if (arg_ti.is_fp()) {
1749  agg_args.back() = executor_->cgen_state_->ir_builder_.CreateBitCast(
1750  agg_args.back(), get_int_type(64, executor_->cgen_state_->context_));
1751  }
1752  const auto& count_distinct_descriptor =
1753  query_mem_desc.getCountDistinctDescriptor(target_idx);
1754  CHECK(count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid);
1755  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
1756  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1757  agg_args.push_back(LL_INT(int32_t(count_distinct_descriptor.bitmap_sz_bits)));
1758  if (device_type == ExecutorDeviceType::GPU) {
1759  const auto base_dev_addr = getAdditionalLiteral(-1);
1760  const auto base_host_addr = getAdditionalLiteral(-2);
1761  agg_args.push_back(base_dev_addr);
1762  agg_args.push_back(base_host_addr);
1763  emitCall("agg_approximate_count_distinct_gpu", agg_args);
1764  } else {
1765  emitCall("agg_approximate_count_distinct", agg_args);
1766  }
1767  return;
1768  }
1769  std::string agg_fname{"agg_count_distinct"};
1770  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1771  agg_fname += "_bitmap";
1772  agg_args.push_back(LL_INT(static_cast<int64_t>(count_distinct_descriptor.min_val)));
1773  }
1774  if (agg_info.skip_null_val) {
1775  auto null_lv = executor_->cgen_state_->castToTypeIn(
1776  (arg_ti.is_fp()
1777  ? static_cast<llvm::Value*>(executor_->cgen_state_->inlineFpNull(arg_ti))
1778  : static_cast<llvm::Value*>(executor_->cgen_state_->inlineIntNull(arg_ti))),
1779  64);
1780  null_lv = executor_->cgen_state_->ir_builder_.CreateBitCast(
1781  null_lv, get_int_type(64, executor_->cgen_state_->context_));
1782  agg_fname += "_skip_val";
1783  agg_args.push_back(null_lv);
1784  }
1785  if (device_type == ExecutorDeviceType::GPU) {
1786  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1787  agg_fname += "_gpu";
1788  const auto base_dev_addr = getAdditionalLiteral(-1);
1789  const auto base_host_addr = getAdditionalLiteral(-2);
1790  agg_args.push_back(base_dev_addr);
1791  agg_args.push_back(base_host_addr);
1792  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.sub_bitmap_count)));
1793  CHECK_EQ(size_t(0),
1794  count_distinct_descriptor.bitmapPaddedSizeBytes() %
1795  count_distinct_descriptor.sub_bitmap_count);
1796  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.bitmapPaddedSizeBytes() /
1797  count_distinct_descriptor.sub_bitmap_count)));
1798  }
1799  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1800  emitCall(agg_fname, agg_args);
1801  } else {
1802  executor_->cgen_state_->emitExternalCall(
1803  agg_fname, llvm::Type::getVoidTy(LL_CONTEXT), agg_args);
1804  }
1805 }
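// Example of what the branches above assemble for a nullable BIGINT argument
// using the bitmap implementation on GPU (descriptor values are hypothetical):
// with min_val == 0, sub_bitmap_count == 4 and bitmapPaddedSizeBytes() == 8192,
// the emitted call becomes
//
//   agg_count_distinct_bitmap_skip_val_gpu(<agg_args built by the caller>,
//                                          /*min_val=*/0,
//                                          /*skip_val=*/<inline null>,
//                                          base_dev_addr, base_host_addr,
//                                          /*sub_bitmap_count=*/4,
//                                          /*bytes_per_sub_bitmap=*/8192 / 4);
//
// For the non-bitmap (UnorderedSet) implementation the plain
// agg_count_distinct / agg_count_distinct_skip_val helpers defined above are
// called instead.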
1806 
1807 void GroupByAndAggregate::codegenApproxQuantile(
1808  const size_t target_idx,
1809  const Analyzer::Expr* target_expr,
1810  std::vector<llvm::Value*>& agg_args,
1811  const QueryMemoryDescriptor& query_mem_desc,
1812  const ExecutorDeviceType device_type) {
1813  if (device_type == ExecutorDeviceType::GPU) {
1814  throw QueryMustRunOnCpu();
1815  }
1816  llvm::BasicBlock *calc, *skip;
1817  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1818  auto const arg_ti =
1819  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1820  bool const nullable = !arg_ti.get_notnull();
1821 
1822  auto* cs = executor_->cgen_state_.get();
1823  auto& irb = cs->ir_builder_;
1824  if (nullable) {
1825  auto* const null_value = cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
1826  auto* const skip_cond = arg_ti.is_fp()
1827  ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
1828  : irb.CreateICmpEQ(agg_args.back(), null_value);
1829  calc = llvm::BasicBlock::Create(cs->context_, "calc_approx_quantile");
1830  skip = llvm::BasicBlock::Create(cs->context_, "skip_approx_quantile");
1831  irb.CreateCondBr(skip_cond, skip, calc);
1832  cs->current_func_->getBasicBlockList().push_back(calc);
1833  irb.SetInsertPoint(calc);
1834  }
1835  if (!arg_ti.is_fp()) {
1836  auto const agg_info = get_target_info(target_expr, g_bigint_count);
1837  agg_args.back() = executor_->castToFP(agg_args.back(), arg_ti, agg_info.sql_type);
1838  }
1839  cs->emitExternalCall(
1840  "agg_approx_quantile", llvm::Type::getVoidTy(cs->context_), agg_args);
1841  if (nullable) {
1842  irb.CreateBr(skip);
1843  cs->current_func_->getBasicBlockList().push_back(skip);
1844  irb.SetInsertPoint(skip);
1845  }
1846 }
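// The control flow generated above for a nullable argument is roughly
// (pseudo-C++ sketch of the emitted IR):
//
//   if (arg == null_sentinel) goto skip_approx_quantile;     // FCmpOEQ / ICmpEQ
//   // calc_approx_quantile:
//   agg_approx_quantile(agg_slot, static_cast<double>(arg)); // castToFP if integral
//   // skip_approx_quantile: fall through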
1847 
1848 llvm::Value* GroupByAndAggregate::getAdditionalLiteral(const int32_t off) {
1849  CHECK_LT(off, 0);
1850  const auto lit_buff_lv = get_arg_by_name(ROW_FUNC, "literals");
1851  auto* bit_cast = LL_BUILDER.CreateBitCast(
1852  lit_buff_lv, llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0));
1853  auto* gep =
1854  LL_BUILDER.CreateGEP(bit_cast->getType()->getScalarType()->getPointerElementType(),
1855  bit_cast,
1856  LL_INT(off));
1857  return LL_BUILDER.CreateLoad(gep->getType()->getPointerElementType(), gep);
1858 }
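// Note: the negative offset indexes backwards from the row function's
// "literals" argument, reading 64-bit values stored just before it. The GPU
// count-distinct path above relies on this, passing -1 for the bitmaps' device
// base address and -2 for the host base address.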
1859 
1860 std::vector<llvm::Value*> GroupByAndAggregate::codegenAggArg(
1861  const Analyzer::Expr* target_expr,
1862  const CompilationOptions& co) {
1863  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1864  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
1865  const auto func_expr = dynamic_cast<const Analyzer::FunctionOper*>(target_expr);
1866  const auto arr_expr = dynamic_cast<const Analyzer::ArrayExpr*>(target_expr);
1867 
1868  // TODO(alex): handle arrays uniformly?
1869  CodeGenerator code_generator(executor_);
1870  if (target_expr) {
1871  const auto& target_ti = target_expr->get_type_info();
1872  if (target_ti.is_buffer() &&
1873  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
1874  const auto target_lvs =
1875  agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
1876  : code_generator.codegen(
1877  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
1878  if (!func_expr && !arr_expr) {
1879  // This branch handles targets that come through the chunk transport, i.e. code
1880  // generated from a source other than an ARRAY[] expression
1881  if (target_ti.is_bytes()) {
1882  CHECK_EQ(size_t(3), target_lvs.size());
1883  return {target_lvs[1], target_lvs[2]};
1884  }
1885  CHECK(target_ti.is_array());
1886  CHECK_EQ(size_t(1), target_lvs.size());
1887  CHECK(!agg_expr || agg_expr->get_aggtype() == kSAMPLE);
1888  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
1889  const auto i8p_ty =
1890  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
1891  const auto& elem_ti = target_ti.get_elem_type();
1892  return {
1893  executor_->cgen_state_->emitExternalCall(
1894  "array_buff",
1895  i8p_ty,
1896  {target_lvs.front(), code_generator.posArg(target_expr)}),
1897  executor_->cgen_state_->emitExternalCall(
1898  "array_size",
1899  i32_ty,
1900  {target_lvs.front(),
1901  code_generator.posArg(target_expr),
1902  executor_->cgen_state_->llInt(log2_bytes(elem_ti.get_logical_size()))})};
1903  } else {
1904  if (agg_expr) {
1905  throw std::runtime_error(
1906  "Using array[] operator as argument to an aggregate operator is not "
1907  "supported");
1908  }
1909  CHECK(func_expr || arr_expr);
1910  if (dynamic_cast<const Analyzer::FunctionOper*>(target_expr)) {
1911  CHECK_EQ(size_t(1), target_lvs.size());
1912  const auto prefix = target_ti.get_buffer_name();
1913  CHECK(target_ti.is_array() || target_ti.is_bytes());
1914  const auto target_lv = LL_BUILDER.CreateLoad(
1915  target_lvs[0]->getType()->getPointerElementType(), target_lvs[0]);
1916  // const auto target_lv_type = target_lvs[0]->getType();
1917  // CHECK(target_lv_type->isStructTy());
1918  // CHECK_EQ(target_lv_type->getNumContainedTypes(), 3u);
1919  const auto i8p_ty = llvm::PointerType::get(
1920  get_int_type(8, executor_->cgen_state_->context_), 0);
1921  const auto ptr = LL_BUILDER.CreatePointerCast(
1922  LL_BUILDER.CreateExtractValue(target_lv, 0), i8p_ty);
1923  const auto size = LL_BUILDER.CreateExtractValue(target_lv, 1);
1924  const auto null_flag = LL_BUILDER.CreateExtractValue(target_lv, 2);
1925  const auto nullcheck_ok_bb =
1926  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_nullcheck_ok_bb", CUR_FUNC);
1927  const auto nullcheck_fail_bb = llvm::BasicBlock::Create(
1928  LL_CONTEXT, prefix + "_nullcheck_fail_bb", CUR_FUNC);
1929 
1930  // TODO(adb): probably better to zext the bool
1931  const auto nullcheck = LL_BUILDER.CreateICmpEQ(
1932  null_flag, executor_->cgen_state_->llInt(static_cast<int8_t>(1)));
1933  LL_BUILDER.CreateCondBr(nullcheck, nullcheck_fail_bb, nullcheck_ok_bb);
1934 
1935  const auto ret_bb =
1936  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_return", CUR_FUNC);
1937  LL_BUILDER.SetInsertPoint(ret_bb);
1938  auto result_phi = LL_BUILDER.CreatePHI(i8p_ty, 2, prefix + "_ptr_return");
1939  result_phi->addIncoming(ptr, nullcheck_ok_bb);
1940  const auto null_arr_sentinel = LL_BUILDER.CreateIntToPtr(
1941  executor_->cgen_state_->llInt(static_cast<int8_t>(0)), i8p_ty);
1942  result_phi->addIncoming(null_arr_sentinel, nullcheck_fail_bb);
1943  LL_BUILDER.SetInsertPoint(nullcheck_ok_bb);
1944  executor_->cgen_state_->emitExternalCall(
1945  "register_buffer_with_executor_rsm",
1946  llvm::Type::getVoidTy(executor_->cgen_state_->context_),
1947  {executor_->cgen_state_->llInt(reinterpret_cast<int64_t>(executor_)), ptr});
1948  LL_BUILDER.CreateBr(ret_bb);
1949  LL_BUILDER.SetInsertPoint(nullcheck_fail_bb);
1950  LL_BUILDER.CreateBr(ret_bb);
1951 
1952  LL_BUILDER.SetInsertPoint(ret_bb);
1953  return {result_phi, size};
1954  }
1955  CHECK_EQ(size_t(2), target_lvs.size());
1956  return {target_lvs[0], target_lvs[1]};
1957  }
1958  }
1959  if (target_ti.is_geometry() &&
1960  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
1961  auto generate_coord_lvs =
1962  [&](auto* selected_target_expr,
1963  bool const fetch_columns) -> std::vector<llvm::Value*> {
1964  const auto target_lvs =
1965  code_generator.codegen(selected_target_expr, fetch_columns, co);
1966  if (dynamic_cast<const Analyzer::GeoOperator*>(target_expr) &&
1967  target_expr->get_type_info().is_geometry()) {
1968  // return a pointer to the temporary alloca
1969  return target_lvs;
1970  }
1971  const auto geo_uoper = dynamic_cast<const Analyzer::GeoUOper*>(target_expr);
1972  const auto geo_binoper = dynamic_cast<const Analyzer::GeoBinOper*>(target_expr);
1973  if (geo_uoper || geo_binoper) {
1974  CHECK(target_expr->get_type_info().is_geometry());
1975  CHECK_EQ(2 * static_cast<size_t>(target_ti.get_physical_coord_cols()),
1976  target_lvs.size());
1977  return target_lvs;
1978  }
1979  CHECK_EQ(static_cast<size_t>(target_ti.get_physical_coord_cols()),
1980  target_lvs.size());
1981 
1982  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
1983  const auto i8p_ty =
1984  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
1985  std::vector<llvm::Value*> coords;
1986  size_t ctr = 0;
1987  for (const auto& target_lv : target_lvs) {
1988  // TODO(adb): consider adding a utility to sqltypes so we can get the types of
1989  // the physical coords cols based on the sqltype (e.g. TINYINT for col 0, INT
1990  // for col 1 for pols / mpolys, etc). Hardcoding for now: the first array is the
1991  // coords array (TINYINT); subsequent arrays are regular INT.
1992 
1993  const size_t elem_sz = ctr == 0 ? 1 : 4;
1994  ctr++;
1995  int32_t fixlen = -1;
1996  if (target_ti.get_type() == kPOINT) {
1997  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
1998  if (col_var) {
1999  const auto coords_cd = executor_->getPhysicalColumnDescriptor(col_var, 1);
2000  if (coords_cd && coords_cd->columnType.get_type() == kARRAY) {
2001  fixlen = coords_cd->columnType.get_size();
2002  }
2003  }
2004  }
2005  if (fixlen > 0) {
2006  coords.push_back(executor_->cgen_state_->emitExternalCall(
2007  "fast_fixlen_array_buff",
2008  i8p_ty,
2009  {target_lv, code_generator.posArg(selected_target_expr)}));
2010  coords.push_back(executor_->cgen_state_->llInt(int64_t(fixlen)));
2011  continue;
2012  }
2013  coords.push_back(executor_->cgen_state_->emitExternalCall(
2014  "array_buff",
2015  i8p_ty,
2016  {target_lv, code_generator.posArg(selected_target_expr)}));
2017  coords.push_back(executor_->cgen_state_->emitExternalCall(
2018  "array_size",
2019  i32_ty,
2020  {target_lv,
2021  code_generator.posArg(selected_target_expr),
2022  executor_->cgen_state_->llInt(log2_bytes(elem_sz))}));
2023  }
2024  return coords;
2025  };
2026 
2027  if (agg_expr) {
2028  return generate_coord_lvs(agg_expr->get_arg(), true);
2029  } else {
2030  return generate_coord_lvs(target_expr,
2031  !executor_->plan_state_->allow_lazy_fetch_);
2032  }
2033  }
2034  }
2035  return agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
2036  : code_generator.codegen(
2037  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
2038 }
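// Summary of the value shapes codegenAggArg returns (derived from the branches
// above):
//   * varlen TEXT (bytes), not lazily fetched -> {payload ptr, length}
//   * array column, not lazily fetched        -> {array_buff(ptr, pos), array_size(ptr, pos, log2(elem))}
//   * FunctionOper producing a buffer         -> {null-checked payload ptr (phi), size}
//   * ARRAY[] expression                      -> the two values produced by its codegen
//   * geometry column, not lazily fetched     -> one {buff, size} pair per physical coord
//                                                column, with a fast fixed-length path for
//                                                POINT coords
//   * everything else                         -> CodeGenerator::codegen on the argument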
2039 
2040 llvm::Value* GroupByAndAggregate::emitCall(const std::string& fname,
2041  const std::vector<llvm::Value*>& args) {
2042  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2043  return executor_->cgen_state_->emitCall(fname, args);
2044 }
2045 
2046 void GroupByAndAggregate::checkErrorCode(llvm::Value* retCode) {
2047  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2048  auto zero_const = llvm::ConstantInt::get(retCode->getType(), 0, true);
2049  auto rc_check_condition = executor_->cgen_state_->ir_builder_.CreateICmp(
2050  llvm::ICmpInst::ICMP_EQ, retCode, zero_const);
2051 
2052  executor_->cgen_state_->emitErrorCheck(rc_check_condition, retCode, "rc");
2053 }
2054 
2055 #undef CUR_FUNC
2056 #undef ROW_FUNC
2057 #undef LL_FP
2058 #undef LL_INT
2059 #undef LL_BOOL
2060 #undef LL_BUILDER
2061 #undef LL_CONTEXT
2062 
2063 size_t GroupByAndAggregate::shard_count_for_top_groups(
2064  const RelAlgExecutionUnit& ra_exe_unit,
2065  const Catalog_Namespace::Catalog& catalog) {
2066  if (ra_exe_unit.sort_info.order_entries.size() != 1 || !ra_exe_unit.sort_info.limit) {
2067  return 0;
2068  }
2069  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
2070  const auto grouped_col_expr =
2071  dynamic_cast<const Analyzer::ColumnVar*>(group_expr.get());
2072  if (!grouped_col_expr) {
2073  continue;
2074  }
2075  if (grouped_col_expr->get_table_id() <= 0) {
2076  return 0;
2077  }
2078  const auto td = catalog.getMetadataForTable(grouped_col_expr->get_table_id());
2079  if (td->shardedColumnId == grouped_col_expr->get_column_id()) {
2080  return td->nShards;
2081  }
2082  }
2083  return 0;
2084 }
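// Example (hypothetical schema): for
//   SELECT user_id, COUNT(*) FROM events GROUP BY user_id ORDER BY 2 DESC LIMIT 10
// with `events` sharded on user_id into 16 shards, the single order entry plus
// the limit and the match between the grouped column and shardedColumnId make
// this return td->nShards (16); any other query shape returns 0.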