OmniSciDB  72c90bc290
GroupByAndAggregate.cpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "GroupByAndAggregate.h"
18 #include "AggregateUtils.h"
19 
20 #include "CardinalityEstimator.h"
21 #include "CodeGenerator.h"
23 #include "ExpressionRange.h"
24 #include "ExpressionRewrite.h"
25 #include "GpuInitGroups.h"
26 #include "InPlaceSort.h"
28 #include "MaxwellCodegenPatch.h"
30 #include "TargetExprBuilder.h"
31 
32 #include "../CudaMgr/CudaMgr.h"
33 #include "../Shared/checked_alloc.h"
34 #include "../Shared/funcannotations.h"
35 #include "../Utils/ChunkIter.h"
37 #include "Execute.h"
38 #include "QueryTemplateGenerator.h"
39 #include "RuntimeFunctions.h"
40 #include "Shared/misc.h"
41 #include "StreamingTopN.h"
42 #include "TopKSort.h"
43 #include "WindowContext.h"
44 
45 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
46 
47 #include <cstring> // strcat()
48 #include <limits>
49 #include <numeric>
50 #include <string_view>
51 #include <thread>
52 
53 bool g_cluster{false};
54 bool g_bigint_count{false};
57 extern int64_t g_bitmap_memory_limit;
58 extern size_t g_leaf_count;
59 
60 bool ColRangeInfo::isEmpty() const {
61  return min == 0 && max == -1;
62 }
63 
64 std::ostream& operator<<(std::ostream& out, const ColRangeInfo& info) {
65  out << "Hash Type = " << info.hash_type_ << " min = " << info.min
66  << " max = " << info.max << " bucket = " << info.bucket
67  << " has_nulls = " << info.has_nulls << "\n";
68  return out;
69 }
70 
71 std::ostream& operator<<(std::ostream& out, const CountDistinctImplType& type) {
72  switch (type) {
73  case CountDistinctImplType::Invalid:
74  out << "Invalid";
75  break;
76  case CountDistinctImplType::Bitmap:
77  out << "Bitmap";
78  break;
79  case CountDistinctImplType::UnorderedSet:
80  out << "UnorderedSet";
81  break;
82  default:
83  out << "<Unknown Type>";
84  break;
85  }
86  return out;
87 }
88 
89 std::ostream& operator<<(std::ostream& out, const CountDistinctDescriptor& desc) {
90  out << "Type = " << desc.impl_type_ << " min val = " << desc.min_val
91  << " bitmap_sz_bits = " << desc.bitmap_sz_bits
92  << " bool approximate = " << desc.approximate
93  << " device_type = " << desc.device_type
94  << " sub_bitmap_count = " << desc.sub_bitmap_count;
95  return out;
96 }
97 
98 namespace {
99 
100 int32_t get_agg_count(const std::vector<Analyzer::Expr*>& target_exprs) {
101  int32_t agg_count{0};
102  for (auto target_expr : target_exprs) {
103  CHECK(target_expr);
104  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
105  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
106  const auto& ti = target_expr->get_type_info();
107  if (ti.is_buffer()) {
108  agg_count += 2;
109  } else if (ti.is_geometry()) {
110  agg_count += ti.get_physical_coord_cols() * 2;
111  } else {
112  ++agg_count;
113  }
114  continue;
115  }
116  if (agg_expr && agg_expr->get_aggtype() == kAVG) {
117  agg_count += 2;
118  } else {
119  ++agg_count;
120  }
121  }
122  return agg_count;
123 }
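// Worked example (hypothetical target list, not from the original source): for
// targets {x, AVG(y), COUNT(*), SAMPLE(arr)} where x is a scalar column and arr is
// an array column, get_agg_count() returns 1 + 2 + 1 + 2 = 6 slots: AVG needs a sum
// and a count slot, and SAMPLE over a buffer type needs a pointer and a length slot.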
124 
125 bool expr_is_rowid(const Analyzer::Expr* expr) {
126  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
127  if (!col) {
128  return false;
129  }
130  const auto cd = get_column_descriptor_maybe(col->getColumnKey());
131  if (!cd || !cd->isVirtualCol) {
132  return false;
133  }
134  CHECK_EQ("rowid", cd->columnName);
135  return true;
136 }
137 
138 bool has_count_distinct(const RelAlgExecutionUnit& ra_exe_unit) {
139  for (const auto& target_expr : ra_exe_unit.target_exprs) {
140  const auto agg_info = get_target_info(target_expr, g_bigint_count);
141  if (agg_info.is_agg && is_distinct_target(agg_info)) {
142  return true;
143  }
144  }
145  return false;
146 }
147 
148 bool is_column_range_too_big_for_perfect_hash(const ColRangeInfo& col_range_info,
149  const int64_t max_entry_count) {
150  try {
151  return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
152  checked_int64_t(col_range_info.min)) >= max_entry_count;
153  } catch (...) {
154  return true;
155  }
156 }
157 
158 bool cardinality_estimate_less_than_column_range(const int64_t cardinality_estimate,
159  const ColRangeInfo& col_range_info) {
160  try {
161  // the cardinality estimate is the size of the baseline hash table. further penalize
162  // the baseline hash table by a factor of 2x due to overhead in computing baseline
163  // hash. This has the overall effect of penalizing baseline hash over perfect hash by
164  // 4x; i.e. if the cardinality of the filtered data is less than 25% of the entry
165  // count of the column, we use baseline hash on the filtered set
166  return checked_int64_t(cardinality_estimate) * 2 <
167  static_cast<int64_t>(checked_int64_t(col_range_info.max) -
168  checked_int64_t(col_range_info.min));
169  } catch (...) {
170  return false;
171  }
172 }
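// Worked example (hypothetical numbers): for a column range of [0, 1'000'000) the
// span is 1'000'000; an NDV estimate of 200'000 gives 200'000 * 2 = 400'000, which
// is below the span, so the filtered cardinality is considered small enough to
// prefer baseline hash; an estimate of 600'000 (1'200'000 after the 2x penalty) is not.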
173 
174 ColRangeInfo get_expr_range_info(const RelAlgExecutionUnit& ra_exe_unit,
175  const std::vector<InputTableInfo>& query_infos,
176  const Analyzer::Expr* expr,
177  Executor* executor) {
178  if (!expr) {
179  return {QueryDescriptionType::Projection, 0, 0, 0, false};
180  }
181 
182  const auto expr_range = getExpressionRange(
183  expr, query_infos, executor, boost::make_optional(ra_exe_unit.simple_quals));
184  switch (expr_range.getType()) {
185  case ExpressionRangeType::Integer: {
186  if (expr_range.getIntMin() > expr_range.getIntMax()) {
187  return {
188  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
189  }
190  return {QueryDescriptionType::GroupByPerfectHash,
191  expr_range.getIntMin(),
192  expr_range.getIntMax(),
193  expr_range.getBucket(),
194  expr_range.hasNulls()};
195  }
196  case ExpressionRangeType::Float:
197  case ExpressionRangeType::Double: {
198  if (expr_range.getFpMin() > expr_range.getFpMax()) {
199  return {
200  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
201  }
202  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
203  }
204  case ExpressionRangeType::Invalid:
205  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
206  default:
207  CHECK(false);
208  }
209  CHECK(false);
210  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
211 }
212 
213 } // namespace
214 
215 ColRangeInfo GroupByAndAggregate::getColRangeInfo() {
216  // Use baseline layout more eagerly on the GPU if the query uses count distinct,
217  // because our HyperLogLog implementation is 4x less memory efficient on GPU.
218  // Technically, this only applies to APPROX_COUNT_DISTINCT, but in practice we
219  // can expect this to be true anyway for grouped queries since the precise version
220  // uses significantly more memory.
221  const int64_t baseline_threshold =
223  // `group_cardinality_estimation_` is set from the result of the (NDV) cardinality estimator
224  auto group_cardinality_estimation = group_cardinality_estimation_.value_or(0);
225  if (ra_exe_unit_.groupby_exprs.size() != 1) {
226  try {
227  checked_int64_t cardinality{1};
228  bool has_nulls{false};
229  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
230  auto col_range_info = get_expr_range_info(
231  ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
232  if (col_range_info.hash_type_ != QueryDescriptionType::GroupByPerfectHash) {
233  // going through baseline hash if a non-integer type is encountered
234  return {QueryDescriptionType::GroupByBaselineHash,
235  0,
236  group_cardinality_estimation,
237  0,
238  false};
239  }
240  auto crt_col_cardinality = getBucketedCardinality(col_range_info);
241  CHECK_GE(crt_col_cardinality, 0);
242  cardinality *= crt_col_cardinality;
243  if (col_range_info.has_nulls) {
244  has_nulls = true;
245  }
246  }
247  // For zero or high cardinalities, use baseline layout.
248  if (!cardinality || cardinality > baseline_threshold) {
249  return {QueryDescriptionType::GroupByBaselineHash,
250  0,
251  group_cardinality_estimation,
252  0,
253  false};
254  }
255  // todo (yoonmin) : should we consider min(group_cardinality_estimation,
256  // cardinality) if we have `group_cardinality_estimation` value?
257  return {QueryDescriptionType::GroupByPerfectHash,
258  0,
259  int64_t(cardinality),
260  0,
261  has_nulls};
262  } catch (...) { // overflow when computing cardinality
263  return {QueryDescriptionType::GroupByBaselineHash,
264  0,
265  group_cardinality_estimation,
266  0,
267  false};
268  }
269  }
270  // For single column groupby on high timestamps, force baseline hash due to wide ranges
271  // we are likely to encounter when applying quals to the expression range
272  // TODO: consider allowing TIMESTAMP(9) (nanoseconds) with quals to use perfect hash if
273  // the range is small enough
274  if (ra_exe_unit_.groupby_exprs.front() &&
275  ra_exe_unit_.groupby_exprs.front()->get_type_info().is_high_precision_timestamp() &&
276  ra_exe_unit_.simple_quals.size() > 0) {
277  return {QueryDescriptionType::GroupByBaselineHash,
278  0,
279  group_cardinality_estimation,
280  0,
281  false};
282  }
283  const auto col_range_info = get_expr_range_info(
284  ra_exe_unit_, query_infos_, ra_exe_unit_.groupby_exprs.front().get(), executor_);
285  if (!ra_exe_unit_.groupby_exprs.front()) {
286  return col_range_info;
287  }
288  static const int64_t MAX_BUFFER_SIZE = 1 << 30;
289  const int64_t col_count =
290  ra_exe_unit_.groupby_exprs.size() + ra_exe_unit_.target_exprs.size();
291  int64_t max_entry_count = MAX_BUFFER_SIZE / (col_count * sizeof(int64_t));
292  if (has_count_distinct(ra_exe_unit_)) {
293  max_entry_count = std::min(max_entry_count, baseline_threshold);
294  }
295  const auto& groupby_expr_ti = ra_exe_unit_.groupby_exprs.front()->get_type_info();
296  if (groupby_expr_ti.is_string() && !col_range_info.bucket) {
297  CHECK(groupby_expr_ti.get_compression() == kENCODING_DICT);
298 
299  const bool has_filters =
300  !ra_exe_unit_.quals.empty() || !ra_exe_unit_.simple_quals.empty();
301  if (has_filters &&
302  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
303  // if filters are present, we can use the filter to narrow the cardinality of the
304  // group by in the case of ranges too big for perfect hash. Otherwise, we are better
305  // off attempting perfect hash (since we know the range will be made of
306  // monotonically increasing numbers from min to max for dictionary encoded strings)
307  // and failing later due to excessive memory use.
308  // Check the conditions where baseline hash can provide a performance increase and
309  // return baseline hash (potentially forcing an estimator query) as the range type.
310  // Otherwise, return col_range_info which will likely be perfect hash, though could
311  // be baseline from a previous call of this function prior to the estimator query.
312  if (!ra_exe_unit_.sort_info.order_entries.empty()) {
313  // TODO(adb): allow some sorts to pass through this block by centralizing sort
314  // algorithm decision making
315  if (has_count_distinct(ra_exe_unit_)) {
316  // always use baseline hash for column range too big for perfect hash with count
317  // distinct descriptors. We will need 8GB of CPU memory minimum for the perfect
318  // hash group by in this case.
319  return {QueryDescriptionType::GroupByBaselineHash,
320  col_range_info.min,
321  col_range_info.max,
322  0,
323  col_range_info.has_nulls};
324  } else {
325  // use original col range for sort
326  return col_range_info;
327  }
328  }
329  // if filters are present and the filtered range is less than the cardinality of
330  // the column, consider baseline hash
333  col_range_info)) {
334  return {QueryDescriptionType::GroupByBaselineHash,
335  col_range_info.min,
336  col_range_info.max,
337  0,
338  col_range_info.has_nulls};
339  }
340  }
341  } else if ((!expr_is_rowid(ra_exe_unit_.groupby_exprs.front().get())) &&
342  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count) &&
343  !col_range_info.bucket) {
344  return {QueryDescriptionType::GroupByBaselineHash,
345  col_range_info.min,
346  col_range_info.max,
347  0,
348  col_range_info.has_nulls};
349  }
350  return col_range_info;
351 }
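// Illustrative outcomes (hypothetical queries, not from the original source):
//  - GROUP BY rowid keeps the perfect-hash col_range_info even for a very wide
//    range, because expr_is_rowid() short-circuits the "range too big" check above.
//  - A two-column GROUP BY whose per-column bucketed cardinalities are 100'000 and
//    50'000 yields a combined cardinality of 5'000'000'000, which comfortably
//    exceeds the baseline threshold, so the multi-column branch above returns
//    GroupByBaselineHash.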
352 
353 int64_t GroupByAndAggregate::getBucketedCardinality(const ColRangeInfo& col_range_info) {
354  checked_int64_t crt_col_cardinality =
355  checked_int64_t(col_range_info.max) - checked_int64_t(col_range_info.min);
356  if (col_range_info.bucket) {
357  crt_col_cardinality /= col_range_info.bucket;
358  }
359  return static_cast<int64_t>(crt_col_cardinality +
360  (1 + (col_range_info.has_nulls ? 1 : 0)));
361 }
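// Worked example (hypothetical range): min = 10, max = 99, bucket = 10,
// has_nulls = true gives (99 - 10) / 10 = 8, plus 1 for the max bucket and 1 for
// the null slot, i.e. a bucketed cardinality of 10.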
362 
363 namespace {
364 // Like getBucketedCardinality() without counting nulls.
365 int64_t get_bucketed_cardinality_without_nulls(const ColRangeInfo& col_range_info) {
366  if (col_range_info.min <= col_range_info.max) {
367  size_t size = col_range_info.max - col_range_info.min;
368  if (col_range_info.bucket) {
369  size /= col_range_info.bucket;
370  }
371  if (size >= static_cast<size_t>(std::numeric_limits<int64_t>::max())) {
372  // try to use unordered_set instead of crashing due to CHECK failure
373  // i.e., CHECK_LT(size, std::numeric_limits<int64_t>::max());
374  return 0;
375  }
376  return static_cast<int64_t>(size + 1);
377  } else {
378  return 0;
379  }
380 }
381 } // namespace
382 
383 #define LL_CONTEXT executor_->cgen_state_->context_
384 #define LL_BUILDER executor_->cgen_state_->ir_builder_
385 #define LL_BOOL(v) executor_->cgen_state_->llBool(v)
386 #define LL_INT(v) executor_->cgen_state_->llInt(v)
387 #define LL_FP(v) executor_->cgen_state_->llFp(v)
388 #define ROW_FUNC executor_->cgen_state_->row_func_
389 #define CUR_FUNC executor_->cgen_state_->current_func_
390 
391 GroupByAndAggregate::GroupByAndAggregate(
392  Executor* executor,
393  const ExecutorDeviceType device_type,
394  const RelAlgExecutionUnit& ra_exe_unit,
395  const std::vector<InputTableInfo>& query_infos,
396  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
397  const std::optional<int64_t>& group_cardinality_estimation)
398  : executor_(executor)
399  , ra_exe_unit_(ra_exe_unit)
400  , query_infos_(query_infos)
401  , row_set_mem_owner_(row_set_mem_owner)
402  , device_type_(device_type)
403  , group_cardinality_estimation_(group_cardinality_estimation) {
404  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
405  if (!groupby_expr) {
406  continue;
407  }
408  const auto& groupby_ti = groupby_expr->get_type_info();
409  if (groupby_ti.is_text_encoding_none()) {
410  throw std::runtime_error(
411  "Cannot group by string columns which are not dictionary encoded.");
412  }
413  if (groupby_ti.is_buffer()) {
414  throw std::runtime_error("Group by buffer not supported");
415  }
416  if (groupby_ti.is_geometry()) {
417  throw std::runtime_error("Group by geometry not supported");
418  }
419  }
420 }
421 
422 int64_t GroupByAndAggregate::getShardedTopBucket(const ColRangeInfo& col_range_info,
423  const size_t shard_count) const {
424  size_t device_count{0};
425  if (device_type_ == ExecutorDeviceType::GPU) {
426  device_count = executor_->cudaMgr()->getDeviceCount();
427  CHECK_GT(device_count, 0u);
428  }
429 
430  int64_t bucket{col_range_info.bucket};
431 
432  if (shard_count) {
433  CHECK(!col_range_info.bucket);
434  /*
435  when a node has fewer devices than shard count,
436  a) In a distributed setup, the minimum distance between two keys would be
437  device_count because shards are stored consecutively across the physical tables,
438  i.e. if a shard column has values 0 to 9, and 3 shards on each leaf, then node 1
439  would have values: 0,1,2,6,7,8 and node 2 would have values: 3,4,5,9. If each leaf
440  node has only 1 device, then all the keys from a node are loaded onto that node's
441  single device.
442 
443  b) In a single node setup, the distance would be the minimum of device_count and
444  the difference shard_count - device_count. For example: if a single node server
445  running on 3 devices has a shard column with values 0 to 9 in a table with 4
446  shards, the device to fragment keys mapping would be: device 1 - 4,8,3,7;
447  device 2 - 1,5,9; device 3 - 2,6. The bucket value would be 4 (shards) - 3 (devices)
448  = 1, i.e. the minimum of device_count and the difference.
449 
450  When a node has device count equal to or more than shard count then the
451  minimum distance is always at least shard_count * no of leaf nodes.
452  */
453  if (device_count < shard_count) {
454  bucket = g_leaf_count ? std::max(device_count, static_cast<size_t>(1))
455  : std::min(device_count, shard_count - device_count);
456  } else {
457  bucket = shard_count * std::max(g_leaf_count, static_cast<size_t>(1));
458  }
459  }
460 
461  return bucket;
462 }
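// Worked example (mirroring the comment above, hypothetical cluster sizes):
//  - single node, 3 devices, 4 shards: device_count < shard_count and g_leaf_count
//    is 0, so bucket = min(3, 4 - 3) = 1.
//  - 2 leaves, 4 devices each, 3 shards per leaf: device_count >= shard_count,
//    so bucket = 3 * max(2, 1) = 6.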
463 
464 namespace {
465 
475 KeylessInfo get_keyless_info(const RelAlgExecutionUnit& ra_exe_unit,
476  const std::vector<InputTableInfo>& query_infos,
477  const bool is_group_by,
478  Executor* executor) {
479  bool keyless{true}, found{false};
480  int32_t num_agg_expr{0};
481  int32_t index{0};
482  for (const auto target_expr : ra_exe_unit.target_exprs) {
483  const auto agg_info = get_target_info(target_expr, g_bigint_count);
484  const auto chosen_type = get_compact_type(agg_info);
485  if (agg_info.is_agg) {
486  num_agg_expr++;
487  }
488  if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
489  auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
490  CHECK(agg_expr);
491  const auto arg_expr = agg_arg(target_expr);
492  const bool float_argument_input = takes_float_argument(agg_info);
493  switch (agg_info.agg_kind) {
494  case kAVG:
495  ++index;
496  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
497  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
498  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
499  expr_range_info.hasNulls()) {
500  break;
501  }
502  }
503  found = true;
504  break;
505  case kCOUNT:
506  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
507  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
508  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
509  expr_range_info.hasNulls()) {
510  break;
511  }
512  }
513  found = true;
514  break;
515  case kSUM: {
516  auto arg_ti = arg_expr->get_type_info();
517  if (constrained_not_null(arg_expr, ra_exe_unit.quals)) {
518  arg_ti.set_notnull(true);
519  }
520  if (!arg_ti.get_notnull()) {
521  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
522  if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
523  !expr_range_info.hasNulls()) {
524  found = true;
525  }
526  } else {
527  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
528  switch (expr_range_info.getType()) {
529  case ExpressionRangeType::Float:
530  case ExpressionRangeType::Double:
531  if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
532  found = true;
533  }
534  break;
535  case ExpressionRangeType::Integer:
536  if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
537  found = true;
538  }
539  break;
540  default:
541  break;
542  }
543  }
544  break;
545  }
546  case kMIN: {
547  CHECK(agg_expr && agg_expr->get_arg());
548  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
549  if (arg_ti.is_string() || arg_ti.is_buffer()) {
550  break;
551  }
552  auto expr_range_info =
553  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
554  auto init_max = get_agg_initial_val(agg_info.agg_kind,
555  chosen_type,
556  is_group_by || float_argument_input,
557  float_argument_input ? sizeof(float) : 8);
558  switch (expr_range_info.getType()) {
559  case ExpressionRangeType::Float:
560  case ExpressionRangeType::Double: {
561  auto double_max =
562  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
563  if (expr_range_info.getFpMax() < double_max) {
564  found = true;
565  }
566  break;
567  }
568  case ExpressionRangeType::Integer:
569  if (expr_range_info.getIntMax() < init_max) {
570  found = true;
571  }
572  break;
573  default:
574  break;
575  }
576  break;
577  }
578  case kMAX: {
579  CHECK(agg_expr && agg_expr->get_arg());
580  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
581  if (arg_ti.is_string() || arg_ti.is_buffer()) {
582  break;
583  }
584  auto expr_range_info =
585  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
586  // NULL sentinel and init value for kMAX are identical, which results in
587  // ambiguity in detecting empty keys in presence of nulls.
588  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
589  expr_range_info.hasNulls()) {
590  break;
591  }
592  auto init_min = get_agg_initial_val(agg_info.agg_kind,
593  chosen_type,
594  is_group_by || float_argument_input,
595  float_argument_input ? sizeof(float) : 8);
596  switch (expr_range_info.getType()) {
597  case ExpressionRangeType::Float:
598  case ExpressionRangeType::Double: {
599  auto double_min =
600  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
601  if (expr_range_info.getFpMin() > double_min) {
602  found = true;
603  }
604  break;
605  }
606  case ExpressionRangeType::Integer:
607  if (expr_range_info.getIntMin() > init_min) {
608  found = true;
609  }
610  break;
611  default:
612  break;
613  }
614  break;
615  }
616  default:
617  keyless = false;
618  break;
619  }
620  }
621  if (!keyless) {
622  break;
623  }
624  if (!found) {
625  ++index;
626  }
627  }
628 
629  // shouldn't use keyless for projection only
630  return {
631  keyless && found,
632  index,
633  };
634 }
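// Illustrative outcome (hypothetical query, not from the original source): for
// SELECT x, COUNT(*), SUM(y) FROM t GROUP BY x, the COUNT(*) target has no argument
// and always qualifies, so the function returns keyless = true with `index` pointing
// at the COUNT(*) slot (targets before it that do not qualify each advance the index
// by one). If the only aggregate were MIN(str_col), no target qualifies and keyless
// hash is disabled.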
635 
636 CountDistinctDescriptors init_count_distinct_descriptors(
637  const RelAlgExecutionUnit& ra_exe_unit,
638  const std::vector<InputTableInfo>& query_infos,
639  const ColRangeInfo& group_by_range_info,
640  const ExecutorDeviceType device_type,
641  Executor* executor) {
642  CountDistinctDescriptors count_distinct_descriptors;
643  auto compute_bytes_per_group =
644  [](size_t bitmap_sz, size_t sub_bitmap_count, ExecutorDeviceType device_type) {
645  size_t effective_size_bytes = (bitmap_sz + 7) / 8;
646  const auto padded_size =
647  (device_type == ExecutorDeviceType::GPU || sub_bitmap_count > 1)
648  ? align_to_int64(effective_size_bytes)
649  : effective_size_bytes;
650  return padded_size * sub_bitmap_count;
651  };
652  for (size_t i = 0; i < ra_exe_unit.target_exprs.size(); i++) {
653  const auto target_expr = ra_exe_unit.target_exprs[i];
654  auto agg_info = get_target_info(target_expr, g_bigint_count);
655  if (is_distinct_target(agg_info)) {
656  CHECK(agg_info.is_agg);
657  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
658  const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
659  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
660  if (arg_ti.is_text_encoding_none()) {
661  throw std::runtime_error(
662  "Strings must be dictionary-encoded for COUNT(DISTINCT).");
663  }
664  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_buffer()) {
665  throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
666  }
667  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
668  throw std::runtime_error(
669  "APPROX_COUNT_DISTINCT on geometry columns not supported");
670  }
671  if (agg_info.is_distinct && arg_ti.is_geometry()) {
672  throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
673  }
674  ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
675  auto arg_range_info =
676  arg_ti.is_fp() ? no_range_info
677  : get_expr_range_info(
678  ra_exe_unit, query_infos, agg_expr->get_arg(), executor);
679  const auto it = ra_exe_unit.target_exprs_original_type_infos.find(i);
680  if (it != ra_exe_unit.target_exprs_original_type_infos.end()) {
681  const auto& original_target_expr_ti = it->second;
682  if (arg_ti.is_integer() && original_target_expr_ti.get_type() == kDATE &&
683  original_target_expr_ti.get_compression() == kENCODING_DATE_IN_DAYS) {
684  // manually encode the col range of date col if necessary
685  // (see conditionally_change_arg_to_int_type function in RelAlgExecutor.cpp)
686  auto is_date_value_not_encoded = [&original_target_expr_ti](int64_t date_val) {
687  if (original_target_expr_ti.get_comp_param() == 16) {
688  return date_val < INT16_MIN || date_val > INT16_MAX;
689  } else {
690  return date_val < INT32_MIN || date_val > INT32_MAX;
691  }
692  };
693  if (is_date_value_not_encoded(arg_range_info.min)) {
694  // chunk metadata of the date column contains decoded value
695  // so we manually encode it again here to represent its column range correctly
696  arg_range_info.min =
697  DateConverters::get_epoch_days_from_seconds(arg_range_info.min);
698  }
699  if (is_date_value_not_encoded(arg_range_info.max)) {
700  arg_range_info.max =
701  DateConverters::get_epoch_days_from_seconds(arg_range_info.max);
702  }
703  // now we manually encode the value, so we need to invalidate bucket value
704  // i.e., 86400 -> 0, to correctly calculate the size of bitmap
705  arg_range_info.bucket = 0;
706  }
707  }
708 
709  CountDistinctImplType count_distinct_impl_type{CountDistinctImplType::UnorderedSet};
710  int64_t bitmap_sz_bits{0};
711  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
712  const auto error_rate_expr = agg_expr->get_arg1();
713  if (error_rate_expr) {
714  CHECK(error_rate_expr->get_type_info().get_type() == kINT);
715  auto const error_rate =
716  dynamic_cast<Analyzer::Constant const*>(error_rate_expr.get());
717  CHECK(error_rate);
718  CHECK_GE(error_rate->get_constval().intval, 1);
719  bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
720  } else {
721  bitmap_sz_bits = g_hll_precision_bits;
722  }
723  }
724  if (arg_range_info.isEmpty()) {
725  count_distinct_descriptors.emplace_back(
726  CountDistinctDescriptor{CountDistinctImplType::Bitmap,
727  0,
728  arg_range_info.bucket,
729  64,
730  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
731  device_type,
732  1});
733  continue;
734  }
735  const auto sub_bitmap_count =
736  get_count_distinct_sub_bitmap_count(bitmap_sz_bits, ra_exe_unit, device_type);
737  size_t worst_case_num_groups{1};
738  if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
739  !(arg_ti.is_buffer() || arg_ti.is_geometry())) { // TODO(alex): allow bitmap
740  // implementation for arrays
741  count_distinct_impl_type = CountDistinctImplType::Bitmap;
742  if (shared::is_any<kCOUNT, kCOUNT_IF>(agg_info.agg_kind)) {
743  bitmap_sz_bits = get_bucketed_cardinality_without_nulls(arg_range_info);
744  if (bitmap_sz_bits <= 0 || g_bitmap_memory_limit <= bitmap_sz_bits) {
745  count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
746  }
747  // check a potential OOM when using bitmap-based approach
748  const auto total_bytes_per_entry =
749  compute_bytes_per_group(bitmap_sz_bits, sub_bitmap_count, device_type);
750  const auto range_bucket = std::max(group_by_range_info.bucket, (int64_t)1);
751  const auto maximum_num_groups =
752  (group_by_range_info.max - group_by_range_info.min + 1) / range_bucket;
753  const auto total_bitmap_bytes_for_groups =
754  total_bytes_per_entry * maximum_num_groups;
755  // we can estimate a potential OOM of bitmap-based count-distinct operator
756  // by using the logic "check_total_bitmap_memory"
757  if (total_bitmap_bytes_for_groups >=
758  static_cast<size_t>(g_bitmap_memory_limit)) {
759  const auto agg_expr_max_entry_count =
760  arg_range_info.max - arg_range_info.min + 1;
761  int64_t max_agg_expr_table_cardinality{1};
762  std::set<const Analyzer::ColumnVar*,
763  bool (*)(const Analyzer::ColumnVar*, const Analyzer::ColumnVar*)>
764  colvar_set(Analyzer::ColumnVar::colvar_comp);
765  agg_expr->collect_column_var(colvar_set, true);
766  for (const auto cv : colvar_set) {
767  auto it =
768  std::find_if(query_infos.begin(),
769  query_infos.end(),
770  [&](const auto& input_table_info) {
771  return input_table_info.table_key == cv->getTableKey();
772  });
773  int64_t cur_table_cardinality =
774  it != query_infos.end()
775  ? static_cast<int64_t>(it->info.getNumTuplesUpperBound())
776  : -1;
777  max_agg_expr_table_cardinality =
778  std::max(max_agg_expr_table_cardinality, cur_table_cardinality);
779  worst_case_num_groups *= cur_table_cardinality;
780  }
781  auto has_valid_stat = [agg_expr_max_entry_count, maximum_num_groups]() {
782  return agg_expr_max_entry_count > 0 && maximum_num_groups > 0;
783  };
784  // if we have valid stats regarding input expr, we can try to relax the OOM
785  if (has_valid_stat()) {
786  // a threshold related to a ratio of a range of agg expr (let's say R)
787  // and table cardinality (C), i.e., use unordered_set if the # bits to build
788  // a bitmap based on R is four times larger than that of C
789  const size_t unordered_set_threshold{2};
790  // When we detect OOM of bitmap-based approach we selectively switch it to
791  // hash set-based processing logic if one of the following is satisfied:
792  // 1) the column range is too wide compared with the table cardinality, or
793  // 2) the column range is too wide compared with the avg of # unique values
794  // per group by entry
795  const auto bits_for_agg_entry = std::ceil(log(agg_expr_max_entry_count));
796  const auto bits_for_agg_table =
797  std::ceil(log(max_agg_expr_table_cardinality));
798  const auto avg_num_unique_entries_per_group =
799  std::ceil(max_agg_expr_table_cardinality / maximum_num_groups);
800  // case a) given a range of entry count of agg_expr and the maximum
801  // cardinality among source tables of the agg_expr, we try to detect the
802  // misleading case of a too-sparse column range, i.e., agg_expr has a 1M column
803  // range but only two tuples {1 and 1M}. Case b) checks whether
804  // using bitmap is really beneficial when considering uniform distribution
805  // of (unique) keys.
806  if ((bits_for_agg_entry - bits_for_agg_table) >= unordered_set_threshold ||
807  agg_expr_max_entry_count >= avg_num_unique_entries_per_group) {
808  count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
809  } else {
810  throw std::runtime_error(
811  "Consider using approx_count_distinct operator instead of "
812  "count_distinct operator to lower the memory "
813  "requirements");
814  }
815  }
816  }
817  }
818  }
819  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
820  count_distinct_impl_type == CountDistinctImplType::UnorderedSet &&
821  !(arg_ti.is_array() || arg_ti.is_geometry())) {
822  count_distinct_impl_type = CountDistinctImplType::Bitmap;
823  }
824  const size_t too_many_entries{100000000};
825  if (g_enable_watchdog && !(arg_range_info.isEmpty()) &&
826  worst_case_num_groups > too_many_entries &&
827  count_distinct_impl_type == CountDistinctImplType::UnorderedSet) {
828  throw WatchdogException(
829  "Detect too many input entries for set-based count distinct operator under "
830  "the watchdog");
831  }
832  count_distinct_descriptors.emplace_back(
833  CountDistinctDescriptor{count_distinct_impl_type,
834  arg_range_info.min,
835  arg_range_info.bucket,
836  bitmap_sz_bits,
837  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
838  device_type,
839  sub_bitmap_count});
840  } else {
841  count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
842  CountDistinctImplType::Invalid, 0, 0, 0, false, device_type, 0});
843  }
844  }
845  return count_distinct_descriptors;
846 }
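// Worked example of the bitmap sizing above (hypothetical ranges): a
// COUNT(DISTINCT x) where x spans [0, 999'999] with no bucket gives
// bitmap_sz_bits = 1'000'000, i.e. 125'000 bytes per group after rounding; with
// roughly 100'000 group-by entries that is ~12.5 GB of bitmaps, so if
// g_bitmap_memory_limit is smaller the code either falls back to UnorderedSet
// (sparse range or many unique values per group) or asks the user to switch to
// approx_count_distinct.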
847 
848 } // namespace
849 
850 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptor(
851  const bool allow_multifrag,
852  const size_t max_groups_buffer_entry_count,
853  const int8_t crt_min_byte_width,
854  RenderInfo* render_info,
855  const bool output_columnar_hint) {
856  const auto shard_count = device_type_ == ExecutorDeviceType::GPU
858  : 0;
859  bool sort_on_gpu_hint =
860  device_type_ == ExecutorDeviceType::GPU && allow_multifrag &&
861  !ra_exe_unit_.sort_info.order_entries.empty() &&
862  gpuCanHandleOrderEntries(ra_exe_unit_.sort_info.order_entries) && !shard_count;
863  // must_use_baseline_sort is true iff we'd sort on GPU with the old algorithm
864  // but the total output buffer size would be too big or it's a sharded top query.
865  // For the sake of managing risk, use the new result set way very selectively for
866  // this case only (alongside the baseline layout we've enabled for a while now).
867  bool must_use_baseline_sort = shard_count;
868  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
869  while (true) {
870  query_mem_desc = initQueryMemoryDescriptorImpl(allow_multifrag,
871  max_groups_buffer_entry_count,
872  crt_min_byte_width,
873  sort_on_gpu_hint,
874  render_info,
875  must_use_baseline_sort,
876  output_columnar_hint);
877  CHECK(query_mem_desc);
878  if (query_mem_desc->sortOnGpu() &&
879  (query_mem_desc->getBufferSizeBytes(device_type_) +
880  align_to_int64(query_mem_desc->getEntryCount() * sizeof(int32_t))) >
881  2 * 1024 * 1024 * 1024LL) {
882  must_use_baseline_sort = true;
883  sort_on_gpu_hint = false;
884  } else {
885  break;
886  }
887  }
888  return query_mem_desc;
889 }
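// Note on the loop above: the retry only triggers for GPU-sorted plans whose output
// buffer plus the int32 index buffer would exceed 2 GB, e.g. an entry count of 40M
// rows at 64 bytes per row (~2.56 GB) forces must_use_baseline_sort and a second
// pass without the GPU sort hint. (Sizes here are illustrative.)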
890 
891 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptorImpl(
892  const bool allow_multifrag,
893  const size_t max_groups_buffer_entry_count,
894  const int8_t crt_min_byte_width,
895  const bool sort_on_gpu_hint,
896  RenderInfo* render_info,
897  const bool must_use_baseline_sort,
898  const bool output_columnar_hint) {
899  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
900 
901  const bool threads_can_reuse_group_by_buffers =
902  device_type_ == ExecutorDeviceType::CPU && is_group_by &&
903  ra_exe_unit_.groupby_exprs.front();
904 
905  auto col_range_info_nosharding = getColRangeInfo();
906 
907  const auto shard_count = device_type_ == ExecutorDeviceType::GPU
909  : 0;
910 
911  const auto col_range_info =
912  ColRangeInfo{col_range_info_nosharding.hash_type_,
913  col_range_info_nosharding.min,
914  col_range_info_nosharding.max,
915  getShardedTopBucket(col_range_info_nosharding, shard_count),
916  col_range_info_nosharding.has_nulls};
917 
918  // Non-grouped aggregates do not support accessing aggregated ranges
919  // Keyless hash is currently only supported with single-column perfect hash
920  const auto keyless_info =
921  !(is_group_by &&
922  col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash)
923  ? KeylessInfo{false, -1}
924  : get_keyless_info(ra_exe_unit_, query_infos_, is_group_by, executor_);
925 
926  if (g_enable_watchdog &&
927  ((col_range_info.hash_type_ == QueryDescriptionType::GroupByBaselineHash &&
928  max_groups_buffer_entry_count > g_watchdog_baseline_max_groups) ||
929  (col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
930  ra_exe_unit_.groupby_exprs.size() == 1 &&
931  (col_range_info.max - col_range_info.min) /
932  std::max(col_range_info.bucket, int64_t(1)) >
933  130000000))) {
934  throw WatchdogException("Query would use too much memory");
935  }
936 
937  const auto count_distinct_descriptors = init_count_distinct_descriptors(
938  ra_exe_unit_, query_infos_, col_range_info, device_type_, executor_);
939  try {
940  return QueryMemoryDescriptor::init(executor_,
941  ra_exe_unit_,
942  query_infos_,
943  col_range_info,
944  keyless_info,
945  allow_multifrag,
946  device_type_,
947  crt_min_byte_width,
948  sort_on_gpu_hint,
949  shard_count,
950  max_groups_buffer_entry_count,
951  render_info,
952  count_distinct_descriptors,
953  must_use_baseline_sort,
954  output_columnar_hint,
955  /*streaming_top_n_hint=*/true,
956  threads_can_reuse_group_by_buffers);
957  } catch (const StreamingTopNOOM& e) {
958  LOG(WARNING) << e.what() << " Disabling Streaming Top N.";
959  return QueryMemoryDescriptor::init(executor_,
960  ra_exe_unit_,
961  query_infos_,
962  col_range_info,
963  keyless_info,
964  allow_multifrag,
965  device_type_,
966  crt_min_byte_width,
967  sort_on_gpu_hint,
968  shard_count,
969  max_groups_buffer_entry_count,
970  render_info,
971  count_distinct_descriptors,
972  must_use_baseline_sort,
973  output_columnar_hint,
974  /*streaming_top_n_hint=*/false,
975  threads_can_reuse_group_by_buffers);
976  }
977 }
978 
979 bool GroupByAndAggregate::gpuCanHandleOrderEntries(
980  const std::list<Analyzer::OrderEntry>& order_entries) {
981  if (order_entries.size() > 1) { // TODO(alex): lift this restriction
982  return false;
983  }
984  for (const auto& order_entry : order_entries) {
985  CHECK_GE(order_entry.tle_no, 1);
986  CHECK_LE(static_cast<size_t>(order_entry.tle_no), ra_exe_unit_.target_exprs.size());
987  const auto target_expr = ra_exe_unit_.target_exprs[order_entry.tle_no - 1];
988  if (!dynamic_cast<Analyzer::AggExpr*>(target_expr)) {
989  return false;
990  }
991  // TODO(alex): relax the restrictions
992  auto agg_expr = static_cast<Analyzer::AggExpr*>(target_expr);
993  if (agg_expr->get_is_distinct() || agg_expr->get_aggtype() == kAVG ||
994  agg_expr->get_aggtype() == kMIN || agg_expr->get_aggtype() == kMAX ||
995  agg_expr->get_aggtype() == kAPPROX_COUNT_DISTINCT) {
996  return false;
997  }
998  if (agg_expr->get_arg()) {
999  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
1000  if (arg_ti.is_fp()) {
1001  return false;
1002  }
1003  auto expr_range_info =
1004  get_expr_range_info(ra_exe_unit_, query_infos_, agg_expr->get_arg(), executor_);
1005  // TODO(adb): QMD not actually initialized here?
1006  if ((!(expr_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
1007  /* query_mem_desc.getGroupbyColCount() == 1 */ false) ||
1008  expr_range_info.has_nulls) &&
1009  order_entry.is_desc == order_entry.nulls_first) {
1010  return false;
1011  }
1012  }
1013  const auto& target_ti = target_expr->get_type_info();
1014  CHECK(!target_ti.is_buffer());
1015  if (!target_ti.is_integer()) {
1016  return false;
1017  }
1018  }
1019  return true;
1020 }
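// Illustrative consequence (hypothetical queries): ORDER BY SUM(int_col) LIMIT n
// can keep the GPU sort path, while ORDER BY AVG(x), ORDER BY MIN(x), ordering by
// more than one expression, or ordering by a float or non-aggregate target all
// fall back (the function returns false and the GPU sort hint is not taken).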
1021 
1022 bool GroupByAndAggregate::codegen(llvm::Value* filter_result,
1023  llvm::BasicBlock* sc_false,
1024  QueryMemoryDescriptor& query_mem_desc,
1025  const CompilationOptions& co,
1026  const GpuSharedMemoryContext& gpu_smem_context) {
1027  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1028  CHECK(filter_result);
1029 
1030  bool can_return_error = false;
1031  llvm::BasicBlock* filter_false{nullptr};
1032 
1033  {
1034  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
1035 
1036  if (executor_->isArchMaxwell(co.device_type)) {
1037  prependForceSync();
1038  }
1039  DiamondCodegen filter_cfg(filter_result,
1040  executor_,
1041  !is_group_by || query_mem_desc.usesGetGroupValueFast(),
1042  "filter", // filter_true and filter_false basic blocks
1043  nullptr,
1044  false);
1045  filter_false = filter_cfg.cond_false_;
1046 
1047  if (is_group_by) {
1048  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection &&
1049  !query_mem_desc.useStreamingTopN()) {
1050  const auto crt_matched = get_arg_by_name(ROW_FUNC, "crt_matched");
1051  LL_BUILDER.CreateStore(LL_INT(int32_t(1)), crt_matched);
1052  auto total_matched_ptr = get_arg_by_name(ROW_FUNC, "total_matched");
1053  llvm::Value* old_total_matched_val{nullptr};
1054  if (query_mem_desc.threadsShareMemory()) {
1055  old_total_matched_val =
1056  LL_BUILDER.CreateAtomicRMW(llvm::AtomicRMWInst::Add,
1057  total_matched_ptr,
1058  LL_INT(int32_t(1)),
1059 #if LLVM_VERSION_MAJOR > 12
1060  LLVM_ALIGN(8),
1061 #endif
1062  llvm::AtomicOrdering::Monotonic);
1063  } else {
1064  old_total_matched_val = LL_BUILDER.CreateLoad(
1065  total_matched_ptr->getType()->getPointerElementType(), total_matched_ptr);
1066  LL_BUILDER.CreateStore(
1067  LL_BUILDER.CreateAdd(old_total_matched_val, LL_INT(int32_t(1))),
1068  total_matched_ptr);
1069  }
1070  auto old_total_matched_ptr = get_arg_by_name(ROW_FUNC, "old_total_matched");
1071  LL_BUILDER.CreateStore(old_total_matched_val, old_total_matched_ptr);
1072  }
1073 
1074  auto agg_out_ptr_w_idx = codegenGroupBy(query_mem_desc, co, filter_cfg);
1075  auto varlen_output_buffer = codegenVarlenOutputBuffer(query_mem_desc);
1076  if (query_mem_desc.usesGetGroupValueFast() ||
1077  query_mem_desc.getQueryDescriptionType() ==
1078  QueryDescriptionType::GroupByPerfectHash) {
1079  if (query_mem_desc.getGroupbyColCount() > 1) {
1080  filter_cfg.setChainToNext();
1081  }
1082  // Don't generate null checks if the group slot is guaranteed to be non-null,
1083  // as it's the case for get_group_value_fast* family.
1084  can_return_error = codegenAggCalls(agg_out_ptr_w_idx,
1085  varlen_output_buffer,
1086  {},
1087  query_mem_desc,
1088  co,
1089  gpu_smem_context,
1090  filter_cfg);
1091  } else {
1092  {
1093  llvm::Value* nullcheck_cond{nullptr};
1094  if (query_mem_desc.didOutputColumnar()) {
1095  nullcheck_cond = LL_BUILDER.CreateICmpSGE(std::get<1>(agg_out_ptr_w_idx),
1096  LL_INT(int32_t(0)));
1097  } else {
1098  nullcheck_cond = LL_BUILDER.CreateICmpNE(
1099  std::get<0>(agg_out_ptr_w_idx),
1100  llvm::ConstantPointerNull::get(
1101  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)));
1102  }
1103  DiamondCodegen nullcheck_cfg(
1104  nullcheck_cond, executor_, false, "groupby_nullcheck", &filter_cfg, false);
1105  codegenAggCalls(agg_out_ptr_w_idx,
1106  varlen_output_buffer,
1107  {},
1108  query_mem_desc,
1109  co,
1110  gpu_smem_context,
1111  filter_cfg);
1112  }
1113  can_return_error = true;
1114  if (query_mem_desc.getQueryDescriptionType() ==
1115  QueryDescriptionType::Projection &&
1116  query_mem_desc.useStreamingTopN()) {
1117  // Ignore rejection on pushing current row to top-K heap.
1118  LL_BUILDER.CreateRet(LL_INT(int32_t(0)));
1119  } else {
1120  CodeGenerator code_generator(executor_);
1121  LL_BUILDER.CreateRet(LL_BUILDER.CreateNeg(LL_BUILDER.CreateTrunc(
1122  // TODO(alex): remove the trunc once pos is converted to 32 bits
1123  code_generator.posArg(nullptr),
1124  get_int_type(32, LL_CONTEXT))));
1125  }
1126  }
1127  } else {
1128  if (ra_exe_unit_.estimator) {
1129  std::stack<llvm::BasicBlock*> array_loops;
1130  codegenEstimator(array_loops, filter_cfg, query_mem_desc, co);
1131  } else {
1132  auto arg_it = ROW_FUNC->arg_begin();
1133  std::vector<llvm::Value*> agg_out_vec;
1134  for (int32_t i = 0; i < get_agg_count(ra_exe_unit_.target_exprs); ++i) {
1135  agg_out_vec.push_back(&*arg_it++);
1136  }
1137  can_return_error = codegenAggCalls(std::make_tuple(nullptr, nullptr),
1138  /*varlen_output_buffer=*/nullptr,
1139  agg_out_vec,
1140  query_mem_desc,
1141  co,
1142  gpu_smem_context,
1143  filter_cfg);
1144  }
1145  }
1146  }
1147 
1148  if (ra_exe_unit_.join_quals.empty()) {
1149  executor_->cgen_state_->ir_builder_.CreateRet(LL_INT(int32_t(0)));
1150  } else if (sc_false) {
1151  const auto saved_insert_block = LL_BUILDER.GetInsertBlock();
1152  LL_BUILDER.SetInsertPoint(sc_false);
1153  LL_BUILDER.CreateBr(filter_false);
1154  LL_BUILDER.SetInsertPoint(saved_insert_block);
1155  }
1156 
1157  return can_return_error;
1158 }
1159 
1160 llvm::Value* GroupByAndAggregate::codegenOutputSlot(
1161  llvm::Value* groups_buffer,
1162  const QueryMemoryDescriptor& query_mem_desc,
1163  const CompilationOptions& co,
1164  DiamondCodegen& diamond_codegen) {
1165  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1166  CHECK(query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection);
1167  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1168  const auto group_expr = ra_exe_unit_.groupby_exprs.front();
1169  CHECK(!group_expr);
1170  if (!query_mem_desc.didOutputColumnar()) {
1171  CHECK_EQ(size_t(0), query_mem_desc.getRowSize() % sizeof(int64_t));
1172  }
1173  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1174  ? 0
1175  : query_mem_desc.getRowSize() / sizeof(int64_t);
1176  CodeGenerator code_generator(executor_);
1177  if (query_mem_desc.useStreamingTopN()) {
1178  const auto& only_order_entry = ra_exe_unit_.sort_info.order_entries.front();
1179  CHECK_GE(only_order_entry.tle_no, int(1));
1180  const size_t target_idx = only_order_entry.tle_no - 1;
1181  CHECK_LT(target_idx, ra_exe_unit_.target_exprs.size());
1182  const auto order_entry_expr = ra_exe_unit_.target_exprs[target_idx];
1183  const auto chosen_bytes =
1184  static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(target_idx));
1185  auto order_entry_lv = executor_->cgen_state_->castToTypeIn(
1186  code_generator.codegen(order_entry_expr, true, co).front(), chosen_bytes * 8);
1187  const uint32_t n =
1189  std::string fname = "get_bin_from_k_heap";
1190  const auto& oe_ti = order_entry_expr->get_type_info();
1191  llvm::Value* null_key_lv = nullptr;
1192  if (oe_ti.is_integer() || oe_ti.is_decimal() || oe_ti.is_time()) {
1193  const size_t bit_width = order_entry_lv->getType()->getIntegerBitWidth();
1194  switch (bit_width) {
1195  case 32:
1196  null_key_lv = LL_INT(static_cast<int32_t>(inline_int_null_val(oe_ti)));
1197  break;
1198  case 64:
1199  null_key_lv = LL_INT(static_cast<int64_t>(inline_int_null_val(oe_ti)));
1200  break;
1201  default:
1202  CHECK(false);
1203  }
1204  fname += "_int" + std::to_string(bit_width) + "_t";
1205  } else {
1206  CHECK(oe_ti.is_fp());
1207  if (order_entry_lv->getType()->isDoubleTy()) {
1208  null_key_lv = LL_FP(static_cast<double>(inline_fp_null_val(oe_ti)));
1209  } else {
1210  null_key_lv = LL_FP(static_cast<float>(inline_fp_null_val(oe_ti)));
1211  }
1212  fname += order_entry_lv->getType()->isDoubleTy() ? "_double" : "_float";
1213  }
1214  const auto key_slot_idx =
1216  return emitCall(
1217  fname,
1218  {groups_buffer,
1219  LL_INT(n),
1220  LL_INT(row_size_quad),
1221  LL_INT(static_cast<uint32_t>(query_mem_desc.getColOffInBytes(key_slot_idx))),
1222  LL_BOOL(only_order_entry.is_desc),
1223  LL_BOOL(!order_entry_expr->get_type_info().get_notnull()),
1224  LL_BOOL(only_order_entry.nulls_first),
1225  null_key_lv,
1226  order_entry_lv});
1227  } else {
1228  auto* arg = get_arg_by_name(ROW_FUNC, "max_matched");
1229  const auto output_buffer_entry_count_lv =
1230  LL_BUILDER.CreateLoad(arg->getType()->getPointerElementType(), arg);
1231  arg = get_arg_by_name(ROW_FUNC, "old_total_matched");
1232  const auto group_expr_lv =
1233  LL_BUILDER.CreateLoad(arg->getType()->getPointerElementType(), arg);
1234  std::vector<llvm::Value*> args{groups_buffer,
1235  output_buffer_entry_count_lv,
1236  group_expr_lv,
1237  code_generator.posArg(nullptr)};
1238  if (query_mem_desc.didOutputColumnar()) {
1239  const auto columnar_output_offset =
1240  emitCall("get_columnar_scan_output_offset", args);
1241  return columnar_output_offset;
1242  }
1243  args.push_back(LL_INT(row_size_quad));
1244  return emitCall("get_scan_output_slot", args);
1245  }
1246 }
1247 
1248 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenGroupBy(
1250  const CompilationOptions& co,
1251  DiamondCodegen& diamond_codegen) {
1252  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1253  auto arg_it = ROW_FUNC->arg_begin();
1254  auto groups_buffer = arg_it++;
1255 
1256  std::stack<llvm::BasicBlock*> array_loops;
1257 
1258  // TODO(Saman): move this logic outside of this function.
1259  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1260  if (query_mem_desc.didOutputColumnar()) {
1261  return std::make_tuple(
1262  &*groups_buffer,
1263  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen));
1264  } else {
1265  return std::make_tuple(
1266  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen),
1267  nullptr);
1268  }
1269  }
1270 
1271  CHECK(query_mem_desc.getQueryDescriptionType() ==
1272  QueryDescriptionType::GroupByBaselineHash ||
1273  query_mem_desc.getQueryDescriptionType() ==
1274  QueryDescriptionType::GroupByPerfectHash);
1275 
1276  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1277  ? 0
1278  : query_mem_desc.getRowSize() / sizeof(int64_t);
1279 
1280  const auto col_width_size = query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1281  ? sizeof(int64_t)
1282  : query_mem_desc.getEffectiveKeyWidth();
1283  // for multi-column group by
1284  llvm::Value* group_key = nullptr;
1285  llvm::Value* key_size_lv = nullptr;
1286 
1287  if (!query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1288  key_size_lv = LL_INT(static_cast<int32_t>(query_mem_desc.getGroupbyColCount()));
1289  if (query_mem_desc.getQueryDescriptionType() ==
1290  QueryDescriptionType::GroupByPerfectHash) {
1291  group_key =
1292  LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1293  } else if (query_mem_desc.getQueryDescriptionType() ==
1294  QueryDescriptionType::GroupByBaselineHash) {
1295  group_key =
1296  col_width_size == sizeof(int32_t)
1297  ? LL_BUILDER.CreateAlloca(llvm::Type::getInt32Ty(LL_CONTEXT), key_size_lv)
1298  : LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1299  }
1300  CHECK(group_key);
1301  CHECK(key_size_lv);
1302  }
1303 
1304  int32_t subkey_idx = 0;
1305  CHECK(query_mem_desc.getGroupbyColCount() == ra_exe_unit_.groupby_exprs.size());
1306  for (const auto& group_expr : ra_exe_unit_.groupby_exprs) {
1307  const auto col_range_info =
1308  get_expr_range_info(ra_exe_unit_, query_infos_, group_expr.get(), executor_);
1309  const auto translated_null_value = static_cast<int64_t>(
1310  query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1311  ? checked_int64_t(query_mem_desc.getMaxVal()) +
1312  (query_mem_desc.getBucket() ? query_mem_desc.getBucket() : 1)
1313  : checked_int64_t(col_range_info.max) +
1314  (col_range_info.bucket ? col_range_info.bucket : 1));
1315 
1316  const bool col_has_nulls =
1317  query_mem_desc.getQueryDescriptionType() ==
1318  QueryDescriptionType::GroupByPerfectHash
1319  ? (query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1320  ? query_mem_desc.hasNulls()
1321  : col_range_info.has_nulls)
1322  : false;
1323 
1324  const auto group_expr_lvs =
1325  executor_->groupByColumnCodegen(group_expr.get(),
1326  col_width_size,
1327  co,
1328  col_has_nulls,
1329  translated_null_value,
1330  diamond_codegen,
1331  array_loops,
1332  query_mem_desc.threadsShareMemory());
1333  const auto group_expr_lv = group_expr_lvs.translated_value;
1334  if (query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1335  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1336  return codegenSingleColumnPerfectHash(query_mem_desc,
1337  co,
1338  &*groups_buffer,
1339  group_expr_lv,
1340  group_expr_lvs.original_value,
1341  row_size_quad);
1342  } else {
1343  // store the sub-key to the buffer
1344  LL_BUILDER.CreateStore(
1345  group_expr_lv,
1346  LL_BUILDER.CreateGEP(
1347  group_key->getType()->getScalarType()->getPointerElementType(),
1348  group_key,
1349  LL_INT(subkey_idx++)));
1350  }
1351  }
1352  if (query_mem_desc.getQueryDescriptionType() ==
1353  QueryDescriptionType::GroupByPerfectHash) {
1354  CHECK(ra_exe_unit_.groupby_exprs.size() != 1);
1355  return codegenMultiColumnPerfectHash(
1356  &*groups_buffer, group_key, key_size_lv, query_mem_desc, row_size_quad);
1357  } else if (query_mem_desc.getQueryDescriptionType() ==
1358  QueryDescriptionType::GroupByBaselineHash) {
1359  return codegenMultiColumnBaselineHash(co,
1360  &*groups_buffer,
1361  group_key,
1362  key_size_lv,
1363  query_mem_desc,
1364  col_width_size,
1365  row_size_quad);
1366  }
1367  CHECK(false);
1368  return std::make_tuple(nullptr, nullptr);
1369 }
1370 
1371 llvm::Value* GroupByAndAggregate::codegenVarlenOutputBuffer(
1372  const QueryMemoryDescriptor& query_mem_desc) {
1373  if (!query_mem_desc.hasVarlenOutput()) {
1374  return nullptr;
1375  }
1376 
1377  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1378  auto arg_it = ROW_FUNC->arg_begin();
1379  arg_it++; /* groups_buffer */
1380  auto varlen_output_buffer = arg_it++;
1381  CHECK(varlen_output_buffer->getType() == llvm::Type::getInt64PtrTy(LL_CONTEXT));
1382  return varlen_output_buffer;
1383 }
1384 
1385 std::tuple<llvm::Value*, llvm::Value*>
1386 GroupByAndAggregate::codegenSingleColumnPerfectHash(
1387  const QueryMemoryDescriptor& query_mem_desc,
1388  const CompilationOptions& co,
1389  llvm::Value* groups_buffer,
1390  llvm::Value* group_expr_lv_translated,
1391  llvm::Value* group_expr_lv_original,
1392  const int32_t row_size_quad) {
1393  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1394  CHECK(query_mem_desc.usesGetGroupValueFast());
1395  std::string get_group_fn_name{query_mem_desc.didOutputColumnar()
1396  ? "get_columnar_group_bin_offset"
1397  : "get_group_value_fast"};
1398  if (!query_mem_desc.didOutputColumnar() && query_mem_desc.hasKeylessHash()) {
1399  get_group_fn_name += "_keyless";
1400  }
1401  if (query_mem_desc.interleavedBins(co.device_type)) {
1402  CHECK(!query_mem_desc.didOutputColumnar());
1403  CHECK(query_mem_desc.hasKeylessHash());
1404  get_group_fn_name += "_semiprivate";
1405  }
1406  std::vector<llvm::Value*> get_group_fn_args{&*groups_buffer,
1407  &*group_expr_lv_translated};
1408  if (group_expr_lv_original && get_group_fn_name == "get_group_value_fast" &&
1409  query_mem_desc.mustUseBaselineSort()) {
1410  get_group_fn_name += "_with_original_key";
1411  get_group_fn_args.push_back(group_expr_lv_original);
1412  }
1413  get_group_fn_args.push_back(LL_INT(query_mem_desc.getMinVal()));
1414  get_group_fn_args.push_back(LL_INT(query_mem_desc.getBucket()));
1415  if (!query_mem_desc.hasKeylessHash()) {
1416  if (!query_mem_desc.didOutputColumnar()) {
1417  get_group_fn_args.push_back(LL_INT(row_size_quad));
1418  }
1419  } else {
1420  if (!query_mem_desc.didOutputColumnar()) {
1421  get_group_fn_args.push_back(LL_INT(row_size_quad));
1422  }
1423  if (query_mem_desc.interleavedBins(co.device_type)) {
1424  auto warp_idx = emitCall("thread_warp_idx", {LL_INT(executor_->warpSize())});
1425  get_group_fn_args.push_back(warp_idx);
1426  get_group_fn_args.push_back(LL_INT(executor_->warpSize()));
1427  }
1428  }
1429  if (get_group_fn_name == "get_columnar_group_bin_offset") {
1430  return std::make_tuple(&*groups_buffer,
1431  emitCall(get_group_fn_name, get_group_fn_args));
1432  }
1433  return std::make_tuple(emitCall(get_group_fn_name, get_group_fn_args), nullptr);
1434 }
1435 
1436 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenMultiColumnPerfectHash(
1437  llvm::Value* groups_buffer,
1438  llvm::Value* group_key,
1439  llvm::Value* key_size_lv,
1440  const QueryMemoryDescriptor& query_mem_desc,
1441  const int32_t row_size_quad) {
1442  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1443  CHECK(query_mem_desc.getQueryDescriptionType() ==
1445  // compute the index (perfect hash)
1446  auto perfect_hash_func = codegenPerfectHashFunction();
1447  auto hash_lv =
1448  LL_BUILDER.CreateCall(perfect_hash_func, std::vector<llvm::Value*>{group_key});
1449 
1450  if (query_mem_desc.didOutputColumnar()) {
1451  if (!query_mem_desc.hasKeylessHash()) {
1452  const std::string set_matching_func_name{
1453  "set_matching_group_value_perfect_hash_columnar"};
1454  const std::vector<llvm::Value*> set_matching_func_arg{
1455  groups_buffer,
1456  hash_lv,
1457  group_key,
1458  key_size_lv,
1459  llvm::ConstantInt::get(get_int_type(32, LL_CONTEXT),
1460  query_mem_desc.getEntryCount())};
1461  emitCall(set_matching_func_name, set_matching_func_arg);
1462  }
1463  return std::make_tuple(groups_buffer, hash_lv);
1464  } else {
1465  if (query_mem_desc.hasKeylessHash()) {
1466  return std::make_tuple(emitCall("get_matching_group_value_perfect_hash_keyless",
1467  {groups_buffer, hash_lv, LL_INT(row_size_quad)}),
1468  nullptr);
1469  } else {
1470  return std::make_tuple(
1471  emitCall(
1472  "get_matching_group_value_perfect_hash",
1473  {groups_buffer, hash_lv, group_key, key_size_lv, LL_INT(row_size_quad)}),
1474  nullptr);
1475  }
1476  }
1477 }
1478 
1479 std::tuple<llvm::Value*, llvm::Value*>
1480 GroupByAndAggregate::codegenMultiColumnBaselineHash(
1481  const CompilationOptions& co,
1482  llvm::Value* groups_buffer,
1483  llvm::Value* group_key,
1484  llvm::Value* key_size_lv,
1485  const QueryMemoryDescriptor& query_mem_desc,
1486  const size_t key_width,
1487  const int32_t row_size_quad) {
1488  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1489  if (group_key->getType() != llvm::Type::getInt64PtrTy(LL_CONTEXT)) {
1490  CHECK(key_width == sizeof(int32_t));
1491  group_key =
1492  LL_BUILDER.CreatePointerCast(group_key, llvm::Type::getInt64PtrTy(LL_CONTEXT));
1493  }
1494  std::vector<llvm::Value*> func_args{
1495  groups_buffer,
1496  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1497  &*group_key,
1498  &*key_size_lv,
1499  LL_INT(static_cast<int32_t>(key_width))};
1500  std::string func_name{"get_group_value"};
1501  if (query_mem_desc.didOutputColumnar()) {
1502  func_name += "_columnar_slot";
1503  } else {
1504  func_args.push_back(LL_INT(row_size_quad));
1505  }
1506  if (co.with_dynamic_watchdog) {
1507  func_name += "_with_watchdog";
1508  }
1509  if (query_mem_desc.didOutputColumnar()) {
1510  return std::make_tuple(groups_buffer, emitCall(func_name, func_args));
1511  } else {
1512  return std::make_tuple(emitCall(func_name, func_args), nullptr);
1513  }
1514 }
1515 
1516 llvm::Function* GroupByAndAggregate::codegenPerfectHashFunction() {
1517  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1518  CHECK_GT(ra_exe_unit_.groupby_exprs.size(), size_t(1));
1519  auto ft = llvm::FunctionType::get(
1520  get_int_type(32, LL_CONTEXT),
1521  std::vector<llvm::Type*>{llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)},
1522  false);
1523  auto key_hash_func = llvm::Function::Create(ft,
1524  llvm::Function::ExternalLinkage,
1525  "perfect_key_hash",
1526  executor_->cgen_state_->module_);
1527  executor_->cgen_state_->helper_functions_.push_back(key_hash_func);
1528  mark_function_always_inline(key_hash_func);
1529  auto& key_buff_arg = *key_hash_func->args().begin();
1530  llvm::Value* key_buff_lv = &key_buff_arg;
1531  auto bb = llvm::BasicBlock::Create(LL_CONTEXT, "entry", key_hash_func);
1532  llvm::IRBuilder<> key_hash_func_builder(bb);
1533  llvm::Value* hash_lv{llvm::ConstantInt::get(get_int_type(64, LL_CONTEXT), 0)};
1534  std::vector<int64_t> cardinalities;
1535  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1536  auto col_range_info =
1537  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1538  CHECK(col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash);
1539  cardinalities.push_back(getBucketedCardinality(col_range_info));
1540  }
1541  size_t dim_idx = 0;
1542  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1543  auto* gep = key_hash_func_builder.CreateGEP(
1544  key_buff_lv->getType()->getScalarType()->getPointerElementType(),
1545  key_buff_lv,
1546  LL_INT(dim_idx));
1547  auto key_comp_lv =
1548  key_hash_func_builder.CreateLoad(gep->getType()->getPointerElementType(), gep);
1549  auto col_range_info =
1550  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1551  auto crt_term_lv =
1552  key_hash_func_builder.CreateSub(key_comp_lv, LL_INT(col_range_info.min));
1553  if (col_range_info.bucket) {
1554  crt_term_lv =
1555  key_hash_func_builder.CreateSDiv(crt_term_lv, LL_INT(col_range_info.bucket));
1556  }
1557  for (size_t prev_dim_idx = 0; prev_dim_idx < dim_idx; ++prev_dim_idx) {
1558  crt_term_lv = key_hash_func_builder.CreateMul(crt_term_lv,
1559  LL_INT(cardinalities[prev_dim_idx]));
1560  }
1561  hash_lv = key_hash_func_builder.CreateAdd(hash_lv, crt_term_lv);
1562  ++dim_idx;
1563  }
1564  key_hash_func_builder.CreateRet(
1565  key_hash_func_builder.CreateTrunc(hash_lv, get_int_type(32, LL_CONTEXT)));
1566  return key_hash_func;
1567 }
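// Worked example of the generated hash (hypothetical group-by columns): with key
// columns A in [0, 9] (cardinality 10) and B in [100, 104] (cardinality 5), the
// emitted perfect_key_hash computes (a - 0) + (b - 100) * 10, i.e. a row-major
// index into a 10 x 5 grid, later truncated to 32 bits.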
1568 
1569 llvm::Value* GroupByAndAggregate::convertNullIfAny(const SQLTypeInfo& arg_type,
1570  const TargetInfo& agg_info,
1571  llvm::Value* target) {
1572  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1573  const auto& agg_type = agg_info.sql_type;
1574  const size_t chosen_bytes = agg_type.get_size();
1575 
1576  bool need_conversion{false};
1577  llvm::Value* arg_null{nullptr};
1578  llvm::Value* agg_null{nullptr};
1579  llvm::Value* target_to_cast{target};
1580  if (arg_type.is_fp()) {
1581  arg_null = executor_->cgen_state_->inlineFpNull(arg_type);
1582  if (agg_type.is_fp()) {
1583  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1584  if (!static_cast<llvm::ConstantFP*>(arg_null)->isExactlyValue(
1585  static_cast<llvm::ConstantFP*>(agg_null)->getValueAPF())) {
1586  need_conversion = true;
1587  }
1588  } else {
1589  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
1590  return target;
1591  }
1592  } else {
1593  arg_null = executor_->cgen_state_->inlineIntNull(arg_type);
1594  if (agg_type.is_fp()) {
1595  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1596  need_conversion = true;
1597  target_to_cast = executor_->castToFP(target, arg_type, agg_type);
1598  } else {
1599  agg_null = executor_->cgen_state_->inlineIntNull(agg_type);
1600  if ((static_cast<llvm::ConstantInt*>(arg_null)->getBitWidth() !=
1601  static_cast<llvm::ConstantInt*>(agg_null)->getBitWidth()) ||
1602  (static_cast<llvm::ConstantInt*>(arg_null)->getValue() !=
1603  static_cast<llvm::ConstantInt*>(agg_null)->getValue())) {
1604  need_conversion = true;
1605  }
1606  }
1607  }
1608  if (need_conversion) {
1609  auto cmp = arg_type.is_fp() ? LL_BUILDER.CreateFCmpOEQ(target, arg_null)
1610  : LL_BUILDER.CreateICmpEQ(target, arg_null);
1611  return LL_BUILDER.CreateSelect(
1612  cmp,
1613  agg_null,
1614  executor_->cgen_state_->castToTypeIn(target_to_cast, chosen_bytes << 3));
1615  } else {
1616  return target;
1617  }
1618 }
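The conversion above is needed because each physical width and type family has its own null sentinel, so widening the argument value alone would not map the argument's null onto the aggregate's null. A minimal sketch of the int-to-wider-int case, assuming the engine's convention of using the type's minimum value as the integer null sentinel (as inline_int_null_val does for standard integer types); the helper name is hypothetical:

    #include <cstdint>
    #include <limits>

    int64_t convert_null_if_any_sketch(int32_t arg_val) {
      constexpr int32_t arg_null = std::numeric_limits<int32_t>::min();  // 32-bit sentinel
      constexpr int64_t agg_null = std::numeric_limits<int64_t>::min();  // 64-bit sentinel
      // Plain sign extension would turn arg_null into -2147483648, not agg_null,
      // so nulls are remapped explicitly -- the same select the generated IR emits.
      return arg_val == arg_null ? agg_null : static_cast<int64_t>(arg_val);
    }
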
1619 
1620 llvm::Value* GroupByAndAggregate::codegenWindowRowPointer(
1621  const Analyzer::WindowFunction* window_func,
1622  const QueryMemoryDescriptor& query_mem_desc,
1623  const CompilationOptions& co,
1624  DiamondCodegen& diamond_codegen) {
1625  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1626  const auto window_func_context =
1627  WindowProjectNodeContext::getActiveWindowFunctionContext(executor_);
1628  if (window_func_context && window_function_is_aggregate(window_func->getKind())) {
1629  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1630  ? 0
1631  : query_mem_desc.getRowSize() / sizeof(int64_t);
1632  auto arg_it = ROW_FUNC->arg_begin();
1633  auto groups_buffer = arg_it++;
1634  CodeGenerator code_generator(executor_);
1635  auto window_pos_lv = code_generator.codegenWindowPosition(
1636  window_func_context, code_generator.posArg(nullptr));
1637  const auto pos_in_window =
1638  LL_BUILDER.CreateTrunc(window_pos_lv, get_int_type(32, LL_CONTEXT));
1639  llvm::Value* entry_count_lv =
1640  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount()));
1641  std::vector<llvm::Value*> args{
1642  &*groups_buffer, entry_count_lv, pos_in_window, code_generator.posArg(nullptr)};
1643  if (query_mem_desc.didOutputColumnar()) {
1644  const auto columnar_output_offset =
1645  emitCall("get_columnar_scan_output_offset", args);
1646  return LL_BUILDER.CreateSExt(columnar_output_offset, get_int_type(64, LL_CONTEXT));
1647  }
1648  args.push_back(LL_INT(row_size_quad));
1649  return emitCall("get_scan_output_slot", args);
1650  }
1651  auto arg_it = ROW_FUNC->arg_begin();
1652  auto groups_buffer = arg_it++;
1653  return codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen);
1654 }
1655 
1656 bool GroupByAndAggregate::codegenAggCalls(
1657  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx_in,
1658  llvm::Value* varlen_output_buffer,
1659  const std::vector<llvm::Value*>& agg_out_vec,
1660  QueryMemoryDescriptor& query_mem_desc,
1661  const CompilationOptions& co,
1662  const GpuSharedMemoryContext& gpu_smem_context,
1663  DiamondCodegen& diamond_codegen) {
1664  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1665  auto agg_out_ptr_w_idx = agg_out_ptr_w_idx_in;
1666  // TODO(alex): unify the two cases, the output for non-group by queries
1667  // should be a contiguous buffer
1668  const bool is_group_by = std::get<0>(agg_out_ptr_w_idx);
1669  bool can_return_error = false;
1670  if (is_group_by) {
1671  CHECK(agg_out_vec.empty());
1672  } else {
1673  CHECK(!agg_out_vec.empty());
1674  }
1675 
1676  // the output buffer is cast into a byte stream to be able to handle data elements
1677  // of different sizes (only used when actual column width sizes are used)
1678  llvm::Value* output_buffer_byte_stream{nullptr};
1679  llvm::Value* out_row_idx{nullptr};
1680  if (query_mem_desc.didOutputColumnar() && !g_cluster &&
1681  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1682  output_buffer_byte_stream = LL_BUILDER.CreateBitCast(
1683  std::get<0>(agg_out_ptr_w_idx),
1684  llvm::PointerType::get(llvm::Type::getInt8Ty(LL_CONTEXT), 0));
1685  output_buffer_byte_stream->setName("out_buff_b_stream");
1686  CHECK(std::get<1>(agg_out_ptr_w_idx));
1687  out_row_idx = LL_BUILDER.CreateZExt(std::get<1>(agg_out_ptr_w_idx),
1688  llvm::Type::getInt64Ty(LL_CONTEXT));
1689  out_row_idx->setName("out_row_idx");
1690  }
1691 
1692  TargetExprCodegenBuilder target_builder(ra_exe_unit_, is_group_by);
1693  for (size_t target_idx = 0; target_idx < ra_exe_unit_.target_exprs.size();
1694  ++target_idx) {
1695  auto target_expr = ra_exe_unit_.target_exprs[target_idx];
1696  CHECK(target_expr);
1697 
1698  target_builder(target_expr, executor_, query_mem_desc, co);
1699  }
1700 
1701  target_builder.codegen(this,
1702  executor_,
1703  query_mem_desc,
1704  co,
1705  gpu_smem_context,
1706  agg_out_ptr_w_idx,
1707  agg_out_vec,
1708  output_buffer_byte_stream,
1709  out_row_idx,
1710  varlen_output_buffer,
1711  diamond_codegen);
1712 
1713  return can_return_error;
1714 }
1715 
1716 /**
1717  * @brief: returns the pointer to where the aggregation should be stored.
1718  */
1719 llvm::Value* GroupByAndAggregate::codegenAggColumnPtr(
1720  llvm::Value* output_buffer_byte_stream,
1721  llvm::Value* out_row_idx,
1722  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx,
1723  const QueryMemoryDescriptor& query_mem_desc,
1724  const size_t chosen_bytes,
1725  const size_t agg_out_off,
1726  const size_t target_idx) {
1727  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1728  llvm::Value* agg_col_ptr{nullptr};
1729  if (query_mem_desc.didOutputColumnar()) {
1730  // TODO(Saman): remove the second columnar branch, and support all query description
1731  // types through the first branch. Then, input arguments should also be cleaned up
1732  if (!g_cluster &&
1733  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1734  CHECK(chosen_bytes == 1 || chosen_bytes == 2 || chosen_bytes == 4 ||
1735  chosen_bytes == 8);
1736  CHECK(output_buffer_byte_stream);
1737  CHECK(out_row_idx);
1738  size_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1739  // multiplying by chosen_bytes, i.e., << log2(chosen_bytes)
1740  auto out_per_col_byte_idx =
1741 #ifdef _WIN32
1742  LL_BUILDER.CreateShl(out_row_idx, __lzcnt(chosen_bytes) - 1);
1743 #else
1744  LL_BUILDER.CreateShl(out_row_idx, __builtin_ffs(chosen_bytes) - 1);
1745 #endif
1746  auto byte_offset = LL_BUILDER.CreateAdd(out_per_col_byte_idx,
1747  LL_INT(static_cast<int64_t>(col_off)));
1748  byte_offset->setName("out_byte_off_target_" + std::to_string(target_idx));
1749  auto output_ptr = LL_BUILDER.CreateGEP(
1750  output_buffer_byte_stream->getType()->getScalarType()->getPointerElementType(),
1751  output_buffer_byte_stream,
1752  byte_offset);
1753  agg_col_ptr = LL_BUILDER.CreateBitCast(
1754  output_ptr,
1755  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1756  agg_col_ptr->setName("out_ptr_target_" + std::to_string(target_idx));
1757  } else {
1758  auto const col_off_in_bytes = query_mem_desc.getColOffInBytes(agg_out_off);
1759  auto const col_off = col_off_in_bytes / chosen_bytes;
1760  auto const col_rem = col_off_in_bytes % chosen_bytes;
1761  CHECK_EQ(col_rem, 0u) << col_off_in_bytes << " % " << chosen_bytes;
1762  CHECK(std::get<1>(agg_out_ptr_w_idx));
1763  auto* agg_out_idx = LL_BUILDER.CreateZExt(
1764  std::get<1>(agg_out_ptr_w_idx),
1765  get_int_type(8 * sizeof(col_off), executor_->cgen_state_->context_));
1766  auto* offset = LL_BUILDER.CreateAdd(agg_out_idx, LL_INT(col_off));
1767  auto* bit_cast = LL_BUILDER.CreateBitCast(
1768  std::get<0>(agg_out_ptr_w_idx),
1769  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1770  agg_col_ptr = LL_BUILDER.CreateGEP(
1771  bit_cast->getType()->getScalarType()->getPointerElementType(),
1772  bit_cast,
1773  offset);
1774  }
1775  } else {
1776  auto const col_off_in_bytes = query_mem_desc.getColOnlyOffInBytes(agg_out_off);
1777  auto const col_off = col_off_in_bytes / chosen_bytes;
1778  auto const col_rem = col_off_in_bytes % chosen_bytes;
1779  CHECK_EQ(col_rem, 0u) << col_off_in_bytes << " % " << chosen_bytes;
1780  auto* bit_cast = LL_BUILDER.CreateBitCast(
1781  std::get<0>(agg_out_ptr_w_idx),
1782  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1783  agg_col_ptr = LL_BUILDER.CreateGEP(
1784  bit_cast->getType()->getScalarType()->getPointerElementType(),
1785  bit_cast,
1786  LL_INT(col_off));
1787  }
1788  CHECK(agg_col_ptr);
1789  return agg_col_ptr;
1790 }
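In the columnar projection branch the slot address reduces to the column's byte offset plus the row index scaled by the slot width; since chosen_bytes is checked to be a power of two, the scaling is emitted as a left shift. A host-side sketch of the same address computation (hypothetical helper; uses the GCC/Clang __builtin_ffs intrinsic, as the non-Windows branch above does):

    #include <cstddef>
    #include <cstdint>

    int8_t* agg_col_ptr_sketch(int8_t* output_buffer_byte_stream,
                               size_t col_off_in_bytes,
                               size_t out_row_idx,
                               size_t chosen_bytes) {  // 1, 2, 4 or 8
      const int shift = __builtin_ffs(static_cast<int>(chosen_bytes)) - 1;  // log2(chosen_bytes)
      return output_buffer_byte_stream + col_off_in_bytes + (out_row_idx << shift);
    }
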
1791 
1792 void GroupByAndAggregate::codegenEstimator(std::stack<llvm::BasicBlock*>& array_loops,
1793  DiamondCodegen& diamond_codegen,
1794  const QueryMemoryDescriptor& query_mem_desc,
1795  const CompilationOptions& co) {
1796  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1797  const auto& estimator_arg = ra_exe_unit_.estimator->getArgument();
1798  auto estimator_comp_count_lv = LL_INT(static_cast<int32_t>(estimator_arg.size()));
1799  auto estimator_key_lv = LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT),
1800  estimator_comp_count_lv);
1801  int32_t subkey_idx = 0;
1802  for (const auto& estimator_arg_comp : estimator_arg) {
1803  const auto estimator_arg_comp_lvs =
1804  executor_->groupByColumnCodegen(estimator_arg_comp.get(),
1805  query_mem_desc.getEffectiveKeyWidth(),
1806  co,
1807  false,
1808  0,
1809  diamond_codegen,
1810  array_loops,
1811  true);
1812  CHECK(!estimator_arg_comp_lvs.original_value);
1813  const auto estimator_arg_comp_lv = estimator_arg_comp_lvs.translated_value;
1814  // store the sub-key to the buffer
1815  LL_BUILDER.CreateStore(
1816  estimator_arg_comp_lv,
1817  LL_BUILDER.CreateGEP(
1818  estimator_key_lv->getType()->getScalarType()->getPointerElementType(),
1819  estimator_key_lv,
1820  LL_INT(subkey_idx++)));
1821  }
1822  const auto int8_ptr_ty = llvm::PointerType::get(get_int_type(8, LL_CONTEXT), 0);
1823  const auto bitmap = LL_BUILDER.CreateBitCast(&*ROW_FUNC->arg_begin(), int8_ptr_ty);
1824  const auto key_bytes = LL_BUILDER.CreateBitCast(estimator_key_lv, int8_ptr_ty);
1825  const auto estimator_comp_bytes_lv =
1826  LL_INT(static_cast<int32_t>(estimator_arg.size() * sizeof(int64_t)));
1827  const auto bitmap_size_lv =
1828  LL_INT(static_cast<uint32_t>(ra_exe_unit_.estimator->getBufferSize()));
1829  emitCall(ra_exe_unit_.estimator->getRuntimeFunctionName(),
1830  {bitmap, &*bitmap_size_lv, key_bytes, &*estimator_comp_bytes_lv});
1831 }
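codegenEstimator packs the estimator sub-keys into a contiguous int64_t buffer and hands them, as raw bytes, to the estimator's runtime function together with the bitmap buffer and its size. A sketch of that calling convention (the function-pointer signature here is an assumption made for illustration; the real function name comes from ra_exe_unit_.estimator->getRuntimeFunctionName()):

    #include <cstdint>
    #include <vector>

    void call_estimator_sketch(void (*estimator_fn)(int8_t*, uint32_t, int8_t*, int32_t),
                               int8_t* bitmap,
                               uint32_t bitmap_size,
                               const std::vector<int64_t>& sub_keys) {
      // The key bytes are just the packed 64-bit sub-keys, matching the argument
      // order used in the emitCall above: bitmap, bitmap size, key bytes, key byte count.
      estimator_fn(bitmap,
                   bitmap_size,
                   reinterpret_cast<int8_t*>(const_cast<int64_t*>(sub_keys.data())),
                   static_cast<int32_t>(sub_keys.size() * sizeof(int64_t)));
    }
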
1832 
1833 extern "C" RUNTIME_EXPORT void agg_count_distinct(int64_t* agg, const int64_t val) {
1834  reinterpret_cast<CountDistinctSet*>(*agg)->insert(val);
1835 }
1836 
1837 extern "C" RUNTIME_EXPORT void agg_count_distinct_skip_val(int64_t* agg,
1838  const int64_t val,
1839  const int64_t skip_val) {
1840  if (val != skip_val) {
1841  agg_count_distinct(agg, val);
1842  }
1843 }
1844 
1845 extern "C" RUNTIME_EXPORT void agg_approx_quantile(int64_t* agg, const double val) {
1846  auto* t_digest = reinterpret_cast<quantile::TDigest*>(*agg);
1847  t_digest->allocate();
1848  t_digest->add(val);
1849 }
1850 
1851 extern "C" RUNTIME_EXPORT void agg_mode_func(int64_t* agg, const int64_t val) {
1852  auto* mode_map = reinterpret_cast<AggMode*>(*agg);
1853  mode_map->add(val);
1854 }
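These RUNTIME_EXPORT helpers all rely on the same convention: the 64-bit aggregate slot holds a pointer to a host-side container, so the generated code only passes the slot address and the value. A self-contained sketch of that convention, with std::unordered_set standing in for the engine's CountDistinctSet:

    #include <cstdint>
    #include <unordered_set>

    void agg_count_distinct_sketch(int64_t* agg, const int64_t val) {
      reinterpret_cast<std::unordered_set<int64_t>*>(*agg)->insert(val);
    }

    int main() {
      std::unordered_set<int64_t> distinct_vals;
      int64_t slot = reinterpret_cast<int64_t>(&distinct_vals);  // what the engine stores in the slot
      agg_count_distinct_sketch(&slot, 7);
      agg_count_distinct_sketch(&slot, 7);
      agg_count_distinct_sketch(&slot, 9);
      return distinct_vals.size() == 2 ? 0 : 1;  // two distinct values
    }
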
1855 
1856 void GroupByAndAggregate::codegenCountDistinct(
1857  const size_t target_idx,
1858  const Analyzer::Expr* target_expr,
1859  std::vector<llvm::Value*>& agg_args,
1860  const QueryMemoryDescriptor& query_mem_desc,
1861  const ExecutorDeviceType device_type) {
1862  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1863  const auto agg_info = get_target_info(target_expr, g_bigint_count);
1864  const auto& arg_ti =
1865  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1866  if (arg_ti.is_fp()) {
1867  agg_args.back() = executor_->cgen_state_->ir_builder_.CreateBitCast(
1868  agg_args.back(), get_int_type(64, executor_->cgen_state_->context_));
1869  }
1870  const auto& count_distinct_descriptor =
1871  query_mem_desc.getCountDistinctDescriptor(target_idx);
1872  CHECK(count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid);
1873  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
1874  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1875  agg_args.push_back(LL_INT(int32_t(count_distinct_descriptor.bitmap_sz_bits)));
1876  if (device_type == ExecutorDeviceType::GPU) {
1877  const auto base_dev_addr = getAdditionalLiteral(-1);
1878  const auto base_host_addr = getAdditionalLiteral(-2);
1879  agg_args.push_back(base_dev_addr);
1880  agg_args.push_back(base_host_addr);
1881  emitCall("agg_approximate_count_distinct_gpu", agg_args);
1882  } else {
1883  emitCall("agg_approximate_count_distinct", agg_args);
1884  }
1885  return;
1886  }
1887  std::string agg_fname{"agg_count_distinct"};
1888  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1889  agg_fname += "_bitmap";
1890  agg_args.push_back(LL_INT(count_distinct_descriptor.min_val));
1891  agg_args.push_back(LL_INT(count_distinct_descriptor.bucket_size));
1892  }
1893  if (agg_info.skip_null_val) {
1894  auto null_lv = executor_->cgen_state_->castToTypeIn(
1895  (arg_ti.is_fp()
1896  ? static_cast<llvm::Value*>(executor_->cgen_state_->inlineFpNull(arg_ti))
1897  : static_cast<llvm::Value*>(executor_->cgen_state_->inlineIntNull(arg_ti))),
1898  64);
1899  null_lv = executor_->cgen_state_->ir_builder_.CreateBitCast(
1900  null_lv, get_int_type(64, executor_->cgen_state_->context_));
1901  agg_fname += "_skip_val";
1902  agg_args.push_back(null_lv);
1903  }
1904  if (device_type == ExecutorDeviceType::GPU) {
1905  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1906  agg_fname += "_gpu";
1907  const auto base_dev_addr = getAdditionalLiteral(-1);
1908  const auto base_host_addr = getAdditionalLiteral(-2);
1909  agg_args.push_back(base_dev_addr);
1910  agg_args.push_back(base_host_addr);
1911  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.sub_bitmap_count)));
1912  CHECK_EQ(size_t(0),
1913  count_distinct_descriptor.bitmapPaddedSizeBytes() %
1914  count_distinct_descriptor.sub_bitmap_count);
1915  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.bitmapPaddedSizeBytes() /
1916  count_distinct_descriptor.sub_bitmap_count)));
1917  }
1918  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1919  emitCall(agg_fname, agg_args);
1920  } else {
1921  executor_->cgen_state_->emitExternalCall(
1922  agg_fname, llvm::Type::getVoidTy(LL_CONTEXT), agg_args);
1923  }
1924 }
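For the bitmap implementation the call is given min_val and bucket_size so the runtime can map a value onto a bit position; the GPU path additionally splits the bitmap into sub_bitmap_count slices of equal padded size. A simplified sketch of the value-to-bit mapping (no skip value, no GPU sub-bitmaps; the helper is hypothetical):

    #include <cstdint>
    #include <vector>

    void bitmap_count_distinct_sketch(std::vector<uint8_t>& bitmap,
                                      const int64_t val,
                                      const int64_t min_val,
                                      const int64_t bucket_size) {
      const uint64_t bit = (val - min_val) / (bucket_size ? bucket_size : 1);
      bitmap[bit >> 3] |= static_cast<uint8_t>(1u << (bit & 7));  // one bit per distinct bucket
    }
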
1925 
1926 void GroupByAndAggregate::codegenApproxQuantile(
1927  const size_t target_idx,
1928  const Analyzer::Expr* target_expr,
1929  std::vector<llvm::Value*>& agg_args,
1930  const QueryMemoryDescriptor& query_mem_desc,
1931  const ExecutorDeviceType device_type) {
1932  if (device_type == ExecutorDeviceType::GPU) {
1933  throw QueryMustRunOnCpu();
1934  }
1935  llvm::BasicBlock *calc, *skip{nullptr};
1936  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1937  auto const arg_ti =
1938  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1939  bool const nullable = !arg_ti.get_notnull();
1940 
1941  auto* cs = executor_->cgen_state_.get();
1942  auto& irb = cs->ir_builder_;
1943  if (nullable) {
1944  auto* const null_value = cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
1945  auto* const skip_cond = arg_ti.is_fp()
1946  ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
1947  : irb.CreateICmpEQ(agg_args.back(), null_value);
1948  calc = llvm::BasicBlock::Create(cs->context_, "calc_approx_quantile");
1949  skip = llvm::BasicBlock::Create(cs->context_, "skip_approx_quantile");
1950  irb.CreateCondBr(skip_cond, skip, calc);
1951  cs->current_func_->getBasicBlockList().push_back(calc);
1952  irb.SetInsertPoint(calc);
1953  }
1954  if (!arg_ti.is_fp()) {
1955  auto const agg_info = get_target_info(target_expr, g_bigint_count);
1956  agg_args.back() = executor_->castToFP(agg_args.back(), arg_ti, agg_info.sql_type);
1957  }
1958  cs->emitExternalCall(
1959  "agg_approx_quantile", llvm::Type::getVoidTy(cs->context_), agg_args);
1960  if (nullable) {
1961  irb.CreateBr(skip);
1962  cs->current_func_->getBasicBlockList().push_back(skip);
1963  irb.SetInsertPoint(skip);
1964  }
1965 }
1966 
1967 void GroupByAndAggregate::codegenMode(const size_t target_idx,
1968  const Analyzer::Expr* target_expr,
1969  std::vector<llvm::Value*>& agg_args,
1970  const QueryMemoryDescriptor& query_mem_desc,
1971  const ExecutorDeviceType device_type) {
1972  if (device_type == ExecutorDeviceType::GPU) {
1973  throw QueryMustRunOnCpu();
1974  }
1975  llvm::BasicBlock *calc, *skip{nullptr};
1976  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1977  auto const arg_ti =
1978  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1979  bool const nullable = !arg_ti.get_notnull();
1980  bool const is_fp = arg_ti.is_fp();
1981  auto* cs = executor_->cgen_state_.get();
1982  auto& irb = cs->ir_builder_;
1983  if (nullable) {
1984  auto* const null_value =
1985  is_fp ? cs->inlineNull(arg_ti) : cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
1986  auto* const skip_cond = is_fp ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
1987  : irb.CreateICmpEQ(agg_args.back(), null_value);
1988  calc = llvm::BasicBlock::Create(cs->context_, "calc_mode");
1989  skip = llvm::BasicBlock::Create(cs->context_, "skip_mode");
1990  irb.CreateCondBr(skip_cond, skip, calc);
1991  cs->current_func_->getBasicBlockList().push_back(calc);
1992  irb.SetInsertPoint(calc);
1993  }
1994  if (is_fp) {
1995  auto* const int_type = get_int_type(8 * arg_ti.get_size(), cs->context_);
1996  agg_args.back() = irb.CreateBitCast(agg_args.back(), int_type);
1997  }
1998  // "agg_mode" collides with existing names, so non-standard suffix "_func" is added.
1999  cs->emitExternalCall("agg_mode_func", llvm::Type::getVoidTy(cs->context_), agg_args);
2000  if (nullable) {
2001  irb.CreateBr(skip);
2002  cs->current_func_->getBasicBlockList().push_back(skip);
2003  irb.SetInsertPoint(skip);
2004  }
2005 }
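codegenApproxQuantile and codegenMode emit the same null-skip diamond: compare the argument against its inline null sentinel, branch to a skip block on match, and otherwise fall into a calc block that (for floating-point MODE arguments) bit-casts the value before calling the runtime helper. The control flow, written as straight-line C++ (sketch only; the names are hypothetical):

    #include <cstdint>
    #include <cstring>

    void mode_update_sketch(void (*agg_mode_fn)(int64_t*, int64_t),
                            int64_t* agg_slot,
                            double val,
                            double null_sentinel) {
      if (val == null_sentinel) {
        return;  // "skip" block: null inputs do not contribute to the mode
      }
      int64_t bits;
      std::memcpy(&bits, &val, sizeof(bits));  // bit-cast, mirroring CreateBitCast above
      agg_mode_fn(agg_slot, bits);             // "calc" block
    }
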
2006 
2007 llvm::Value* GroupByAndAggregate::getAdditionalLiteral(const int32_t off) {
2008  CHECK_LT(off, 0);
2009  const auto lit_buff_lv = get_arg_by_name(ROW_FUNC, "literals");
2010  auto* bit_cast = LL_BUILDER.CreateBitCast(
2011  lit_buff_lv, llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0));
2012  auto* gep =
2013  LL_BUILDER.CreateGEP(bit_cast->getType()->getScalarType()->getPointerElementType(),
2014  bit_cast,
2015  LL_INT(off));
2016  return LL_BUILDER.CreateLoad(gep->getType()->getPointerElementType(), gep);
2017 }
2018 
2019 std::vector<llvm::Value*> GroupByAndAggregate::codegenAggArg(
2020  const Analyzer::Expr* target_expr,
2021  const CompilationOptions& co) {
2022  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2023  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
2024  const auto func_expr = dynamic_cast<const Analyzer::FunctionOper*>(target_expr);
2025  const auto arr_expr = dynamic_cast<const Analyzer::ArrayExpr*>(target_expr);
2026 
2027  // TODO(alex): handle arrays uniformly?
2028  CodeGenerator code_generator(executor_);
2029  if (target_expr) {
2030  const auto& target_ti = target_expr->get_type_info();
2031  if (target_ti.is_buffer() &&
2032  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
2033  const auto target_lvs =
2034  agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
2035  : code_generator.codegen(
2036  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
2037  if (!func_expr && !arr_expr) {
2038  // Anything read via the chunk transport here was generated from a source
2039  // other than an ARRAY[] expression
2040  if (target_ti.is_text_encoding_none()) {
2041  CHECK_EQ(size_t(3), target_lvs.size());
2042  return {target_lvs[1], target_lvs[2]};
2043  }
2044  CHECK(target_ti.is_array());
2045  CHECK_EQ(size_t(1), target_lvs.size());
2046  CHECK(!agg_expr || agg_expr->get_aggtype() == kSAMPLE);
2047  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
2048  const auto i8p_ty =
2049  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
2050  const auto& elem_ti = target_ti.get_elem_type();
2051  return {
2052  executor_->cgen_state_->emitExternalCall(
2053  "array_buff",
2054  i8p_ty,
2055  {target_lvs.front(), code_generator.posArg(target_expr)}),
2056  executor_->cgen_state_->emitExternalCall(
2057  "array_size",
2058  i32_ty,
2059  {target_lvs.front(),
2060  code_generator.posArg(target_expr),
2061  executor_->cgen_state_->llInt(log2_bytes(elem_ti.get_logical_size()))})};
2062  } else {
2063  if (agg_expr) {
2064  throw std::runtime_error(
2065  "Using array[] operator as argument to an aggregate operator is not "
2066  "supported");
2067  }
2068  CHECK(func_expr || arr_expr);
2069  if (dynamic_cast<const Analyzer::FunctionOper*>(target_expr)) {
2070  CHECK_EQ(size_t(1), target_lvs.size());
2071  const auto prefix = target_ti.get_buffer_name();
2072  CHECK(target_ti.is_array() || target_ti.is_text_encoding_none());
2073  const auto target_lv = LL_BUILDER.CreateLoad(
2074  target_lvs[0]->getType()->getPointerElementType(), target_lvs[0]);
2075  // const auto target_lv_type = target_lvs[0]->getType();
2076  // CHECK(target_lv_type->isStructTy());
2077  // CHECK_EQ(target_lv_type->getNumContainedTypes(), 3u);
2078  const auto i8p_ty = llvm::PointerType::get(
2079  get_int_type(8, executor_->cgen_state_->context_), 0);
2080  const auto ptr = LL_BUILDER.CreatePointerCast(
2081  LL_BUILDER.CreateExtractValue(target_lv, 0), i8p_ty);
2082  const auto size = LL_BUILDER.CreateExtractValue(target_lv, 1);
2083  const auto null_flag = LL_BUILDER.CreateExtractValue(target_lv, 2);
2084  const auto nullcheck_ok_bb =
2085  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_nullcheck_ok_bb", CUR_FUNC);
2086  const auto nullcheck_fail_bb = llvm::BasicBlock::Create(
2087  LL_CONTEXT, prefix + "_nullcheck_fail_bb", CUR_FUNC);
2088 
2089  // TODO(adb): probably better to zext the bool
2090  const auto nullcheck = LL_BUILDER.CreateICmpEQ(
2091  null_flag, executor_->cgen_state_->llInt(static_cast<int8_t>(1)));
2092  LL_BUILDER.CreateCondBr(nullcheck, nullcheck_fail_bb, nullcheck_ok_bb);
2093 
2094  const auto ret_bb =
2095  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_return", CUR_FUNC);
2096  LL_BUILDER.SetInsertPoint(ret_bb);
2097  auto result_phi = LL_BUILDER.CreatePHI(i8p_ty, 2, prefix + "_ptr_return");
2098  result_phi->addIncoming(ptr, nullcheck_ok_bb);
2099  const auto null_arr_sentinel = LL_BUILDER.CreateIntToPtr(
2100  executor_->cgen_state_->llInt(static_cast<int8_t>(0)), i8p_ty);
2101  result_phi->addIncoming(null_arr_sentinel, nullcheck_fail_bb);
2102  LL_BUILDER.SetInsertPoint(nullcheck_ok_bb);
2103  executor_->cgen_state_->emitExternalCall(
2104  "register_buffer_with_executor_rsm",
2105  llvm::Type::getVoidTy(executor_->cgen_state_->context_),
2106  {executor_->cgen_state_->llInt(reinterpret_cast<int64_t>(executor_)), ptr});
2107  LL_BUILDER.CreateBr(ret_bb);
2108  LL_BUILDER.SetInsertPoint(nullcheck_fail_bb);
2109  LL_BUILDER.CreateBr(ret_bb);
2110 
2111  LL_BUILDER.SetInsertPoint(ret_bb);
2112  return {result_phi, size};
2113  }
2114  CHECK_EQ(size_t(2), target_lvs.size());
2115  return {target_lvs[0], target_lvs[1]};
2116  }
2117  }
2118  if (target_ti.is_geometry() &&
2119  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
2120  auto generate_coord_lvs =
2121  [&](auto* selected_target_expr,
2122  bool const fetch_columns) -> std::vector<llvm::Value*> {
2123  const auto target_lvs =
2124  code_generator.codegen(selected_target_expr, fetch_columns, co);
2125  if (dynamic_cast<const Analyzer::GeoOperator*>(target_expr) &&
2126  target_expr->get_type_info().is_geometry()) {
2127  // return a pointer to the temporary alloca
2128  return target_lvs;
2129  }
2130  const auto geo_uoper = dynamic_cast<const Analyzer::GeoUOper*>(target_expr);
2131  const auto geo_binoper = dynamic_cast<const Analyzer::GeoBinOper*>(target_expr);
2132  if (geo_uoper || geo_binoper) {
2133  CHECK(target_expr->get_type_info().is_geometry());
2134  CHECK_EQ(2 * static_cast<size_t>(target_ti.get_physical_coord_cols()),
2135  target_lvs.size());
2136  return target_lvs;
2137  }
2138  CHECK_EQ(static_cast<size_t>(target_ti.get_physical_coord_cols()),
2139  target_lvs.size());
2140 
2141  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
2142  const auto i8p_ty =
2143  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
2144  std::vector<llvm::Value*> coords;
2145  size_t ctr = 0;
2146  for (const auto& target_lv : target_lvs) {
2147  // TODO(adb): consider adding a utility to sqltypes so we can get the types of
2148  // the physical coords cols based on the sqltype (e.g. TINYINT for col 0, INT
2149  // for col 1 for pols / mpolys, etc). Hardcoding for now. first array is the
2150  // coords array (TINYINT). Subsequent arrays are regular INT.
2151 
2152  const size_t elem_sz = ctr == 0 ? 1 : 4;
2153  ctr++;
2154  int32_t fixlen = -1;
2155  if (target_ti.get_type() == kPOINT) {
2156  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
2157  if (col_var) {
2158  const auto coords_cd = executor_->getPhysicalColumnDescriptor(col_var, 1);
2159  if (coords_cd && coords_cd->columnType.get_type() == kARRAY) {
2160  fixlen = coords_cd->columnType.get_size();
2161  }
2162  }
2163  }
2164  if (fixlen > 0) {
2165  coords.push_back(executor_->cgen_state_->emitExternalCall(
2166  "fast_fixlen_array_buff",
2167  i8p_ty,
2168  {target_lv, code_generator.posArg(selected_target_expr)}));
2169  auto fixed_len_lv = executor_->cgen_state_->emitExternalCall(
2170  "determine_fixed_array_len",
2171  llvm::IntegerType::get(code_generator.cgen_state_->context_, 64),
2172  {target_lv, executor_->cgen_state_->llInt(int64_t(fixlen))});
2173  coords.push_back(fixed_len_lv);
2174  continue;
2175  }
2176  coords.push_back(executor_->cgen_state_->emitExternalCall(
2177  "array_buff",
2178  i8p_ty,
2179  {target_lv, code_generator.posArg(selected_target_expr)}));
2180  coords.push_back(executor_->cgen_state_->emitExternalCall(
2181  "array_size",
2182  i32_ty,
2183  {target_lv,
2184  code_generator.posArg(selected_target_expr),
2185  executor_->cgen_state_->llInt(log2_bytes(elem_sz))}));
2186  }
2187  return coords;
2188  };
2189 
2190  if (agg_expr) {
2191  return generate_coord_lvs(agg_expr->get_arg(), true);
2192  } else {
2193  return generate_coord_lvs(target_expr,
2194  !executor_->plan_state_->allow_lazy_fetch_);
2195  }
2196  }
2197  }
2198  bool fetch_column = !executor_->plan_state_->allow_lazy_fetch_;
2199  return agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
2200  : code_generator.codegen(target_expr, fetch_column, co);
2201 }
2202 
2203 llvm::Value* GroupByAndAggregate::emitCall(const std::string& fname,
2204  const std::vector<llvm::Value*>& args) {
2205  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2206  return executor_->cgen_state_->emitCall(fname, args);
2207 }
2208 
2209 void GroupByAndAggregate::checkErrorCode(llvm::Value* retCode) {
2210  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2211  auto zero_const = llvm::ConstantInt::get(retCode->getType(), 0, true);
2212  auto rc_check_condition = executor_->cgen_state_->ir_builder_.CreateICmp(
2213  llvm::ICmpInst::ICMP_EQ, retCode, zero_const);
2214 
2215  executor_->cgen_state_->emitErrorCheck(rc_check_condition, retCode, "rc");
2216 }
2217 
2218 #undef CUR_FUNC
2219 #undef ROW_FUNC
2220 #undef LL_FP
2221 #undef LL_INT
2222 #undef LL_BOOL
2223 #undef LL_BUILDER
2224 #undef LL_CONTEXT
2225 
2226 size_t GroupByAndAggregate::shard_count_for_top_groups(
2227  const RelAlgExecutionUnit& ra_exe_unit) {
2228  if (ra_exe_unit.sort_info.order_entries.size() != 1 || !ra_exe_unit.sort_info.limit) {
2229  return 0;
2230  }
2231  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
2232  const auto grouped_col_expr =
2233  dynamic_cast<const Analyzer::ColumnVar*>(group_expr.get());
2234  if (!grouped_col_expr) {
2235  continue;
2236  }
2237  const auto& column_key = grouped_col_expr->getColumnKey();
2238  if (column_key.table_id <= 0) {
2239  return 0;
2240  }
2241  const auto td = Catalog_Namespace::get_metadata_for_table(
2242  {column_key.db_id, column_key.table_id});
2243  if (td->shardedColumnId == column_key.column_id) {
2244  return td->nShards;
2245  }
2246  }
2247  return 0;
2248 }