OmniSciDB  f17484ade4
GroupByAndAggregate.cpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "GroupByAndAggregate.h"
18 #include "AggregateUtils.h"
19 
20 #include "CardinalityEstimator.h"
21 #include "CodeGenerator.h"
23 #include "ExpressionRange.h"
24 #include "ExpressionRewrite.h"
25 #include "GpuInitGroups.h"
26 #include "InPlaceSort.h"
28 #include "MaxwellCodegenPatch.h"
30 #include "TargetExprBuilder.h"
31 
32 #include "../CudaMgr/CudaMgr.h"
33 #include "../Shared/checked_alloc.h"
34 #include "../Shared/funcannotations.h"
35 #include "../Utils/ChunkIter.h"
37 #include "Execute.h"
38 #include "QueryTemplateGenerator.h"
39 #include "RuntimeFunctions.h"
40 #include "Shared/misc.h"
41 #include "StreamingTopN.h"
42 #include "TopKSort.h"
43 #include "WindowContext.h"
44 
45 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
46 
47 #include <cstring> // strcat()
48 #include <limits>
49 #include <numeric>
50 #include <string_view>
51 #include <thread>
52 
53 bool g_cluster{false};
54 bool g_bigint_count{false};
57 extern int64_t g_bitmap_memory_limit;
58 extern size_t g_leaf_count;
59 
60 bool ColRangeInfo::isEmpty() const {
61  return min == 0 && max == -1;
62 }
63 
64 std::ostream& operator<<(std::ostream& out, const ColRangeInfo& info) {
65  out << "Hash Type = " << info.hash_type_ << " min = " << info.min
66  << " max = " << info.max << " bucket = " << info.bucket
67  << " has_nulls = " << info.has_nulls << "\n";
68  return out;
69 }
70 
71 std::ostream& operator<<(std::ostream& out, const CountDistinctImplType& type) {
72  switch (type) {
73  case CountDistinctImplType::Invalid:
74  out << "Invalid";
75  break;
76  case CountDistinctImplType::Bitmap:
77  out << "Bitmap";
78  break;
79  case CountDistinctImplType::UnorderedSet:
80  out << "UnorderedSet";
81  break;
82  default:
83  out << "<Unknown Type>";
84  break;
85  }
86  return out;
87 }
88 
89 std::ostream& operator<<(std::ostream& out, const CountDistinctDescriptor& desc) {
90  out << "Type = " << desc.impl_type_ << " min val = " << desc.min_val
91  << " bitmap_sz_bits = " << desc.bitmap_sz_bits
92  << " bool approximate = " << desc.approximate
93  << " device_type = " << desc.device_type
94  << " sub_bitmap_count = " << desc.sub_bitmap_count;
95  return out;
96 }
97 
98 namespace {
99 
100 int32_t get_agg_count(const std::vector<Analyzer::Expr*>& target_exprs) {
101  int32_t agg_count{0};
102  for (auto target_expr : target_exprs) {
103  CHECK(target_expr);
104  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
105  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
106  const auto& ti = target_expr->get_type_info();
107  if (ti.is_buffer()) {
108  agg_count += 2;
109  } else if (ti.is_geometry()) {
110  agg_count += ti.get_physical_coord_cols() * 2;
111  } else {
112  ++agg_count;
113  }
114  continue;
115  }
116  if (agg_expr && agg_expr->get_aggtype() == kAVG) {
117  agg_count += 2;
118  } else {
119  ++agg_count;
120  }
121  }
122  return agg_count;
123 }
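// Illustrative note (editorial, not in the original file): for a target list like
// {x, AVG(y), COUNT(*)} this helper counts 1 slot for the projected x, 2 slots for
// AVG (sum and count), and 1 slot for COUNT, i.e. 4 in total; buffer and geometry
// projections take the extra slots handled above.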
124 
125 bool expr_is_rowid(const Analyzer::Expr* expr) {
126  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
127  if (!col) {
128  return false;
129  }
130  const auto cd = get_column_descriptor_maybe(col->getColumnKey());
131  if (!cd || !cd->isVirtualCol) {
132  return false;
133  }
134  CHECK_EQ("rowid", cd->columnName);
135  return true;
136 }
137 
138 bool has_count_distinct(const RelAlgExecutionUnit& ra_exe_unit) {
139  for (const auto& target_expr : ra_exe_unit.target_exprs) {
140  const auto agg_info = get_target_info(target_expr, g_bigint_count);
141  if (agg_info.is_agg && is_distinct_target(agg_info)) {
142  return true;
143  }
144  }
145  return false;
146 }
147 
148 bool is_column_range_too_big_for_perfect_hash(const ColRangeInfo& col_range_info,
149  const int64_t max_entry_count) {
150  try {
151  return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
152  checked_int64_t(col_range_info.min)) >= max_entry_count;
153  } catch (...) {
154  return true;
155  }
156 }
157 
158 bool cardinality_estimate_less_than_column_range(const int64_t cardinality_estimate,
159  const ColRangeInfo& col_range_info) {
160  try {
161  // The cardinality estimate is the size of the baseline hash table. Further penalize
162  // the baseline hash table by a factor of 2x due to the overhead of computing the
163  // baseline hash. This has the overall effect of penalizing baseline hash over perfect
164  // hash by 4x; i.e. if the cardinality of the filtered data is less than 25% of the
165  // entry count of the column, we use baseline hash on the filtered set.
166  return checked_int64_t(cardinality_estimate) * 2 <
167  static_cast<int64_t>(checked_int64_t(col_range_info.max) -
168  checked_int64_t(col_range_info.min));
169  } catch (...) {
170  return false;
171  }
172 }
173 
174 ColRangeInfo get_expr_range_info(const RelAlgExecutionUnit& ra_exe_unit,
175  const std::vector<InputTableInfo>& query_infos,
176  const Analyzer::Expr* expr,
177  Executor* executor) {
178  if (!expr) {
179  return {QueryDescriptionType::Projection, 0, 0, 0, false};
180  }
181 
182  const auto expr_range = getExpressionRange(
183  expr, query_infos, executor, boost::make_optional(ra_exe_unit.simple_quals));
184  switch (expr_range.getType()) {
185  case ExpressionRangeType::Integer: {
186  if (expr_range.getIntMin() > expr_range.getIntMax()) {
187  return {
188  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
189  }
190  return {QueryDescriptionType::GroupByPerfectHash,
191  expr_range.getIntMin(),
192  expr_range.getIntMax(),
193  expr_range.getBucket(),
194  expr_range.hasNulls()};
195  }
196  case ExpressionRangeType::Float:
197  case ExpressionRangeType::Double: {
198  if (expr_range.getFpMin() > expr_range.getFpMax()) {
199  return {
200  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
201  }
202  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
203  }
204  case ExpressionRangeType::Invalid:
205  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
206  default:
207  CHECK(false);
208  }
209  CHECK(false);
210  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
211 }
212 
213 } // namespace
214 
215 ColRangeInfo GroupByAndAggregate::getColRangeInfo() {
216  // Use baseline layout more eagerly on the GPU if the query uses count distinct,
217  // because our HyperLogLog implementation is 4x less memory efficient on GPU.
218  // Technically, this only applies to APPROX_COUNT_DISTINCT, but in practice we
219  // can expect this to be true anyway for grouped queries since the precise version
220  // uses significantly more memory.
221  const int64_t baseline_threshold =
223  // `group_cardinality_estimation_` is set as the result of (NDV) cardinality estimator
224  auto group_cardinality_estimation = group_cardinality_estimation_.value_or(0);
225  if (ra_exe_unit_.groupby_exprs.size() != 1) {
226  try {
227  checked_int64_t cardinality{1};
228  bool has_nulls{false};
229  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
230  auto col_range_info = get_expr_range_info(
231  ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
232  if (col_range_info.hash_type_ != QueryDescriptionType::GroupByPerfectHash) {
233  // going through baseline hash if a non-integer type is encountered
234  return {QueryDescriptionType::GroupByBaselineHash,
235  0,
236  group_cardinality_estimation,
237  0,
238  false};
239  }
240  auto crt_col_cardinality = getBucketedCardinality(col_range_info);
241  CHECK_GE(crt_col_cardinality, 0);
242  cardinality *= crt_col_cardinality;
243  if (col_range_info.has_nulls) {
244  has_nulls = true;
245  }
246  }
247  // For zero or high cardinalities, use baseline layout.
248  if (!cardinality || cardinality > baseline_threshold) {
249  return {QueryDescriptionType::GroupByBaselineHash,
250  0,
251  group_cardinality_estimation,
252  0,
253  false};
254  }
255  // TODO(yoonmin): should we consider min(group_cardinality_estimation,
256  // cardinality) if we have a `group_cardinality_estimation` value?
257  return {QueryDescriptionType::GroupByPerfectHash,
258  0,
259  int64_t(cardinality),
260  0,
261  has_nulls};
262  } catch (...) { // overflow when computing cardinality
263  return {QueryDescriptionType::GroupByBaselineHash,
264  0,
265  group_cardinality_estimation,
266  0,
267  false};
268  }
269  }
270  // For single column groupby on high timestamps, force baseline hash due to wide ranges
271  // we are likely to encounter when applying quals to the expression range
272  // TODO: consider allowing TIMESTAMP(9) (nanoseconds) with quals to use perfect hash if
273  // the range is small enough
274  if (ra_exe_unit_.groupby_exprs.front() &&
275  ra_exe_unit_.groupby_exprs.front()->get_type_info().is_high_precision_timestamp() &&
276  ra_exe_unit_.simple_quals.size() > 0) {
277  return {QueryDescriptionType::GroupByBaselineHash,
278  0,
279  group_cardinality_estimation,
280  0,
281  false};
282  }
283  const auto col_range_info = get_expr_range_info(
284  ra_exe_unit_, query_infos_, ra_exe_unit_.groupby_exprs.front().get(), executor_);
285  if (!ra_exe_unit_.groupby_exprs.front()) {
286  return col_range_info;
287  }
288  static const int64_t MAX_BUFFER_SIZE = 1 << 30;
289  const int64_t col_count =
290  ra_exe_unit_.groupby_exprs.size() + ra_exe_unit_.target_exprs.size();
291  int64_t max_entry_count = MAX_BUFFER_SIZE / (col_count * sizeof(int64_t));
292  if (has_count_distinct(ra_exe_unit_)) {
293  max_entry_count = std::min(max_entry_count, baseline_threshold);
294  }
295  const auto& groupby_expr_ti = ra_exe_unit_.groupby_exprs.front()->get_type_info();
296  if (groupby_expr_ti.is_string() && !col_range_info.bucket) {
297  CHECK(groupby_expr_ti.get_compression() == kENCODING_DICT);
298 
299  const bool has_filters =
300  !ra_exe_unit_.quals.empty() || !ra_exe_unit_.simple_quals.empty();
301  if (has_filters &&
302  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
303  // if filters are present, we can use the filter to narrow the cardinality of the
304  // group by in the case of ranges too big for perfect hash. Otherwise, we are better
305  // off attempting perfect hash (since we know the range will be made of
306  // monotonically increasing numbers from min to max for dictionary encoded strings)
307  // and failing later due to excessive memory use.
308  // Check the conditions where baseline hash can provide a performance increase and
309  // return baseline hash (potentially forcing an estimator query) as the range type.
310  // Otherwise, return col_range_info which will likely be perfect hash, though could
311  // be baseline from a previous call of this function prior to the estimator query.
312  if (!ra_exe_unit_.sort_info.order_entries.empty()) {
313  // TODO(adb): allow some sorts to pass through this block by centralizing sort
314  // algorithm decision making
315  if (has_count_distinct(ra_exe_unit_)) {
316  // always use baseline hash for column range too big for perfect hash with count
317  // distinct descriptors. We will need 8GB of CPU memory minimum for the perfect
318  // hash group by in this case.
319  return {QueryDescriptionType::GroupByBaselineHash,
320  col_range_info.min,
321  col_range_info.max,
322  0,
323  col_range_info.has_nulls};
324  } else {
325  // use original col range for sort
326  return col_range_info;
327  }
328  }
329  // if filters are present and the filtered range is less than the cardinality of
330  // the column, consider baseline hash
331  if (group_cardinality_estimation_ &&
332  cardinality_estimate_less_than_column_range(*group_cardinality_estimation_,
333  col_range_info)) {
334  return {QueryDescriptionType::GroupByBaselineHash,
335  col_range_info.min,
336  col_range_info.max,
337  0,
338  col_range_info.has_nulls};
339  }
340  }
341  } else if ((!expr_is_rowid(ra_exe_unit_.groupby_exprs.front().get())) &&
342  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count) &&
343  !col_range_info.bucket) {
344  return {QueryDescriptionType::GroupByBaselineHash,
345  col_range_info.min,
346  col_range_info.max,
347  0,
348  col_range_info.has_nulls};
349  }
350  return col_range_info;
351 }
352 
353 int64_t GroupByAndAggregate::getBucketedCardinality(const ColRangeInfo& col_range_info) {
354  checked_int64_t crt_col_cardinality =
355  checked_int64_t(col_range_info.max) - checked_int64_t(col_range_info.min);
356  if (col_range_info.bucket) {
357  crt_col_cardinality /= col_range_info.bucket;
358  }
359  return static_cast<int64_t>(crt_col_cardinality +
360  (1 + (col_range_info.has_nulls ? 1 : 0)));
361 }
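// Worked example (editorial): for min = 10, max = 1000, bucket = 10 and
// has_nulls = true, the bucketed cardinality is (1000 - 10) / 10 + 1 + 1 = 101,
// where the final + 1 reserves a slot for the null key.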
362 
363 namespace {
364 // Like getBucketedCardinality() without counting nulls.
365 int64_t get_bucketed_cardinality_without_nulls(const ColRangeInfo& col_range_info) {
366  if (col_range_info.min <= col_range_info.max) {
367  size_t size = col_range_info.max - col_range_info.min;
368  if (col_range_info.bucket) {
369  size /= col_range_info.bucket;
370  }
371  if (size >= static_cast<size_t>(std::numeric_limits<int64_t>::max())) {
372  // try to use unordered_set instead of crashing due to CHECK failure
373  // i.e., CHECK_LT(size, std::numeric_limits<int64_t>::max());
374  return 0;
375  }
376  return static_cast<int64_t>(size + 1);
377  } else {
378  return 0;
379  }
380 }
381 } // namespace
382 
383 #define LL_CONTEXT executor_->cgen_state_->context_
384 #define LL_BUILDER executor_->cgen_state_->ir_builder_
385 #define LL_BOOL(v) executor_->cgen_state_->llBool(v)
386 #define LL_INT(v) executor_->cgen_state_->llInt(v)
387 #define LL_FP(v) executor_->cgen_state_->llFp(v)
388 #define ROW_FUNC executor_->cgen_state_->row_func_
389 #define CUR_FUNC executor_->cgen_state_->current_func_
390 
391 GroupByAndAggregate::GroupByAndAggregate(
392  Executor* executor,
393  const ExecutorDeviceType device_type,
394  const RelAlgExecutionUnit& ra_exe_unit,
395  const std::vector<InputTableInfo>& query_infos,
396  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
397  const std::optional<int64_t>& group_cardinality_estimation)
398  : executor_(executor)
399  , ra_exe_unit_(ra_exe_unit)
400  , query_infos_(query_infos)
401  , row_set_mem_owner_(row_set_mem_owner)
402  , device_type_(device_type)
403  , group_cardinality_estimation_(group_cardinality_estimation) {
404  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
405  if (!groupby_expr) {
406  continue;
407  }
408  const auto& groupby_ti = groupby_expr->get_type_info();
409  if (groupby_ti.is_text_encoding_none()) {
410  throw std::runtime_error(
411  "Cannot group by string columns which are not dictionary encoded.");
412  }
413  if (groupby_ti.is_buffer()) {
414  throw std::runtime_error("Group by buffer not supported");
415  }
416  if (groupby_ti.is_geometry()) {
417  throw std::runtime_error("Group by geometry not supported");
418  }
419  }
420 }
421 
422 int64_t GroupByAndAggregate::getShardedTopBucket(const ColRangeInfo& col_range_info,
423  const size_t shard_count) const {
424  size_t device_count{0};
425  if (device_type_ == ExecutorDeviceType::GPU) {
426  device_count = executor_->cudaMgr()->getDeviceCount();
427  CHECK_GT(device_count, 0u);
428  }
429 
430  int64_t bucket{col_range_info.bucket};
431 
432  if (shard_count) {
433  CHECK(!col_range_info.bucket);
434  /*
435  when a node has fewer devices than shard count,
436  a) In a distributed setup, the minimum distance between two keys would be
437  device_count because shards are stored consecutively across the physical tables,
438  i.e if a shard column has values 0 to 9, and 3 shards on each leaf, then node 1
439  would have values: 0,1,2,6,7,8 and node 2 would have values: 3,4,5,9. If each leaf
440  node has only 1 device, then all of the keys from that node are loaded onto its
441  single device.
442 
443  b) In a single node setup, the distance would be the minimum of device_count and
444  the difference (shard_count - device_count). For example: if a single node server
445  running on 3 devices has a shard column with values 0 to 9 in a table with 4 shards,
446  the device to fragment keys mapping would be: device 1 - 4,8,3,7; device 2 - 1,5,9;
447  device 3 - 2,6. The bucket value would be 4 (shards) - 3 (devices) = 1, i.e. the minimum
448  of device_count and the difference.
449 
450  When a node has device count equal to or more than shard count then the
451  minimum distance is always at least shard_count * no of leaf nodes.
452  */
453  if (device_count < shard_count) {
454  bucket = g_leaf_count ? std::max(device_count, static_cast<size_t>(1))
455  : std::min(device_count, shard_count - device_count);
456  } else {
457  bucket = shard_count * std::max(g_leaf_count, static_cast<size_t>(1));
458  }
459  }
460 
461  return bucket;
462 }
463 
464 namespace {
465 
475 KeylessInfo get_keyless_info(const RelAlgExecutionUnit& ra_exe_unit,
476  const std::vector<InputTableInfo>& query_infos,
477  const bool is_group_by,
478  Executor* executor) {
479  bool keyless{true}, found{false};
480  int32_t num_agg_expr{0};
481  int32_t index{0};
482  for (const auto target_expr : ra_exe_unit.target_exprs) {
483  const auto agg_info = get_target_info(target_expr, g_bigint_count);
484  const auto chosen_type = get_compact_type(agg_info);
485  if (agg_info.is_agg) {
486  num_agg_expr++;
487  }
488  if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
489  auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
490  CHECK(agg_expr);
491  const auto arg_expr = agg_arg(target_expr);
492  const bool float_argument_input = takes_float_argument(agg_info);
493  switch (agg_info.agg_kind) {
494  case kAVG:
495  ++index;
496  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
497  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
498  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
499  expr_range_info.hasNulls()) {
500  break;
501  }
502  }
503  found = true;
504  break;
505  case kCOUNT:
506  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
507  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
508  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
509  expr_range_info.hasNulls()) {
510  break;
511  }
512  }
513  found = true;
514  break;
515  case kSUM: {
516  auto arg_ti = arg_expr->get_type_info();
517  if (constrained_not_null(arg_expr, ra_exe_unit.quals)) {
518  arg_ti.set_notnull(true);
519  }
520  if (!arg_ti.get_notnull()) {
521  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
522  if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
523  !expr_range_info.hasNulls()) {
524  found = true;
525  }
526  } else {
527  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
528  switch (expr_range_info.getType()) {
529  case ExpressionRangeType::Float:
530  case ExpressionRangeType::Double:
531  if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
532  found = true;
533  }
534  break;
535  case ExpressionRangeType::Integer:
536  if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
537  found = true;
538  }
539  break;
540  default:
541  break;
542  }
543  }
544  break;
545  }
546  case kMIN: {
547  CHECK(agg_expr && agg_expr->get_arg());
548  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
549  if (arg_ti.is_string() || arg_ti.is_buffer()) {
550  break;
551  }
552  auto expr_range_info =
553  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
554  auto init_max = get_agg_initial_val(agg_info.agg_kind,
555  chosen_type,
556  is_group_by || float_argument_input,
557  float_argument_input ? sizeof(float) : 8);
558  switch (expr_range_info.getType()) {
559  case ExpressionRangeType::Float:
560  case ExpressionRangeType::Double: {
561  auto double_max =
562  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
563  if (expr_range_info.getFpMax() < double_max) {
564  found = true;
565  }
566  break;
567  }
568  case ExpressionRangeType::Integer:
569  if (expr_range_info.getIntMax() < init_max) {
570  found = true;
571  }
572  break;
573  default:
574  break;
575  }
576  break;
577  }
578  case kMAX: {
579  CHECK(agg_expr && agg_expr->get_arg());
580  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
581  if (arg_ti.is_string() || arg_ti.is_buffer()) {
582  break;
583  }
584  auto expr_range_info =
585  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
586  // NULL sentinel and init value for kMAX are identical, which results in
587  // ambiguity in detecting empty keys in presence of nulls.
588  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
589  expr_range_info.hasNulls()) {
590  break;
591  }
592  auto init_min = get_agg_initial_val(agg_info.agg_kind,
593  chosen_type,
594  is_group_by || float_argument_input,
595  float_argument_input ? sizeof(float) : 8);
596  switch (expr_range_info.getType()) {
597  case ExpressionRangeType::Float:
598  case ExpressionRangeType::Double: {
599  auto double_min =
600  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
601  if (expr_range_info.getFpMin() > double_min) {
602  found = true;
603  }
604  break;
605  }
606  case ExpressionRangeType::Integer:
607  if (expr_range_info.getIntMin() > init_min) {
608  found = true;
609  }
610  break;
611  default:
612  break;
613  }
614  break;
615  }
616  default:
617  keyless = false;
618  break;
619  }
620  }
621  if (!keyless) {
622  break;
623  }
624  if (!found) {
625  ++index;
626  }
627  }
628 
629  // shouldn't use keyless for projection only
630  return {
631  keyless && found,
632  index,
633  };
634 }
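// Illustrative note (editorial): "keyless" means the group key itself is not
// materialized; the aggregate column at the returned index doubles as the
// presence marker for a group. A query of the form
//   SELECT x, COUNT(*) FROM t GROUP BY x;
// can typically take this path, since COUNT(*) is never null for a populated bin.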
635 
636 CountDistinctDescriptors init_count_distinct_descriptors(
637  const RelAlgExecutionUnit& ra_exe_unit,
638  const std::vector<InputTableInfo>& query_infos,
639  const ColRangeInfo& group_by_range_info,
640  const ExecutorDeviceType device_type,
641  Executor* executor) {
642  CountDistinctDescriptors count_distinct_descriptors;
643  auto compute_bytes_per_group =
644  [](size_t bitmap_sz, size_t sub_bitmap_count, ExecutorDeviceType device_type) {
645  size_t effective_size_bytes = (bitmap_sz + 7) / 8;
646  const auto padded_size =
647  (device_type == ExecutorDeviceType::GPU || sub_bitmap_count > 1)
648  ? align_to_int64(effective_size_bytes)
649  : effective_size_bytes;
650  return padded_size * sub_bitmap_count;
651  };
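// Worked example (editorial): bitmap_sz = 100 bits -> (100 + 7) / 8 = 13 bytes;
// on GPU, or when sub_bitmap_count > 1, this is padded to 16 bytes by
// align_to_int64, so sub_bitmap_count = 4 yields 16 * 4 = 64 bytes per group.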
652  for (size_t i = 0; i < ra_exe_unit.target_exprs.size(); i++) {
653  const auto target_expr = ra_exe_unit.target_exprs[i];
654  auto agg_info = get_target_info(target_expr, g_bigint_count);
655  if (is_distinct_target(agg_info)) {
656  CHECK(agg_info.is_agg);
657  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
658  const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
659  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
660  if (arg_ti.is_text_encoding_none()) {
661  throw std::runtime_error(
662  "Strings must be dictionary-encoded for COUNT(DISTINCT).");
663  }
664  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_buffer()) {
665  throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
666  }
667  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
668  throw std::runtime_error(
669  "APPROX_COUNT_DISTINCT on geometry columns not supported");
670  }
671  if (agg_info.is_distinct && arg_ti.is_geometry()) {
672  throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
673  }
674  ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
675  auto arg_range_info =
676  arg_ti.is_fp() ? no_range_info
677  : get_expr_range_info(
678  ra_exe_unit, query_infos, agg_expr->get_arg(), executor);
679  const auto it = ra_exe_unit.target_exprs_original_type_infos.find(i);
680  if (it != ra_exe_unit.target_exprs_original_type_infos.end()) {
681  const auto& original_target_expr_ti = it->second;
682  if (arg_ti.is_integer() && original_target_expr_ti.get_type() == kDATE &&
683  original_target_expr_ti.get_compression() == kENCODING_DATE_IN_DAYS) {
684  // manually encode the col range of date col if necessary
685  // (see conditionally_change_arg_to_int_type function in RelAlgExecutor.cpp)
686  auto is_date_value_not_encoded = [&original_target_expr_ti](int64_t date_val) {
687  if (original_target_expr_ti.get_comp_param() == 16) {
688  return date_val < INT16_MIN || date_val > INT16_MAX;
689  } else {
690  return date_val < INT32_MIN || date_val > INT32_MAX;
691  }
692  };
693  if (is_date_value_not_encoded(arg_range_info.min)) {
694  // chunk metadata of the date column contains decoded value
695  // so we manually encode it again here to represent its column range correctly
696  arg_range_info.min =
697  DateConverters::get_epoch_days_from_seconds(arg_range_info.min);
698  }
699  if (is_date_value_not_encoded(arg_range_info.max)) {
700  arg_range_info.max =
701  DateConverters::get_epoch_days_from_seconds(arg_range_info.max);
702  }
703  // now we manually encode the value, so we need to invalidate the bucket value,
704  // i.e., 86400 -> 0, to correctly calculate the size of the bitmap
705  arg_range_info.bucket = 0;
706  }
707  }
708 
709  CountDistinctImplType count_distinct_impl_type{CountDistinctImplType::UnorderedSet};
710  int64_t bitmap_sz_bits{0};
711  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
712  const auto error_rate_expr = agg_expr->get_arg1();
713  if (error_rate_expr) {
714  CHECK(error_rate_expr->get_type_info().get_type() == kINT);
715  auto const error_rate =
716  dynamic_cast<Analyzer::Constant const*>(error_rate_expr.get());
717  CHECK(error_rate);
718  CHECK_GE(error_rate->get_constval().intval, 1);
719  bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
720  } else {
721  bitmap_sz_bits = g_hll_precision_bits;
722  }
723  }
724  if (arg_range_info.isEmpty()) {
725  count_distinct_descriptors.emplace_back(
726  CountDistinctDescriptor{CountDistinctImplType::Bitmap,
727  0,
728  64,
729  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
730  device_type,
731  1});
732  continue;
733  }
734  const auto sub_bitmap_count =
735  get_count_distinct_sub_bitmap_count(bitmap_sz_bits, ra_exe_unit, device_type);
736  size_t worst_case_num_groups{1};
737  if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
738  !(arg_ti.is_buffer() || arg_ti.is_geometry())) { // TODO(alex): allow bitmap
739  // implementation for arrays
740  count_distinct_impl_type = CountDistinctImplType::Bitmap;
741  if (shared::is_any<kCOUNT, kCOUNT_IF>(agg_info.agg_kind)) {
742  bitmap_sz_bits = get_bucketed_cardinality_without_nulls(arg_range_info);
743  if (bitmap_sz_bits <= 0 || g_bitmap_memory_limit <= bitmap_sz_bits) {
744  count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
745  }
746  // check a potential OOM when using bitmap-based approach
747  const auto total_bytes_per_entry =
748  compute_bytes_per_group(bitmap_sz_bits, sub_bitmap_count, device_type);
749  const auto range_bucket = std::max(group_by_range_info.bucket, (int64_t)1);
750  const auto maximum_num_groups =
751  (group_by_range_info.max - group_by_range_info.min + 1) / range_bucket;
752  const auto total_bitmap_bytes_for_groups =
753  total_bytes_per_entry * maximum_num_groups;
754  // we can estimate a potential OOM of bitmap-based count-distinct operator
755  // by using the logic "check_total_bitmap_memory"
756  if (total_bitmap_bytes_for_groups >=
757  static_cast<size_t>(g_bitmap_memory_limit)) {
758  const auto agg_expr_max_entry_count =
759  arg_range_info.max - arg_range_info.min + 1;
760  int64_t max_agg_expr_table_cardinality{1};
761  std::set<const Analyzer::ColumnVar*,
762  bool (*)(const Analyzer::ColumnVar*, const Analyzer::ColumnVar*)>
763  colvar_set(Analyzer::ColumnVar::colvar_comp);
764  agg_expr->collect_column_var(colvar_set, true);
765  for (const auto cv : colvar_set) {
766  auto it =
767  std::find_if(query_infos.begin(),
768  query_infos.end(),
769  [&](const auto& input_table_info) {
770  return input_table_info.table_key == cv->getTableKey();
771  });
772  int64_t cur_table_cardinality =
773  it != query_infos.end()
774  ? static_cast<int64_t>(it->info.getNumTuplesUpperBound())
775  : -1;
776  max_agg_expr_table_cardinality =
777  std::max(max_agg_expr_table_cardinality, cur_table_cardinality);
778  worst_case_num_groups *= cur_table_cardinality;
779  }
780  auto has_valid_stat = [agg_expr_max_entry_count, maximum_num_groups]() {
781  return agg_expr_max_entry_count > 0 && maximum_num_groups > 0;
782  };
783  // if we have valid stats regarding input expr, we can try to relax the OOM
784  if (has_valid_stat()) {
785  // a threshold related to a ratio of a range of agg expr (let's say R)
786  // and table cardinality (C), i.e., use unordered_set if the # bits to build
787  // a bitmap based on R is four times larger than that of C
788  const size_t unordered_set_threshold{2};
789  // When we detect OOM of bitmap-based approach we selectively switch it to
790  // hash set-based processing logic if one of the followings is satisfied:
791  // 1) the column range is too wide compared with the table cardinality, or
792  // 2) the column range is too wide compared with the avg of # unique values
793  // per group by entry
794  const auto bits_for_agg_entry = std::ceil(log(agg_expr_max_entry_count));
795  const auto bits_for_agg_table =
796  std::ceil(log(max_agg_expr_table_cardinality));
797  const auto avg_num_unique_entries_per_group =
798  std::ceil(max_agg_expr_table_cardinality / maximum_num_groups);
799  // case a) given the range of the entry count of agg_expr and the maximum
800  // cardinality among the source tables of the agg_expr, we try to detect the
801  // misleading case of a too-sparse column range, i.e., agg_expr has a 1M column
802  // range but only two tuples {1 and 1M} / case b) check whether
803  // using a bitmap is really beneficial when assuming a uniform distribution
804  // of (unique) keys.
805  if ((bits_for_agg_entry - bits_for_agg_table) >= unordered_set_threshold ||
806  agg_expr_max_entry_count >= avg_num_unique_entries_per_group) {
807  count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
808  } else {
809  throw std::runtime_error(
810  "Consider using approx_count_distinct operator instead of "
811  "count_distinct operator to lower the memory "
812  "requirements");
813  }
814  }
815  }
816  }
817  }
818  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
819  count_distinct_impl_type == CountDistinctImplType::UnorderedSet &&
820  !(arg_ti.is_array() || arg_ti.is_geometry())) {
821  count_distinct_impl_type = CountDistinctImplType::Bitmap;
822  }
823  const size_t too_many_entries{100000000};
824  if (g_enable_watchdog && !(arg_range_info.isEmpty()) &&
825  worst_case_num_groups > too_many_entries &&
826  count_distinct_impl_type == CountDistinctImplType::UnorderedSet) {
827  throw WatchdogException(
828  "Detect too many input entries for set-based count distinct operator under "
829  "the watchdog");
830  }
831  count_distinct_descriptors.emplace_back(
832  CountDistinctDescriptor{count_distinct_impl_type,
833  arg_range_info.min,
834  bitmap_sz_bits,
835  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
836  device_type,
837  sub_bitmap_count});
838  } else {
839  count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
840  CountDistinctImplType::Invalid, 0, 0, false, device_type, 0});
841  }
842  }
843  return count_distinct_descriptors;
844 }
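// Illustrative summary (editorial): COUNT(DISTINCT x) over a perfect-hashable
// integer or dictionary-encoded column of range [0, 1'000'000) typically gets a
// Bitmap descriptor of roughly one million bits (~125 KB padded) per group; when
// the total bitmap footprint across groups would exceed g_bitmap_memory_limit,
// the logic above may fall back to the UnorderedSet implementation or suggest
// APPROX_COUNT_DISTINCT instead.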
845 
846 } // namespace
847 
848 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptor(
849  const bool allow_multifrag,
850  const size_t max_groups_buffer_entry_count,
851  const int8_t crt_min_byte_width,
852  RenderInfo* render_info,
853  const bool output_columnar_hint) {
854  const auto shard_count = device_type_ == ExecutorDeviceType::GPU
856  : 0;
857  bool sort_on_gpu_hint =
858  device_type_ == ExecutorDeviceType::GPU && allow_multifrag &&
859  !ra_exe_unit_.sort_info.order_entries.empty() &&
860  gpuCanHandleOrderEntries(ra_exe_unit_.sort_info.order_entries) && !shard_count;
861  // must_use_baseline_sort is true iff we'd sort on GPU with the old algorithm
862  // but the total output buffer size would be too big or it's a sharded top query.
863  // For the sake of managing risk, use the new result set way very selectively for
864  // this case only (alongside the baseline layout we've enabled for a while now).
865  bool must_use_baseline_sort = shard_count;
866  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
867  while (true) {
868  query_mem_desc = initQueryMemoryDescriptorImpl(allow_multifrag,
869  max_groups_buffer_entry_count,
870  crt_min_byte_width,
871  sort_on_gpu_hint,
872  render_info,
873  must_use_baseline_sort,
874  output_columnar_hint);
875  CHECK(query_mem_desc);
876  if (query_mem_desc->sortOnGpu() &&
877  (query_mem_desc->getBufferSizeBytes(device_type_) +
878  align_to_int64(query_mem_desc->getEntryCount() * sizeof(int32_t))) >
879  2 * 1024 * 1024 * 1024LL) {
880  must_use_baseline_sort = true;
881  sort_on_gpu_hint = false;
882  } else {
883  break;
884  }
885  }
886  return query_mem_desc;
887 }
888 
889 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptorImpl(
890  const bool allow_multifrag,
891  const size_t max_groups_buffer_entry_count,
892  const int8_t crt_min_byte_width,
893  const bool sort_on_gpu_hint,
894  RenderInfo* render_info,
895  const bool must_use_baseline_sort,
896  const bool output_columnar_hint) {
897  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
898 
899  const bool threads_can_reuse_group_by_buffers =
900  device_type_ == ExecutorDeviceType::CPU && is_group_by &&
901  ra_exe_unit_.groupby_exprs.front();
902 
903  auto col_range_info_nosharding = getColRangeInfo();
904 
905  const auto shard_count = device_type_ == ExecutorDeviceType::GPU
907  : 0;
908 
909  const auto col_range_info =
910  ColRangeInfo{col_range_info_nosharding.hash_type_,
911  col_range_info_nosharding.min,
912  col_range_info_nosharding.max,
913  getShardedTopBucket(col_range_info_nosharding, shard_count),
914  col_range_info_nosharding.has_nulls};
915 
916  // Non-grouped aggregates do not support accessing aggregated ranges
917  // Keyless hash is currently only supported with single-column perfect hash
918  const auto keyless_info =
919  !(is_group_by &&
920  col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash)
921  ? KeylessInfo{false, -1}
922  : get_keyless_info(ra_exe_unit_, query_infos_, is_group_by, executor_);
923 
924  if (g_enable_watchdog &&
925  ((col_range_info.hash_type_ == QueryDescriptionType::GroupByBaselineHash &&
926  max_groups_buffer_entry_count > g_watchdog_baseline_max_groups) ||
927  (col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
928  ra_exe_unit_.groupby_exprs.size() == 1 &&
929  (col_range_info.max - col_range_info.min) /
930  std::max(col_range_info.bucket, int64_t(1)) >
931  130000000))) {
932  throw WatchdogException("Query would use too much memory");
933  }
934 
935  const auto count_distinct_descriptors = init_count_distinct_descriptors(
936  ra_exe_unit_, query_infos_, col_range_info, device_type_, executor_);
937  try {
938  return QueryMemoryDescriptor::init(executor_,
939  ra_exe_unit_,
940  query_infos_,
941  col_range_info,
942  keyless_info,
943  allow_multifrag,
944  device_type_,
945  crt_min_byte_width,
946  sort_on_gpu_hint,
947  shard_count,
948  max_groups_buffer_entry_count,
949  render_info,
950  count_distinct_descriptors,
951  must_use_baseline_sort,
952  output_columnar_hint,
953  /*streaming_top_n_hint=*/true,
954  threads_can_reuse_group_by_buffers);
955  } catch (const StreamingTopNOOM& e) {
956  LOG(WARNING) << e.what() << " Disabling Streaming Top N.";
957  return QueryMemoryDescriptor::init(executor_,
958  ra_exe_unit_,
959  query_infos_,
960  col_range_info,
961  keyless_info,
962  allow_multifrag,
963  device_type_,
964  crt_min_byte_width,
965  sort_on_gpu_hint,
966  shard_count,
967  max_groups_buffer_entry_count,
968  render_info,
969  count_distinct_descriptors,
970  must_use_baseline_sort,
971  output_columnar_hint,
972  /*streaming_top_n_hint=*/false,
973  threads_can_reuse_group_by_buffers);
974  }
975 }
976 
977 bool GroupByAndAggregate::gpuCanHandleOrderEntries(
978  const std::list<Analyzer::OrderEntry>& order_entries) {
979  if (order_entries.size() > 1) { // TODO(alex): lift this restriction
980  return false;
981  }
982  for (const auto& order_entry : order_entries) {
983  CHECK_GE(order_entry.tle_no, 1);
984  CHECK_LE(static_cast<size_t>(order_entry.tle_no), ra_exe_unit_.target_exprs.size());
985  const auto target_expr = ra_exe_unit_.target_exprs[order_entry.tle_no - 1];
986  if (!dynamic_cast<Analyzer::AggExpr*>(target_expr)) {
987  return false;
988  }
989  // TODO(alex): relax the restrictions
990  auto agg_expr = static_cast<Analyzer::AggExpr*>(target_expr);
991  if (agg_expr->get_is_distinct() || agg_expr->get_aggtype() == kAVG ||
992  agg_expr->get_aggtype() == kMIN || agg_expr->get_aggtype() == kMAX ||
993  agg_expr->get_aggtype() == kAPPROX_COUNT_DISTINCT) {
994  return false;
995  }
996  if (agg_expr->get_arg()) {
997  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
998  if (arg_ti.is_fp()) {
999  return false;
1000  }
1001  auto expr_range_info =
1002  get_expr_range_info(ra_exe_unit_, query_infos_, agg_expr->get_arg(), executor_);
1003  // TODO(adb): QMD not actually initialized here?
1004  if ((!(expr_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
1005  /* query_mem_desc.getGroupbyColCount() == 1 */ false) ||
1006  expr_range_info.has_nulls) &&
1007  order_entry.is_desc == order_entry.nulls_first) {
1008  return false;
1009  }
1010  }
1011  const auto& target_ti = target_expr->get_type_info();
1012  CHECK(!target_ti.is_buffer());
1013  if (!target_ti.is_integer()) {
1014  return false;
1015  }
1016  }
1017  return true;
1018 }
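// Illustrative note (editorial): at present this admits only a single ORDER BY
// entry that targets a non-distinct, non-floating-point aggregate with an integer
// result, e.g.
//   SELECT x, COUNT(*) AS n FROM t GROUP BY x ORDER BY n DESC LIMIT 10;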
1019 
1020 bool GroupByAndAggregate::codegen(llvm::Value* filter_result,
1021  llvm::BasicBlock* sc_false,
1022  QueryMemoryDescriptor& query_mem_desc,
1023  const CompilationOptions& co,
1024  const GpuSharedMemoryContext& gpu_smem_context) {
1025  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1026  CHECK(filter_result);
1027 
1028  bool can_return_error = false;
1029  llvm::BasicBlock* filter_false{nullptr};
1030 
1031  {
1032  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
1033 
1034  if (executor_->isArchMaxwell(co.device_type)) {
1035  prependForceSync();
1036  }
1037  DiamondCodegen filter_cfg(filter_result,
1038  executor_,
1039  !is_group_by || query_mem_desc.usesGetGroupValueFast(),
1040  "filter", // filter_true and filter_false basic blocks
1041  nullptr,
1042  false);
1043  filter_false = filter_cfg.cond_false_;
1044 
1045  if (is_group_by) {
1046  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection &&
1047  !query_mem_desc.useStreamingTopN()) {
1048  const auto crt_matched = get_arg_by_name(ROW_FUNC, "crt_matched");
1049  LL_BUILDER.CreateStore(LL_INT(int32_t(1)), crt_matched);
1050  auto total_matched_ptr = get_arg_by_name(ROW_FUNC, "total_matched");
1051  llvm::Value* old_total_matched_val{nullptr};
1052  if (query_mem_desc.threadsShareMemory()) {
1053  old_total_matched_val =
1054  LL_BUILDER.CreateAtomicRMW(llvm::AtomicRMWInst::Add,
1055  total_matched_ptr,
1056  LL_INT(int32_t(1)),
1057 #if LLVM_VERSION_MAJOR > 12
1058  LLVM_ALIGN(8),
1059 #endif
1060  llvm::AtomicOrdering::Monotonic);
1061  } else {
1062  old_total_matched_val = LL_BUILDER.CreateLoad(
1063  total_matched_ptr->getType()->getPointerElementType(), total_matched_ptr);
1064  LL_BUILDER.CreateStore(
1065  LL_BUILDER.CreateAdd(old_total_matched_val, LL_INT(int32_t(1))),
1066  total_matched_ptr);
1067  }
1068  auto old_total_matched_ptr = get_arg_by_name(ROW_FUNC, "old_total_matched");
1069  LL_BUILDER.CreateStore(old_total_matched_val, old_total_matched_ptr);
1070  }
1071 
1072  auto agg_out_ptr_w_idx = codegenGroupBy(query_mem_desc, co, filter_cfg);
1073  auto varlen_output_buffer = codegenVarlenOutputBuffer(query_mem_desc);
1074  if (query_mem_desc.usesGetGroupValueFast() ||
1075  query_mem_desc.getQueryDescriptionType() ==
1076  QueryDescriptionType::GroupByPerfectHash) {
1077  if (query_mem_desc.getGroupbyColCount() > 1) {
1078  filter_cfg.setChainToNext();
1079  }
1080  // Don't generate null checks if the group slot is guaranteed to be non-null,
1081  // as it's the case for get_group_value_fast* family.
1082  can_return_error = codegenAggCalls(agg_out_ptr_w_idx,
1083  varlen_output_buffer,
1084  {},
1085  query_mem_desc,
1086  co,
1087  gpu_smem_context,
1088  filter_cfg);
1089  } else {
1090  {
1091  llvm::Value* nullcheck_cond{nullptr};
1092  if (query_mem_desc.didOutputColumnar()) {
1093  nullcheck_cond = LL_BUILDER.CreateICmpSGE(std::get<1>(agg_out_ptr_w_idx),
1094  LL_INT(int32_t(0)));
1095  } else {
1096  nullcheck_cond = LL_BUILDER.CreateICmpNE(
1097  std::get<0>(agg_out_ptr_w_idx),
1098  llvm::ConstantPointerNull::get(
1099  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)));
1100  }
1101  DiamondCodegen nullcheck_cfg(
1102  nullcheck_cond, executor_, false, "groupby_nullcheck", &filter_cfg, false);
1103  codegenAggCalls(agg_out_ptr_w_idx,
1104  varlen_output_buffer,
1105  {},
1106  query_mem_desc,
1107  co,
1108  gpu_smem_context,
1109  filter_cfg);
1110  }
1111  can_return_error = true;
1112  if (query_mem_desc.getQueryDescriptionType() ==
1113  QueryDescriptionType::Projection &&
1114  query_mem_desc.useStreamingTopN()) {
1115  // Ignore rejection on pushing current row to top-K heap.
1116  LL_BUILDER.CreateRet(LL_INT(int32_t(0)));
1117  } else {
1118  CodeGenerator code_generator(executor_);
1119  LL_BUILDER.CreateRet(LL_BUILDER.CreateNeg(LL_BUILDER.CreateTrunc(
1120  // TODO(alex): remove the trunc once pos is converted to 32 bits
1121  code_generator.posArg(nullptr),
1122  get_int_type(32, LL_CONTEXT))));
1123  }
1124  }
1125  } else {
1126  if (ra_exe_unit_.estimator) {
1127  std::stack<llvm::BasicBlock*> array_loops;
1128  codegenEstimator(array_loops, filter_cfg, query_mem_desc, co);
1129  } else {
1130  auto arg_it = ROW_FUNC->arg_begin();
1131  std::vector<llvm::Value*> agg_out_vec;
1132  for (int32_t i = 0; i < get_agg_count(ra_exe_unit_.target_exprs); ++i) {
1133  agg_out_vec.push_back(&*arg_it++);
1134  }
1135  can_return_error = codegenAggCalls(std::make_tuple(nullptr, nullptr),
1136  /*varlen_output_buffer=*/nullptr,
1137  agg_out_vec,
1138  query_mem_desc,
1139  co,
1140  gpu_smem_context,
1141  filter_cfg);
1142  }
1143  }
1144  }
1145 
1146  if (ra_exe_unit_.join_quals.empty()) {
1147  executor_->cgen_state_->ir_builder_.CreateRet(LL_INT(int32_t(0)));
1148  } else if (sc_false) {
1149  const auto saved_insert_block = LL_BUILDER.GetInsertBlock();
1150  LL_BUILDER.SetInsertPoint(sc_false);
1151  LL_BUILDER.CreateBr(filter_false);
1152  LL_BUILDER.SetInsertPoint(saved_insert_block);
1153  }
1154 
1155  return can_return_error;
1156 }
1157 
1158 llvm::Value* GroupByAndAggregate::codegenOutputSlot(
1159  llvm::Value* groups_buffer,
1160  const QueryMemoryDescriptor& query_mem_desc,
1161  const CompilationOptions& co,
1162  DiamondCodegen& diamond_codegen) {
1163  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1164  CHECK(query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection);
1165  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1166  const auto group_expr = ra_exe_unit_.groupby_exprs.front();
1167  CHECK(!group_expr);
1168  if (!query_mem_desc.didOutputColumnar()) {
1169  CHECK_EQ(size_t(0), query_mem_desc.getRowSize() % sizeof(int64_t));
1170  }
1171  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1172  ? 0
1173  : query_mem_desc.getRowSize() / sizeof(int64_t);
1174  CodeGenerator code_generator(executor_);
1175  if (query_mem_desc.useStreamingTopN()) {
1176  const auto& only_order_entry = ra_exe_unit_.sort_info.order_entries.front();
1177  CHECK_GE(only_order_entry.tle_no, int(1));
1178  const size_t target_idx = only_order_entry.tle_no - 1;
1179  CHECK_LT(target_idx, ra_exe_unit_.target_exprs.size());
1180  const auto order_entry_expr = ra_exe_unit_.target_exprs[target_idx];
1181  const auto chosen_bytes =
1182  static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(target_idx));
1183  auto order_entry_lv = executor_->cgen_state_->castToTypeIn(
1184  code_generator.codegen(order_entry_expr, true, co).front(), chosen_bytes * 8);
1185  const uint32_t n =
1187  std::string fname = "get_bin_from_k_heap";
1188  const auto& oe_ti = order_entry_expr->get_type_info();
1189  llvm::Value* null_key_lv = nullptr;
1190  if (oe_ti.is_integer() || oe_ti.is_decimal() || oe_ti.is_time()) {
1191  const size_t bit_width = order_entry_lv->getType()->getIntegerBitWidth();
1192  switch (bit_width) {
1193  case 32:
1194  null_key_lv = LL_INT(static_cast<int32_t>(inline_int_null_val(oe_ti)));
1195  break;
1196  case 64:
1197  null_key_lv = LL_INT(static_cast<int64_t>(inline_int_null_val(oe_ti)));
1198  break;
1199  default:
1200  CHECK(false);
1201  }
1202  fname += "_int" + std::to_string(bit_width) + "_t";
1203  } else {
1204  CHECK(oe_ti.is_fp());
1205  if (order_entry_lv->getType()->isDoubleTy()) {
1206  null_key_lv = LL_FP(static_cast<double>(inline_fp_null_val(oe_ti)));
1207  } else {
1208  null_key_lv = LL_FP(static_cast<float>(inline_fp_null_val(oe_ti)));
1209  }
1210  fname += order_entry_lv->getType()->isDoubleTy() ? "_double" : "_float";
1211  }
1212  const auto key_slot_idx =
1213  get_heap_key_slot_index(ra_exe_unit_.target_exprs, target_idx);
1214  return emitCall(
1215  fname,
1216  {groups_buffer,
1217  LL_INT(n),
1218  LL_INT(row_size_quad),
1219  LL_INT(static_cast<uint32_t>(query_mem_desc.getColOffInBytes(key_slot_idx))),
1220  LL_BOOL(only_order_entry.is_desc),
1221  LL_BOOL(!order_entry_expr->get_type_info().get_notnull()),
1222  LL_BOOL(only_order_entry.nulls_first),
1223  null_key_lv,
1224  order_entry_lv});
1225  } else {
1226  auto* arg = get_arg_by_name(ROW_FUNC, "max_matched");
1227  const auto output_buffer_entry_count_lv =
1228  LL_BUILDER.CreateLoad(arg->getType()->getPointerElementType(), arg);
1229  arg = get_arg_by_name(ROW_FUNC, "old_total_matched");
1230  const auto group_expr_lv =
1231  LL_BUILDER.CreateLoad(arg->getType()->getPointerElementType(), arg);
1232  std::vector<llvm::Value*> args{groups_buffer,
1233  output_buffer_entry_count_lv,
1234  group_expr_lv,
1235  code_generator.posArg(nullptr)};
1236  if (query_mem_desc.didOutputColumnar()) {
1237  const auto columnar_output_offset =
1238  emitCall("get_columnar_scan_output_offset", args);
1239  return columnar_output_offset;
1240  }
1241  args.push_back(LL_INT(row_size_quad));
1242  return emitCall("get_scan_output_slot", args);
1243  }
1244 }
1245 
1246 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenGroupBy(
1247  const QueryMemoryDescriptor& query_mem_desc,
1248  const CompilationOptions& co,
1249  DiamondCodegen& diamond_codegen) {
1250  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1251  auto arg_it = ROW_FUNC->arg_begin();
1252  auto groups_buffer = arg_it++;
1253 
1254  std::stack<llvm::BasicBlock*> array_loops;
1255 
1256  // TODO(Saman): move this logic outside of this function.
1257  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1258  if (query_mem_desc.didOutputColumnar()) {
1259  return std::make_tuple(
1260  &*groups_buffer,
1261  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen));
1262  } else {
1263  return std::make_tuple(
1264  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen),
1265  nullptr);
1266  }
1267  }
1268 
1269  CHECK(query_mem_desc.getQueryDescriptionType() ==
1270  QueryDescriptionType::GroupByBaselineHash ||
1271  query_mem_desc.getQueryDescriptionType() ==
1272  QueryDescriptionType::GroupByPerfectHash);
1273 
1274  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1275  ? 0
1276  : query_mem_desc.getRowSize() / sizeof(int64_t);
1277 
1278  const auto col_width_size = query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1279  ? sizeof(int64_t)
1280  : query_mem_desc.getEffectiveKeyWidth();
1281  // for multi-column group by
1282  llvm::Value* group_key = nullptr;
1283  llvm::Value* key_size_lv = nullptr;
1284 
1285  if (!query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1286  key_size_lv = LL_INT(static_cast<int32_t>(query_mem_desc.getGroupbyColCount()));
1287  if (query_mem_desc.getQueryDescriptionType() ==
1288  QueryDescriptionType::GroupByPerfectHash) {
1289  group_key =
1290  LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1291  } else if (query_mem_desc.getQueryDescriptionType() ==
1293  group_key =
1294  col_width_size == sizeof(int32_t)
1295  ? LL_BUILDER.CreateAlloca(llvm::Type::getInt32Ty(LL_CONTEXT), key_size_lv)
1296  : LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1297  }
1298  CHECK(group_key);
1299  CHECK(key_size_lv);
1300  }
1301 
1302  int32_t subkey_idx = 0;
1303  CHECK(query_mem_desc.getGroupbyColCount() == ra_exe_unit_.groupby_exprs.size());
1304  for (const auto& group_expr : ra_exe_unit_.groupby_exprs) {
1305  const auto col_range_info =
1306  get_expr_range_info(ra_exe_unit_, query_infos_, group_expr.get(), executor_);
1307  const auto translated_null_value = static_cast<int64_t>(
1308  query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1309  ? checked_int64_t(query_mem_desc.getMaxVal()) +
1310  (query_mem_desc.getBucket() ? query_mem_desc.getBucket() : 1)
1311  : checked_int64_t(col_range_info.max) +
1312  (col_range_info.bucket ? col_range_info.bucket : 1));
1313 
1314  const bool col_has_nulls =
1315  query_mem_desc.getQueryDescriptionType() ==
1316  QueryDescriptionType::GroupByPerfectHash
1317  ? (query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1318  ? query_mem_desc.hasNulls()
1319  : col_range_info.has_nulls)
1320  : false;
1321 
1322  const auto group_expr_lvs =
1323  executor_->groupByColumnCodegen(group_expr.get(),
1324  col_width_size,
1325  co,
1326  col_has_nulls,
1327  translated_null_value,
1328  diamond_codegen,
1329  array_loops,
1330  query_mem_desc.threadsShareMemory());
1331  const auto group_expr_lv = group_expr_lvs.translated_value;
1332  if (query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1333  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1334  return codegenSingleColumnPerfectHash(query_mem_desc,
1335  co,
1336  &*groups_buffer,
1337  group_expr_lv,
1338  group_expr_lvs.original_value,
1339  row_size_quad);
1340  } else {
1341  // store the sub-key to the buffer
1342  LL_BUILDER.CreateStore(
1343  group_expr_lv,
1344  LL_BUILDER.CreateGEP(
1345  group_key->getType()->getScalarType()->getPointerElementType(),
1346  group_key,
1347  LL_INT(subkey_idx++)));
1348  }
1349  }
1350  if (query_mem_desc.getQueryDescriptionType() ==
1351  QueryDescriptionType::GroupByPerfectHash) {
1352  CHECK(ra_exe_unit_.groupby_exprs.size() != 1);
1353  return codegenMultiColumnPerfectHash(
1354  &*groups_buffer, group_key, key_size_lv, query_mem_desc, row_size_quad);
1355  } else if (query_mem_desc.getQueryDescriptionType() ==
1356  QueryDescriptionType::GroupByBaselineHash) {
1357  return codegenMultiColumnBaselineHash(co,
1358  &*groups_buffer,
1359  group_key,
1360  key_size_lv,
1361  query_mem_desc,
1362  col_width_size,
1363  row_size_quad);
1364  }
1365  CHECK(false);
1366  return std::make_tuple(nullptr, nullptr);
1367 }
1368 
1369 llvm::Value* GroupByAndAggregate::codegenVarlenOutputBuffer(
1370  const QueryMemoryDescriptor& query_mem_desc) {
1371  if (!query_mem_desc.hasVarlenOutput()) {
1372  return nullptr;
1373  }
1374 
1375  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1376  auto arg_it = ROW_FUNC->arg_begin();
1377  arg_it++; /* groups_buffer */
1378  auto varlen_output_buffer = arg_it++;
1379  CHECK(varlen_output_buffer->getType() == llvm::Type::getInt64PtrTy(LL_CONTEXT));
1380  return varlen_output_buffer;
1381 }
1382 
1383 std::tuple<llvm::Value*, llvm::Value*>
1384 GroupByAndAggregate::codegenSingleColumnPerfectHash(
1385  const QueryMemoryDescriptor& query_mem_desc,
1386  const CompilationOptions& co,
1387  llvm::Value* groups_buffer,
1388  llvm::Value* group_expr_lv_translated,
1389  llvm::Value* group_expr_lv_original,
1390  const int32_t row_size_quad) {
1391  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1392  CHECK(query_mem_desc.usesGetGroupValueFast());
1393  std::string get_group_fn_name{query_mem_desc.didOutputColumnar()
1394  ? "get_columnar_group_bin_offset"
1395  : "get_group_value_fast"};
1396  if (!query_mem_desc.didOutputColumnar() && query_mem_desc.hasKeylessHash()) {
1397  get_group_fn_name += "_keyless";
1398  }
1399  if (query_mem_desc.interleavedBins(co.device_type)) {
1400  CHECK(!query_mem_desc.didOutputColumnar());
1401  CHECK(query_mem_desc.hasKeylessHash());
1402  get_group_fn_name += "_semiprivate";
1403  }
1404  std::vector<llvm::Value*> get_group_fn_args{&*groups_buffer,
1405  &*group_expr_lv_translated};
1406  if (group_expr_lv_original && get_group_fn_name == "get_group_value_fast" &&
1407  query_mem_desc.mustUseBaselineSort()) {
1408  get_group_fn_name += "_with_original_key";
1409  get_group_fn_args.push_back(group_expr_lv_original);
1410  }
1411  get_group_fn_args.push_back(LL_INT(query_mem_desc.getMinVal()));
1412  get_group_fn_args.push_back(LL_INT(query_mem_desc.getBucket()));
1413  if (!query_mem_desc.hasKeylessHash()) {
1414  if (!query_mem_desc.didOutputColumnar()) {
1415  get_group_fn_args.push_back(LL_INT(row_size_quad));
1416  }
1417  } else {
1418  if (!query_mem_desc.didOutputColumnar()) {
1419  get_group_fn_args.push_back(LL_INT(row_size_quad));
1420  }
1421  if (query_mem_desc.interleavedBins(co.device_type)) {
1422  auto warp_idx = emitCall("thread_warp_idx", {LL_INT(executor_->warpSize())});
1423  get_group_fn_args.push_back(warp_idx);
1424  get_group_fn_args.push_back(LL_INT(executor_->warpSize()));
1425  }
1426  }
1427  if (get_group_fn_name == "get_columnar_group_bin_offset") {
1428  return std::make_tuple(&*groups_buffer,
1429  emitCall(get_group_fn_name, get_group_fn_args));
1430  }
1431  return std::make_tuple(emitCall(get_group_fn_name, get_group_fn_args), nullptr);
1432 }
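// Simplified sketch (editorial): for the row-wise perfect-hash layout,
// get_group_value_fast resolves the output row roughly as
//   bin = (key - min_val) / (bucket ? bucket : 1);
//   row = groups_buffer + bin * row_size_quad;
// so no probing or collision handling is required on this path.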
1433 
1434 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenMultiColumnPerfectHash(
1435  llvm::Value* groups_buffer,
1436  llvm::Value* group_key,
1437  llvm::Value* key_size_lv,
1438  const QueryMemoryDescriptor& query_mem_desc,
1439  const int32_t row_size_quad) {
1440  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1441  CHECK(query_mem_desc.getQueryDescriptionType() ==
1442  QueryDescriptionType::GroupByPerfectHash);
1443  // compute the index (perfect hash)
1444  auto perfect_hash_func = codegenPerfectHashFunction();
1445  auto hash_lv =
1446  LL_BUILDER.CreateCall(perfect_hash_func, std::vector<llvm::Value*>{group_key});
1447 
1448  if (query_mem_desc.didOutputColumnar()) {
1449  if (!query_mem_desc.hasKeylessHash()) {
1450  const std::string set_matching_func_name{
1451  "set_matching_group_value_perfect_hash_columnar"};
1452  const std::vector<llvm::Value*> set_matching_func_arg{
1453  groups_buffer,
1454  hash_lv,
1455  group_key,
1456  key_size_lv,
1457  llvm::ConstantInt::get(get_int_type(32, LL_CONTEXT),
1458  query_mem_desc.getEntryCount())};
1459  emitCall(set_matching_func_name, set_matching_func_arg);
1460  }
1461  return std::make_tuple(groups_buffer, hash_lv);
1462  } else {
1463  if (query_mem_desc.hasKeylessHash()) {
1464  return std::make_tuple(emitCall("get_matching_group_value_perfect_hash_keyless",
1465  {groups_buffer, hash_lv, LL_INT(row_size_quad)}),
1466  nullptr);
1467  } else {
1468  return std::make_tuple(
1469  emitCall(
1470  "get_matching_group_value_perfect_hash",
1471  {groups_buffer, hash_lv, group_key, key_size_lv, LL_INT(row_size_quad)}),
1472  nullptr);
1473  }
1474  }
1475 }
1476 
1477 std::tuple<llvm::Value*, llvm::Value*>
1478 GroupByAndAggregate::codegenMultiColumnBaselineHash(
1479  const CompilationOptions& co,
1480  llvm::Value* groups_buffer,
1481  llvm::Value* group_key,
1482  llvm::Value* key_size_lv,
1483  const QueryMemoryDescriptor& query_mem_desc,
1484  const size_t key_width,
1485  const int32_t row_size_quad) {
1486  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1487  if (group_key->getType() != llvm::Type::getInt64PtrTy(LL_CONTEXT)) {
1488  CHECK(key_width == sizeof(int32_t));
1489  group_key =
1490  LL_BUILDER.CreatePointerCast(group_key, llvm::Type::getInt64PtrTy(LL_CONTEXT));
1491  }
1492  std::vector<llvm::Value*> func_args{
1493  groups_buffer,
1494  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1495  &*group_key,
1496  &*key_size_lv,
1497  LL_INT(static_cast<int32_t>(key_width))};
1498  std::string func_name{"get_group_value"};
1499  if (query_mem_desc.didOutputColumnar()) {
1500  func_name += "_columnar_slot";
1501  } else {
1502  func_args.push_back(LL_INT(row_size_quad));
1503  }
1504  if (co.with_dynamic_watchdog) {
1505  func_name += "_with_watchdog";
1506  }
1507  if (query_mem_desc.didOutputColumnar()) {
1508  return std::make_tuple(groups_buffer, emitCall(func_name, func_args));
1509  } else {
1510  return std::make_tuple(emitCall(func_name, func_args), nullptr);
1511  }
1512 }
1513 
1514 llvm::Function* GroupByAndAggregate::codegenPerfectHashFunction() {
1515  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1516  CHECK_GT(ra_exe_unit_.groupby_exprs.size(), size_t(1));
1517  auto ft = llvm::FunctionType::get(
1518  get_int_type(32, LL_CONTEXT),
1519  std::vector<llvm::Type*>{llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)},
1520  false);
1521  auto key_hash_func = llvm::Function::Create(ft,
1522  llvm::Function::ExternalLinkage,
1523  "perfect_key_hash",
1524  executor_->cgen_state_->module_);
1525  executor_->cgen_state_->helper_functions_.push_back(key_hash_func);
1526  mark_function_always_inline(key_hash_func);
1527  auto& key_buff_arg = *key_hash_func->args().begin();
1528  llvm::Value* key_buff_lv = &key_buff_arg;
1529  auto bb = llvm::BasicBlock::Create(LL_CONTEXT, "entry", key_hash_func);
1530  llvm::IRBuilder<> key_hash_func_builder(bb);
1531  llvm::Value* hash_lv{llvm::ConstantInt::get(get_int_type(64, LL_CONTEXT), 0)};
1532  std::vector<int64_t> cardinalities;
1533  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1534  auto col_range_info =
1535  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1536  CHECK(col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash);
1537  cardinalities.push_back(getBucketedCardinality(col_range_info));
1538  }
1539  size_t dim_idx = 0;
1540  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1541  auto* gep = key_hash_func_builder.CreateGEP(
1542  key_buff_lv->getType()->getScalarType()->getPointerElementType(),
1543  key_buff_lv,
1544  LL_INT(dim_idx));
1545  auto key_comp_lv =
1546  key_hash_func_builder.CreateLoad(gep->getType()->getPointerElementType(), gep);
1547  auto col_range_info =
1548  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1549  auto crt_term_lv =
1550  key_hash_func_builder.CreateSub(key_comp_lv, LL_INT(col_range_info.min));
1551  if (col_range_info.bucket) {
1552  crt_term_lv =
1553  key_hash_func_builder.CreateSDiv(crt_term_lv, LL_INT(col_range_info.bucket));
1554  }
1555  for (size_t prev_dim_idx = 0; prev_dim_idx < dim_idx; ++prev_dim_idx) {
1556  crt_term_lv = key_hash_func_builder.CreateMul(crt_term_lv,
1557  LL_INT(cardinalities[prev_dim_idx]));
1558  }
1559  hash_lv = key_hash_func_builder.CreateAdd(hash_lv, crt_term_lv);
1560  ++dim_idx;
1561  }
1562  key_hash_func_builder.CreateRet(
1563  key_hash_func_builder.CreateTrunc(hash_lv, get_int_type(32, LL_CONTEXT)));
1564  return key_hash_func;
1565 }
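// Worked example (editorial): for GROUP BY a, b with a in [0, 9] (cardinality 10)
// and b in [5, 14] (cardinality 10), the generated perfect_key_hash computes
//   hash = (a - 0) + (b - 5) * 10
// i.e. each dimension's term is scaled by the product of the cardinalities of all
// preceding dimensions.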
1566 
1567 llvm::Value* GroupByAndAggregate::convertNullIfAny(const SQLTypeInfo& arg_type,
1568  const TargetInfo& agg_info,
1569  llvm::Value* target) {
1570  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1571  const auto& agg_type = agg_info.sql_type;
1572  const size_t chosen_bytes = agg_type.get_size();
1573 
1574  bool need_conversion{false};
1575  llvm::Value* arg_null{nullptr};
1576  llvm::Value* agg_null{nullptr};
1577  llvm::Value* target_to_cast{target};
1578  if (arg_type.is_fp()) {
1579  arg_null = executor_->cgen_state_->inlineFpNull(arg_type);
1580  if (agg_type.is_fp()) {
1581  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1582  if (!static_cast<llvm::ConstantFP*>(arg_null)->isExactlyValue(
1583  static_cast<llvm::ConstantFP*>(agg_null)->getValueAPF())) {
1584  need_conversion = true;
1585  }
1586  } else {
1587  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
1588  return target;
1589  }
1590  } else {
1591  arg_null = executor_->cgen_state_->inlineIntNull(arg_type);
1592  if (agg_type.is_fp()) {
1593  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1594  need_conversion = true;
1595  target_to_cast = executor_->castToFP(target, arg_type, agg_type);
1596  } else {
1597  agg_null = executor_->cgen_state_->inlineIntNull(agg_type);
1598  if ((static_cast<llvm::ConstantInt*>(arg_null)->getBitWidth() !=
1599  static_cast<llvm::ConstantInt*>(agg_null)->getBitWidth()) ||
1600  (static_cast<llvm::ConstantInt*>(arg_null)->getValue() !=
1601  static_cast<llvm::ConstantInt*>(agg_null)->getValue())) {
1602  need_conversion = true;
1603  }
1604  }
1605  }
1606  if (need_conversion) {
1607  auto cmp = arg_type.is_fp() ? LL_BUILDER.CreateFCmpOEQ(target, arg_null)
1608  : LL_BUILDER.CreateICmpEQ(target, arg_null);
1609  return LL_BUILDER.CreateSelect(
1610  cmp,
1611  agg_null,
1612  executor_->cgen_state_->castToTypeIn(target_to_cast, chosen_bytes << 3));
1613  } else {
1614  return target;
1615  }
1616 }
1617 
1618 llvm::Value* GroupByAndAggregate::codegenWindowRowPointer(
1619  const Analyzer::WindowFunction* window_func,
1620  const QueryMemoryDescriptor& query_mem_desc,
1621  const CompilationOptions& co,
1622  DiamondCodegen& diamond_codegen) {
1623  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1624  const auto window_func_context =
1625  WindowProjectNodeContext::getActiveWindowFunctionContext(executor_);
1626  if (window_func_context && window_function_is_aggregate(window_func->getKind())) {
1627  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1628  ? 0
1629  : query_mem_desc.getRowSize() / sizeof(int64_t);
1630  auto arg_it = ROW_FUNC->arg_begin();
1631  auto groups_buffer = arg_it++;
1632  CodeGenerator code_generator(executor_);
1633  auto window_pos_lv = code_generator.codegenWindowPosition(
1634  window_func_context, code_generator.posArg(nullptr));
1635  const auto pos_in_window =
1636  LL_BUILDER.CreateTrunc(window_pos_lv, get_int_type(32, LL_CONTEXT));
1637  llvm::Value* entry_count_lv =
1638  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount()));
1639  std::vector<llvm::Value*> args{
1640  &*groups_buffer, entry_count_lv, pos_in_window, code_generator.posArg(nullptr)};
1641  if (query_mem_desc.didOutputColumnar()) {
1642  const auto columnar_output_offset =
1643  emitCall("get_columnar_scan_output_offset", args);
1644  return LL_BUILDER.CreateSExt(columnar_output_offset, get_int_type(64, LL_CONTEXT));
1645  }
1646  args.push_back(LL_INT(row_size_quad));
1647  return emitCall("get_scan_output_slot", args);
1648  }
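  // Window functions that are not aggregates fall through to the regular
  // projection output slot below.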
1649  auto arg_it = ROW_FUNC->arg_begin();
1650  auto groups_buffer = arg_it++;
1651  return codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen);
1652 }
1653 
1654 bool GroupByAndAggregate::codegenAggCalls(
1655  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx_in,
1656  llvm::Value* varlen_output_buffer,
1657  const std::vector<llvm::Value*>& agg_out_vec,
1658  QueryMemoryDescriptor& query_mem_desc,
1659  const CompilationOptions& co,
1660  const GpuSharedMemoryContext& gpu_smem_context,
1661  DiamondCodegen& diamond_codegen) {
1662  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1663  auto agg_out_ptr_w_idx = agg_out_ptr_w_idx_in;
1664  // TODO(alex): unify the two cases, the output for non-group by queries
1665  // should be a contiguous buffer
1666  const bool is_group_by = std::get<0>(agg_out_ptr_w_idx);
1667  bool can_return_error = false;
1668  if (is_group_by) {
1669  CHECK(agg_out_vec.empty());
1670  } else {
1671  CHECK(!agg_out_vec.empty());
1672  }
1673 
1674  // the output buffer is cast into a byte stream to be able to handle data elements
1675  // of different sizes (only used when actual column width sizes are used)
1676  llvm::Value* output_buffer_byte_stream{nullptr};
1677  llvm::Value* out_row_idx{nullptr};
1678  if (query_mem_desc.didOutputColumnar() && !g_cluster &&
1679  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1680  output_buffer_byte_stream = LL_BUILDER.CreateBitCast(
1681  std::get<0>(agg_out_ptr_w_idx),
1682  llvm::PointerType::get(llvm::Type::getInt8Ty(LL_CONTEXT), 0));
1683  output_buffer_byte_stream->setName("out_buff_b_stream");
1684  CHECK(std::get<1>(agg_out_ptr_w_idx));
1685  out_row_idx = LL_BUILDER.CreateZExt(std::get<1>(agg_out_ptr_w_idx),
1686  llvm::Type::getInt64Ty(LL_CONTEXT));
1687  out_row_idx->setName("out_row_idx");
1688  }
1689 
1690  TargetExprCodegenBuilder target_builder(ra_exe_unit_, is_group_by);
1691  for (size_t target_idx = 0; target_idx < ra_exe_unit_.target_exprs.size();
1692  ++target_idx) {
1693  auto target_expr = ra_exe_unit_.target_exprs[target_idx];
1694  CHECK(target_expr);
1695 
1696  target_builder(target_expr, executor_, query_mem_desc, co);
1697  }
1698 
1699  target_builder.codegen(this,
1700  executor_,
1701  query_mem_desc,
1702  co,
1703  gpu_smem_context,
1704  agg_out_ptr_w_idx,
1705  agg_out_vec,
1706  output_buffer_byte_stream,
1707  out_row_idx,
1708  varlen_output_buffer,
1709  diamond_codegen);
1710 
1711  return can_return_error;
1712 }
1713 
1714 /**
1715  * @brief: returns the pointer to where the aggregation should be stored.
1716  */
1717 llvm::Value* GroupByAndAggregate::codegenAggColumnPtr(
1718  llvm::Value* output_buffer_byte_stream,
1719  llvm::Value* out_row_idx,
1720  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx,
1721  const QueryMemoryDescriptor& query_mem_desc,
1722  const size_t chosen_bytes,
1723  const size_t agg_out_off,
1724  const size_t target_idx) {
1725  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1726  llvm::Value* agg_col_ptr{nullptr};
1727  if (query_mem_desc.didOutputColumnar()) {
1728  // TODO(Saman): remove the second columnar branch, and support all query description
1729  // types through the first branch. Then, input arguments should also be cleaned up
1730  if (!g_cluster &&
1731  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1732  CHECK(chosen_bytes == 1 || chosen_bytes == 2 || chosen_bytes == 4 ||
1733  chosen_bytes == 8);
1734  CHECK(output_buffer_byte_stream);
1735  CHECK(out_row_idx);
1736  size_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1737  // multiplying by chosen_bytes, i.e., << log2(chosen_bytes)
1738  auto out_per_col_byte_idx =
1739 #ifdef _WIN32
1740  LL_BUILDER.CreateShl(out_row_idx, __lzcnt(chosen_bytes) - 1);
1741 #else
1742  LL_BUILDER.CreateShl(out_row_idx, __builtin_ffs(chosen_bytes) - 1);
1743 #endif
1744  auto byte_offset = LL_BUILDER.CreateAdd(out_per_col_byte_idx,
1745  LL_INT(static_cast<int64_t>(col_off)));
1746  byte_offset->setName("out_byte_off_target_" + std::to_string(target_idx));
1747  auto output_ptr = LL_BUILDER.CreateGEP(
1748  output_buffer_byte_stream->getType()->getScalarType()->getPointerElementType(),
1749  output_buffer_byte_stream,
1750  byte_offset);
1751  agg_col_ptr = LL_BUILDER.CreateBitCast(
1752  output_ptr,
1753  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1754  agg_col_ptr->setName("out_ptr_target_" + std::to_string(target_idx));
1755  } else {
1756  auto const col_off_in_bytes = query_mem_desc.getColOffInBytes(agg_out_off);
1757  auto const col_off = col_off_in_bytes / chosen_bytes;
1758  auto const col_rem = col_off_in_bytes % chosen_bytes;
1759  CHECK_EQ(col_rem, 0u) << col_off_in_bytes << " % " << chosen_bytes;
1760  CHECK(std::get<1>(agg_out_ptr_w_idx));
1761  auto* agg_out_idx = LL_BUILDER.CreateZExt(
1762  std::get<1>(agg_out_ptr_w_idx),
1763  get_int_type(8 * sizeof(col_off), executor_->cgen_state_->context_));
1764  auto* offset = LL_BUILDER.CreateAdd(agg_out_idx, LL_INT(col_off));
1765  auto* bit_cast = LL_BUILDER.CreateBitCast(
1766  std::get<0>(agg_out_ptr_w_idx),
1767  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1768  agg_col_ptr = LL_BUILDER.CreateGEP(
1769  bit_cast->getType()->getScalarType()->getPointerElementType(),
1770  bit_cast,
1771  offset);
1772  }
1773  } else {
1774  auto const col_off_in_bytes = query_mem_desc.getColOnlyOffInBytes(agg_out_off);
1775  auto const col_off = col_off_in_bytes / chosen_bytes;
1776  auto const col_rem = col_off_in_bytes % chosen_bytes;
1777  CHECK_EQ(col_rem, 0u) << col_off_in_bytes << " % " << chosen_bytes;
1778  auto* bit_cast = LL_BUILDER.CreateBitCast(
1779  std::get<0>(agg_out_ptr_w_idx),
1780  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1781  agg_col_ptr = LL_BUILDER.CreateGEP(
1782  bit_cast->getType()->getScalarType()->getPointerElementType(),
1783  bit_cast,
1784  LL_INT(col_off));
1785  }
1786  CHECK(agg_col_ptr);
1787  return agg_col_ptr;
1788 }
1789 
1790 void GroupByAndAggregate::codegenEstimator(std::stack<llvm::BasicBlock*>& array_loops,
1791  DiamondCodegen& diamond_codegen,
1792  const QueryMemoryDescriptor& query_mem_desc,
1793  const CompilationOptions& co) {
1794  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1795  const auto& estimator_arg = ra_exe_unit_.estimator->getArgument();
1796  auto estimator_comp_count_lv = LL_INT(static_cast<int32_t>(estimator_arg.size()));
1797  auto estimator_key_lv = LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT),
1798  estimator_comp_count_lv);
1799  int32_t subkey_idx = 0;
1800  for (const auto& estimator_arg_comp : estimator_arg) {
1801  const auto estimator_arg_comp_lvs =
1802  executor_->groupByColumnCodegen(estimator_arg_comp.get(),
1803  query_mem_desc.getEffectiveKeyWidth(),
1804  co,
1805  false,
1806  0,
1807  diamond_codegen,
1808  array_loops,
1809  true);
1810  CHECK(!estimator_arg_comp_lvs.original_value);
1811  const auto estimator_arg_comp_lv = estimator_arg_comp_lvs.translated_value;
1812  // store the sub-key to the buffer
1813  LL_BUILDER.CreateStore(
1814  estimator_arg_comp_lv,
1815  LL_BUILDER.CreateGEP(
1816  estimator_key_lv->getType()->getScalarType()->getPointerElementType(),
1817  estimator_key_lv,
1818  LL_INT(subkey_idx++)));
1819  }
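  // The packed 64-bit key components and the estimator buffer, both reinterpreted
  // as raw bytes, are handed to the estimator's runtime function below.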
1820  const auto int8_ptr_ty = llvm::PointerType::get(get_int_type(8, LL_CONTEXT), 0);
1821  const auto bitmap = LL_BUILDER.CreateBitCast(&*ROW_FUNC->arg_begin(), int8_ptr_ty);
1822  const auto key_bytes = LL_BUILDER.CreateBitCast(estimator_key_lv, int8_ptr_ty);
1823  const auto estimator_comp_bytes_lv =
1824  LL_INT(static_cast<int32_t>(estimator_arg.size() * sizeof(int64_t)));
1825  const auto bitmap_size_lv =
1826  LL_INT(static_cast<uint32_t>(ra_exe_unit_.estimator->getBufferSize()));
1827  emitCall(ra_exe_unit_.estimator->getRuntimeFunctionName(),
1828  {bitmap, &*bitmap_size_lv, key_bytes, &*estimator_comp_bytes_lv});
1829 }
1830 
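// CPU-side runtime implementations for the set-based COUNT(DISTINCT), approximate
// quantile (t-digest) and MODE aggregates; codegenCountDistinct, codegenApproxQuantile
// and codegenMode below emit external calls to them.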
1831 extern "C" RUNTIME_EXPORT void agg_count_distinct(int64_t* agg, const int64_t val) {
1832  reinterpret_cast<CountDistinctSet*>(*agg)->insert(val);
1833 }
1834 
1835 extern "C" RUNTIME_EXPORT void agg_count_distinct_skip_val(int64_t* agg,
1836  const int64_t val,
1837  const int64_t skip_val) {
1838  if (val != skip_val) {
1839  agg_count_distinct(agg, val);
1840  }
1841 }
1842 
1843 extern "C" RUNTIME_EXPORT void agg_approx_quantile(int64_t* agg, const double val) {
1844  auto* t_digest = reinterpret_cast<quantile::TDigest*>(*agg);
1845  t_digest->allocate();
1846  t_digest->add(val);
1847 }
1848 
1849 extern "C" RUNTIME_EXPORT void agg_mode_func(int64_t* agg, const int64_t val) {
1850  auto* mode_map = reinterpret_cast<AggMode*>(*agg);
1851  mode_map->add(val);
1852 }
1853 
1854 void GroupByAndAggregate::codegenCountDistinct(
1855  const size_t target_idx,
1856  const Analyzer::Expr* target_expr,
1857  std::vector<llvm::Value*>& agg_args,
1858  const QueryMemoryDescriptor& query_mem_desc,
1859  const ExecutorDeviceType device_type) {
1860  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1861  const auto agg_info = get_target_info(target_expr, g_bigint_count);
1862  const auto& arg_ti =
1863  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1864  if (arg_ti.is_fp()) {
1865  agg_args.back() = executor_->cgen_state_->ir_builder_.CreateBitCast(
1866  agg_args.back(), get_int_type(64, executor_->cgen_state_->context_));
1867  }
1868  const auto& count_distinct_descriptor =
1869  query_mem_desc.getCountDistinctDescriptor(target_idx);
1870  CHECK(count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid);
1871  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
1872  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1873  agg_args.push_back(LL_INT(int32_t(count_distinct_descriptor.bitmap_sz_bits)));
1874  if (device_type == ExecutorDeviceType::GPU) {
1875  const auto base_dev_addr = getAdditionalLiteral(-1);
1876  const auto base_host_addr = getAdditionalLiteral(-2);
1877  agg_args.push_back(base_dev_addr);
1878  agg_args.push_back(base_host_addr);
1879  emitCall("agg_approximate_count_distinct_gpu", agg_args);
1880  } else {
1881  emitCall("agg_approximate_count_distinct", agg_args);
1882  }
1883  return;
1884  }
1885  std::string agg_fname{"agg_count_distinct"};
1886  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1887  agg_fname += "_bitmap";
1888  agg_args.push_back(LL_INT(static_cast<int64_t>(count_distinct_descriptor.min_val)));
1889  }
1890  if (agg_info.skip_null_val) {
1891  auto null_lv = executor_->cgen_state_->castToTypeIn(
1892  (arg_ti.is_fp()
1893  ? static_cast<llvm::Value*>(executor_->cgen_state_->inlineFpNull(arg_ti))
1894  : static_cast<llvm::Value*>(executor_->cgen_state_->inlineIntNull(arg_ti))),
1895  64);
1896  null_lv = executor_->cgen_state_->ir_builder_.CreateBitCast(
1897  null_lv, get_int_type(64, executor_->cgen_state_->context_));
1898  agg_fname += "_skip_val";
1899  agg_args.push_back(null_lv);
1900  }
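  // agg_fname now names the specific runtime entry point, e.g.
  // "agg_count_distinct_bitmap_skip_val" for a nullable bitmap implementation
  // (with a further "_gpu" suffix appended below for GPU execution).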
1901  if (device_type == ExecutorDeviceType::GPU) {
1902  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1903  agg_fname += "_gpu";
1904  const auto base_dev_addr = getAdditionalLiteral(-1);
1905  const auto base_host_addr = getAdditionalLiteral(-2);
1906  agg_args.push_back(base_dev_addr);
1907  agg_args.push_back(base_host_addr);
1908  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.sub_bitmap_count)));
1909  CHECK_EQ(size_t(0),
1910  count_distinct_descriptor.bitmapPaddedSizeBytes() %
1911  count_distinct_descriptor.sub_bitmap_count);
1912  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.bitmapPaddedSizeBytes() /
1913  count_distinct_descriptor.sub_bitmap_count)));
1914  }
1915  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1916  emitCall(agg_fname, agg_args);
1917  } else {
1918  executor_->cgen_state_->emitExternalCall(
1919  agg_fname, llvm::Type::getVoidTy(LL_CONTEXT), agg_args);
1920  }
1921 }
1922 
1923 void GroupByAndAggregate::codegenApproxQuantile(
1924  const size_t target_idx,
1925  const Analyzer::Expr* target_expr,
1926  std::vector<llvm::Value*>& agg_args,
1927  const QueryMemoryDescriptor& query_mem_desc,
1928  const ExecutorDeviceType device_type) {
1929  if (device_type == ExecutorDeviceType::GPU) {
1930  throw QueryMustRunOnCpu();
1931  }
1932  llvm::BasicBlock *calc, *skip{nullptr};
1933  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1934  auto const arg_ti =
1935  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1936  bool const nullable = !arg_ti.get_notnull();
1937 
1938  auto* cs = executor_->cgen_state_.get();
1939  auto& irb = cs->ir_builder_;
1940  if (nullable) {
1941  auto* const null_value = cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
1942  auto* const skip_cond = arg_ti.is_fp()
1943  ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
1944  : irb.CreateICmpEQ(agg_args.back(), null_value);
1945  calc = llvm::BasicBlock::Create(cs->context_, "calc_approx_quantile");
1946  skip = llvm::BasicBlock::Create(cs->context_, "skip_approx_quantile");
1947  irb.CreateCondBr(skip_cond, skip, calc);
1948  cs->current_func_->getBasicBlockList().push_back(calc);
1949  irb.SetInsertPoint(calc);
1950  }
1951  if (!arg_ti.is_fp()) {
1952  auto const agg_info = get_target_info(target_expr, g_bigint_count);
1953  agg_args.back() = executor_->castToFP(agg_args.back(), arg_ti, agg_info.sql_type);
1954  }
1955  cs->emitExternalCall(
1956  "agg_approx_quantile", llvm::Type::getVoidTy(cs->context_), agg_args);
1957  if (nullable) {
1958  irb.CreateBr(skip);
1959  cs->current_func_->getBasicBlockList().push_back(skip);
1960  irb.SetInsertPoint(skip);
1961  }
1962 }
1963 
1964 void GroupByAndAggregate::codegenMode(const size_t target_idx,
1965  const Analyzer::Expr* target_expr,
1966  std::vector<llvm::Value*>& agg_args,
1967  const QueryMemoryDescriptor& query_mem_desc,
1968  const ExecutorDeviceType device_type) {
1969  if (device_type == ExecutorDeviceType::GPU) {
1970  throw QueryMustRunOnCpu();
1971  }
1972  llvm::BasicBlock *calc, *skip{nullptr};
1973  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1974  auto const arg_ti =
1975  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1976  bool const nullable = !arg_ti.get_notnull();
1977  bool const is_fp = arg_ti.is_fp();
1978  auto* cs = executor_->cgen_state_.get();
1979  auto& irb = cs->ir_builder_;
1980  if (nullable) {
1981  auto* const null_value =
1982  is_fp ? cs->inlineNull(arg_ti) : cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
1983  auto* const skip_cond = is_fp ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
1984  : irb.CreateICmpEQ(agg_args.back(), null_value);
1985  calc = llvm::BasicBlock::Create(cs->context_, "calc_mode");
1986  skip = llvm::BasicBlock::Create(cs->context_, "skip_mode");
1987  irb.CreateCondBr(skip_cond, skip, calc);
1988  cs->current_func_->getBasicBlockList().push_back(calc);
1989  irb.SetInsertPoint(calc);
1990  }
1991  if (is_fp) {
1992  auto* const int_type = get_int_type(8 * arg_ti.get_size(), cs->context_);
1993  agg_args.back() = irb.CreateBitCast(agg_args.back(), int_type);
1994  }
1995  // "agg_mode" collides with existing names, so non-standard suffix "_func" is added.
1996  cs->emitExternalCall("agg_mode_func", llvm::Type::getVoidTy(cs->context_), agg_args);
1997  if (nullable) {
1998  irb.CreateBr(skip);
1999  cs->current_func_->getBasicBlockList().push_back(skip);
2000  irb.SetInsertPoint(skip);
2001  }
2002 }
2003 
2004 llvm::Value* GroupByAndAggregate::getAdditionalLiteral(const int32_t off) {
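  // Additional literals (e.g. the count-distinct bitmap device/host base addresses
  // fetched with offsets -1 and -2 above) occupy 64-bit slots at negative offsets
  // relative to the regular literal buffer.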
2005  CHECK_LT(off, 0);
2006  const auto lit_buff_lv = get_arg_by_name(ROW_FUNC, "literals");
2007  auto* bit_cast = LL_BUILDER.CreateBitCast(
2008  lit_buff_lv, llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0));
2009  auto* gep =
2010  LL_BUILDER.CreateGEP(bit_cast->getType()->getScalarType()->getPointerElementType(),
2011  bit_cast,
2012  LL_INT(off));
2013  return LL_BUILDER.CreateLoad(gep->getType()->getPointerElementType(), gep);
2014 }
2015 
2016 std::vector<llvm::Value*> GroupByAndAggregate::codegenAggArg(
2017  const Analyzer::Expr* target_expr,
2018  const CompilationOptions& co) {
2019  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2020  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
2021  const auto func_expr = dynamic_cast<const Analyzer::FunctionOper*>(target_expr);
2022  const auto arr_expr = dynamic_cast<const Analyzer::ArrayExpr*>(target_expr);
2023 
2024  // TODO(alex): handle arrays uniformly?
2025  CodeGenerator code_generator(executor_);
2026  if (target_expr) {
2027  const auto& target_ti = target_expr->get_type_info();
2028  if (target_ti.is_buffer() &&
2029  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
2030  const auto target_lvs =
2031  agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
2032  : code_generator.codegen(
2033  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
2034  if (!func_expr && !arr_expr) {
2035  // Values reaching this branch come through the chunk transport, i.e. they were
2036  // generated from a source other than an ARRAY[] expression
2037  if (target_ti.is_text_encoding_none()) {
2038  CHECK_EQ(size_t(3), target_lvs.size());
2039  return {target_lvs[1], target_lvs[2]};
2040  }
2041  CHECK(target_ti.is_array());
2042  CHECK_EQ(size_t(1), target_lvs.size());
2043  CHECK(!agg_expr || agg_expr->get_aggtype() == kSAMPLE);
2044  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
2045  const auto i8p_ty =
2046  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
2047  const auto& elem_ti = target_ti.get_elem_type();
2048  return {
2049  executor_->cgen_state_->emitExternalCall(
2050  "array_buff",
2051  i8p_ty,
2052  {target_lvs.front(), code_generator.posArg(target_expr)}),
2053  executor_->cgen_state_->emitExternalCall(
2054  "array_size",
2055  i32_ty,
2056  {target_lvs.front(),
2057  code_generator.posArg(target_expr),
2058  executor_->cgen_state_->llInt(log2_bytes(elem_ti.get_logical_size()))})};
2059  } else {
2060  if (agg_expr) {
2061  throw std::runtime_error(
2062  "Using array[] operator as argument to an aggregate operator is not "
2063  "supported");
2064  }
2065  CHECK(func_expr || arr_expr);
2066  if (dynamic_cast<const Analyzer::FunctionOper*>(target_expr)) {
2067  CHECK_EQ(size_t(1), target_lvs.size());
2068  const auto prefix = target_ti.get_buffer_name();
2069  CHECK(target_ti.is_array() || target_ti.is_text_encoding_none());
2070  const auto target_lv = LL_BUILDER.CreateLoad(
2071  target_lvs[0]->getType()->getPointerElementType(), target_lvs[0]);
2072  // const auto target_lv_type = target_lvs[0]->getType();
2073  // CHECK(target_lv_type->isStructTy());
2074  // CHECK_EQ(target_lv_type->getNumContainedTypes(), 3u);
2075  const auto i8p_ty = llvm::PointerType::get(
2076  get_int_type(8, executor_->cgen_state_->context_), 0);
2077  const auto ptr = LL_BUILDER.CreatePointerCast(
2078  LL_BUILDER.CreateExtractValue(target_lv, 0), i8p_ty);
2079  const auto size = LL_BUILDER.CreateExtractValue(target_lv, 1);
2080  const auto null_flag = LL_BUILDER.CreateExtractValue(target_lv, 2);
2081  const auto nullcheck_ok_bb =
2082  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_nullcheck_ok_bb", CUR_FUNC);
2083  const auto nullcheck_fail_bb = llvm::BasicBlock::Create(
2084  LL_CONTEXT, prefix + "_nullcheck_fail_bb", CUR_FUNC);
2085 
2086  // TODO(adb): probably better to zext the bool
2087  const auto nullcheck = LL_BUILDER.CreateICmpEQ(
2088  null_flag, executor_->cgen_state_->llInt(static_cast<int8_t>(1)));
2089  LL_BUILDER.CreateCondBr(nullcheck, nullcheck_fail_bb, nullcheck_ok_bb);
2090 
2091  const auto ret_bb =
2092  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_return", CUR_FUNC);
2093  LL_BUILDER.SetInsertPoint(ret_bb);
2094  auto result_phi = LL_BUILDER.CreatePHI(i8p_ty, 2, prefix + "_ptr_return");
2095  result_phi->addIncoming(ptr, nullcheck_ok_bb);
2096  const auto null_arr_sentinel = LL_BUILDER.CreateIntToPtr(
2097  executor_->cgen_state_->llInt(static_cast<int8_t>(0)), i8p_ty);
2098  result_phi->addIncoming(null_arr_sentinel, nullcheck_fail_bb);
2099  LL_BUILDER.SetInsertPoint(nullcheck_ok_bb);
2100  executor_->cgen_state_->emitExternalCall(
2101  "register_buffer_with_executor_rsm",
2102  llvm::Type::getVoidTy(executor_->cgen_state_->context_),
2103  {executor_->cgen_state_->llInt(reinterpret_cast<int64_t>(executor_)), ptr});
2104  LL_BUILDER.CreateBr(ret_bb);
2105  LL_BUILDER.SetInsertPoint(nullcheck_fail_bb);
2106  LL_BUILDER.CreateBr(ret_bb);
2107 
2108  LL_BUILDER.SetInsertPoint(ret_bb);
2109  return {result_phi, size};
2110  }
2111  CHECK_EQ(size_t(2), target_lvs.size());
2112  return {target_lvs[0], target_lvs[1]};
2113  }
2114  }
2115  if (target_ti.is_geometry() &&
2116  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
2117  auto generate_coord_lvs =
2118  [&](auto* selected_target_expr,
2119  bool const fetch_columns) -> std::vector<llvm::Value*> {
2120  const auto target_lvs =
2121  code_generator.codegen(selected_target_expr, fetch_columns, co);
2122  if (dynamic_cast<const Analyzer::GeoOperator*>(target_expr) &&
2123  target_expr->get_type_info().is_geometry()) {
2124  // return a pointer to the temporary alloca
2125  return target_lvs;
2126  }
2127  const auto geo_uoper = dynamic_cast<const Analyzer::GeoUOper*>(target_expr);
2128  const auto geo_binoper = dynamic_cast<const Analyzer::GeoBinOper*>(target_expr);
2129  if (geo_uoper || geo_binoper) {
2130  CHECK(target_expr->get_type_info().is_geometry());
2131  CHECK_EQ(2 * static_cast<size_t>(target_ti.get_physical_coord_cols()),
2132  target_lvs.size());
2133  return target_lvs;
2134  }
2135  CHECK_EQ(static_cast<size_t>(target_ti.get_physical_coord_cols()),
2136  target_lvs.size());
2137 
2138  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
2139  const auto i8p_ty =
2140  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
2141  std::vector<llvm::Value*> coords;
2142  size_t ctr = 0;
2143  for (const auto& target_lv : target_lvs) {
2144  // TODO(adb): consider adding a utility to sqltypes so we can get the types of
2145  // the physical coords cols based on the sqltype (e.g. TINYINT for col 0, INT
2146  // for col 1 for pols / mpolys, etc). Hardcoding for now. first array is the
2147  // coords array (TINYINT). Subsequent arrays are regular INT.
2148 
2149  const size_t elem_sz = ctr == 0 ? 1 : 4;
2150  ctr++;
2151  int32_t fixlen = -1;
2152  if (target_ti.get_type() == kPOINT) {
2153  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
2154  if (col_var) {
2155  const auto coords_cd = executor_->getPhysicalColumnDescriptor(col_var, 1);
2156  if (coords_cd && coords_cd->columnType.get_type() == kARRAY) {
2157  fixlen = coords_cd->columnType.get_size();
2158  }
2159  }
2160  }
2161  if (fixlen > 0) {
2162  coords.push_back(executor_->cgen_state_->emitExternalCall(
2163  "fast_fixlen_array_buff",
2164  i8p_ty,
2165  {target_lv, code_generator.posArg(selected_target_expr)}));
2166  auto fixed_len_lv = executor_->cgen_state_->emitExternalCall(
2167  "determine_fixed_array_len",
2168  llvm::IntegerType::get(code_generator.cgen_state_->context_, 64),
2169  {target_lv, executor_->cgen_state_->llInt(int64_t(fixlen))});
2170  coords.push_back(fixed_len_lv);
2171  continue;
2172  }
2173  coords.push_back(executor_->cgen_state_->emitExternalCall(
2174  "array_buff",
2175  i8p_ty,
2176  {target_lv, code_generator.posArg(selected_target_expr)}));
2177  coords.push_back(executor_->cgen_state_->emitExternalCall(
2178  "array_size",
2179  i32_ty,
2180  {target_lv,
2181  code_generator.posArg(selected_target_expr),
2182  executor_->cgen_state_->llInt(log2_bytes(elem_sz))}));
2183  }
2184  return coords;
2185  };
2186 
2187  if (agg_expr) {
2188  return generate_coord_lvs(agg_expr->get_arg(), true);
2189  } else {
2190  return generate_coord_lvs(target_expr,
2191  !executor_->plan_state_->allow_lazy_fetch_);
2192  }
2193  }
2194  }
2195  bool fetch_column = !executor_->plan_state_->allow_lazy_fetch_;
2196  return agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
2197  : code_generator.codegen(target_expr, fetch_column, co);
2198 }
2199 
2200 llvm::Value* GroupByAndAggregate::emitCall(const std::string& fname,
2201  const std::vector<llvm::Value*>& args) {
2202  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2203  return executor_->cgen_state_->emitCall(fname, args);
2204 }
2205 
2206 void GroupByAndAggregate::checkErrorCode(llvm::Value* retCode) {
2207  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2208  auto zero_const = llvm::ConstantInt::get(retCode->getType(), 0, true);
2209  auto rc_check_condition = executor_->cgen_state_->ir_builder_.CreateICmp(
2210  llvm::ICmpInst::ICMP_EQ, retCode, zero_const);
2211 
2212  executor_->cgen_state_->emitErrorCheck(rc_check_condition, retCode, "rc");
2213 }
2214 
2215 #undef CUR_FUNC
2216 #undef ROW_FUNC
2217 #undef LL_FP
2218 #undef LL_INT
2219 #undef LL_BOOL
2220 #undef LL_BUILDER
2221 #undef LL_CONTEXT
2222 
2223 size_t GroupByAndAggregate::shard_count_for_top_groups(
2224  const RelAlgExecutionUnit& ra_exe_unit) {
2225  if (ra_exe_unit.sort_info.order_entries.size() != 1 || !ra_exe_unit.sort_info.limit) {
2226  return 0;
2227  }
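  // A positive shard count is returned only when the query groups by the table's
  // sharding column, so the top groups can be maintained per shard.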
2228  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
2229  const auto grouped_col_expr =
2230  dynamic_cast<const Analyzer::ColumnVar*>(group_expr.get());
2231  if (!grouped_col_expr) {
2232  continue;
2233  }
2234  const auto& column_key = grouped_col_expr->getColumnKey();
2235  if (column_key.table_id <= 0) {
2236  return 0;
2237  }
2238  const auto td = Catalog_Namespace::get_metadata_for_table(
2239  {column_key.db_id, column_key.table_id});
2240  if (td->shardedColumnId == column_key.column_id) {
2241  return td->nShards;
2242  }
2243  }
2244  return 0;
2245 }