OmniSciDB  b28c0d5765
GroupByAndAggregate.cpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "GroupByAndAggregate.h"
18 #include "AggregateUtils.h"
19 
20 #include "CardinalityEstimator.h"
21 #include "CodeGenerator.h"
23 #include "ExpressionRange.h"
24 #include "ExpressionRewrite.h"
25 #include "GpuInitGroups.h"
26 #include "InPlaceSort.h"
28 #include "MaxwellCodegenPatch.h"
30 #include "TargetExprBuilder.h"
31 
32 #include "../CudaMgr/CudaMgr.h"
33 #include "../Shared/checked_alloc.h"
34 #include "../Shared/funcannotations.h"
35 #include "../Utils/ChunkIter.h"
37 #include "Execute.h"
38 #include "QueryTemplateGenerator.h"
39 #include "RuntimeFunctions.h"
40 #include "Shared/misc.h"
41 #include "StreamingTopN.h"
42 #include "TopKSort.h"
43 #include "WindowContext.h"
44 
45 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
46 
47 #include <cstring> // strcat()
48 #include <limits>
49 #include <numeric>
50 #include <string_view>
51 #include <thread>
52 
53 bool g_cluster{false};
54 bool g_bigint_count{false};
57 extern int64_t g_bitmap_memory_limit;
58 extern size_t g_leaf_count;
59 
60 bool ColRangeInfo::isEmpty() const {
61  return min == 0 && max == -1;
62 }
63 
64 std::ostream& operator<<(std::ostream& out, const ColRangeInfo& info) {
65  out << "Hash Type = " << info.hash_type_ << " min = " << info.min
66  << " max = " << info.max << " bucket = " << info.bucket
67  << " has_nulls = " << info.has_nulls << "\n";
68  return out;
69 }
70 
71 std::ostream& operator<<(std::ostream& out, const CountDistinctImplType& type) {
72  switch (type) {
73  case CountDistinctImplType::Invalid:
74  out << "Invalid";
75  break;
76  case CountDistinctImplType::Bitmap:
77  out << "Bitmap";
78  break;
79  case CountDistinctImplType::UnorderedSet:
80  out << "UnorderedSet";
81  break;
82  default:
83  out << "<Unknown Type>";
84  break;
85  }
86  return out;
87 }
88 
89 std::ostream& operator<<(std::ostream& out, const CountDistinctDescriptor& desc) {
90  out << "Type = " << desc.impl_type_ << " min val = " << desc.min_val
91  << " bitmap_sz_bits = " << desc.bitmap_sz_bits
92  << " bool approximate = " << desc.approximate
93  << " device_type = " << desc.device_type
94  << " sub_bitmap_count = " << desc.sub_bitmap_count;
95  return out;
96 }
97 
98 namespace {
99 
100 int32_t get_agg_count(const std::vector<Analyzer::Expr*>& target_exprs) {
101  int32_t agg_count{0};
102  for (auto target_expr : target_exprs) {
103  CHECK(target_expr);
104  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
105  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
106  const auto& ti = target_expr->get_type_info();
107  if (ti.is_buffer()) {
108  agg_count += 2;
109  } else if (ti.is_geometry()) {
110  agg_count += ti.get_physical_coord_cols() * 2;
111  } else {
112  ++agg_count;
113  }
114  continue;
115  }
116  if (agg_expr && agg_expr->get_aggtype() == kAVG) {
117  agg_count += 2;
118  } else {
119  ++agg_count;
120  }
121  }
122  return agg_count;
123 }
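// --- Editor's note: illustrative example, not part of the original source. ---
// get_agg_count() counts output slots rather than SQL targets: AVG needs two
// slots (sum and count), varlen buffer types need two (pointer and length),
// and geometry needs two per physical coordinate column. For instance, assuming
// target expressions equivalent to
//   SELECT x, AVG(y), COUNT(*) FROM t GROUP BY x;
// the non-aggregate target x contributes 1 slot, AVG(y) contributes 2, and
// COUNT(*) contributes 1, so get_agg_count() returns 4.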
124 
125 bool expr_is_rowid(const Analyzer::Expr* expr, const Catalog_Namespace::Catalog& cat) {
126  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
127  if (!col) {
128  return false;
129  }
130  const auto cd =
131  get_column_descriptor_maybe(col->get_column_id(), col->get_table_id(), cat);
132  if (!cd || !cd->isVirtualCol) {
133  return false;
134  }
135  CHECK_EQ("rowid", cd->columnName);
136  return true;
137 }
138 
139 bool has_count_distinct(const RelAlgExecutionUnit& ra_exe_unit) {
140  for (const auto& target_expr : ra_exe_unit.target_exprs) {
141  const auto agg_info = get_target_info(target_expr, g_bigint_count);
142  if (agg_info.is_agg && is_distinct_target(agg_info)) {
143  return true;
144  }
145  }
146  return false;
147 }
148 
149 bool is_column_range_too_big_for_perfect_hash(const ColRangeInfo& col_range_info,
150  const int64_t max_entry_count) {
151  try {
152  return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
153  checked_int64_t(col_range_info.min)) >= max_entry_count;
154  } catch (...) {
155  return true;
156  }
157 }
158 
159 bool cardinality_estimate_less_than_column_range(const int64_t cardinality_estimate,
160  const ColRangeInfo& col_range_info) {
161  try {
162  // the cardinality estimate is the size of the baseline hash table. further penalize
163  // the baseline hash table by a factor of 2x due to overhead in computing baseline
164  // hash. This has the overall effect of penalizing baseline hash over perfect hash by
165  // 4x; i.e. if the cardinality of the filtered data is less than 25% of the entry
166  // count of the column, we use baseline hash on the filtered set
167  return checked_int64_t(cardinality_estimate) * 2 <
168  static_cast<int64_t>(checked_int64_t(col_range_info.max) -
169  checked_int64_t(col_range_info.min));
170  } catch (...) {
171  return false;
172  }
173 }
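// --- Editor's note: illustrative example, not part of the original source. ---
// Worked example of the 2x penalty above, assuming a column range of
// [0, 1'000'000] and a filtered cardinality estimate of 200'000:
//   200'000 * 2 = 400'000 < 1'000'000 - 0
// so the function returns true and baseline hash is considered for the filtered
// set. With an estimate of 600'000, the product (1'200'000) exceeds the range,
// the function returns false, and perfect hash remains preferred.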
174 
175 ColRangeInfo get_expr_range_info(const RelAlgExecutionUnit& ra_exe_unit,
176  const std::vector<InputTableInfo>& query_infos,
177  const Analyzer::Expr* expr,
178  Executor* executor) {
179  if (!expr) {
180  return {QueryDescriptionType::Projection, 0, 0, 0, false};
181  }
182 
183  const auto expr_range = getExpressionRange(
184  expr, query_infos, executor, boost::make_optional(ra_exe_unit.simple_quals));
185  switch (expr_range.getType()) {
186  case ExpressionRangeType::Integer: {
187  if (expr_range.getIntMin() > expr_range.getIntMax()) {
188  return {
189  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
190  }
191  return {QueryDescriptionType::GroupByPerfectHash,
192  expr_range.getIntMin(),
193  expr_range.getIntMax(),
194  expr_range.getBucket(),
195  expr_range.hasNulls()};
196  }
197  case ExpressionRangeType::Float:
198  case ExpressionRangeType::Double: {
199  if (expr_range.getFpMin() > expr_range.getFpMax()) {
200  return {
201  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
202  }
203  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
204  }
205  case ExpressionRangeType::Invalid:
206  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
207  default:
208  CHECK(false);
209  }
210  CHECK(false);
211  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
212 }
213 
214 } // namespace
215 
216 ColRangeInfo GroupByAndAggregate::getColRangeInfo() {
217  // Use baseline layout more eagerly on the GPU if the query uses count distinct,
218  // because our HyperLogLog implementation is 4x less memory efficient on GPU.
219  // Technically, this only applies to APPROX_COUNT_DISTINCT, but in practice we
220  // can expect this to be true anyway for grouped queries since the precise version
221  // uses significantly more memory.
222  const int64_t baseline_threshold =
224  if (ra_exe_unit_.groupby_exprs.size() != 1) {
225  try {
226  checked_int64_t cardinality{1};
227  bool has_nulls{false};
228  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
229  auto col_range_info = get_expr_range_info(
230  ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
231  if (col_range_info.hash_type_ != QueryDescriptionType::GroupByPerfectHash) {
232  // going through baseline hash if a non-integer type is encountered
233  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
234  }
235  auto crt_col_cardinality = getBucketedCardinality(col_range_info);
236  CHECK_GE(crt_col_cardinality, 0);
237  cardinality *= crt_col_cardinality;
238  if (col_range_info.has_nulls) {
239  has_nulls = true;
240  }
241  }
242  // For zero or high cardinalities, use baseline layout.
243  if (!cardinality || cardinality > baseline_threshold) {
244  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
245  }
246  return {QueryDescriptionType::GroupByPerfectHash,
247  0,
248  int64_t(cardinality),
249  0,
250  has_nulls};
251  } catch (...) { // overflow when computing cardinality
252  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
253  }
254  }
255  // For single column groupby on high timestamps, force baseline hash due to wide ranges
256  // we are likely to encounter when applying quals to the expression range
257  // TODO: consider allowing TIMESTAMP(9) (nanoseconds) with quals to use perfect hash if
258  // the range is small enough
259  if (ra_exe_unit_.groupby_exprs.front() &&
260  ra_exe_unit_.groupby_exprs.front()->get_type_info().is_high_precision_timestamp() &&
261  ra_exe_unit_.simple_quals.size() > 0) {
262  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
263  }
264  const auto col_range_info = get_expr_range_info(
265  ra_exe_unit_, query_infos_, ra_exe_unit_.groupby_exprs.front().get(), executor_);
266  if (!ra_exe_unit_.groupby_exprs.front()) {
267  return col_range_info;
268  }
269  static const int64_t MAX_BUFFER_SIZE = 1 << 30;
270  const int64_t col_count =
271  ra_exe_unit_.groupby_exprs.size() + ra_exe_unit_.target_exprs.size();
272  int64_t max_entry_count = MAX_BUFFER_SIZE / (col_count * sizeof(int64_t));
273  if (has_count_distinct(ra_exe_unit_)) {
274  max_entry_count = std::min(max_entry_count, baseline_threshold);
275  }
276  const auto& groupby_expr_ti = ra_exe_unit_.groupby_exprs.front()->get_type_info();
277  if (groupby_expr_ti.is_string() && !col_range_info.bucket) {
278  CHECK(groupby_expr_ti.get_compression() == kENCODING_DICT);
279 
280  const bool has_filters =
281  !ra_exe_unit_.quals.empty() || !ra_exe_unit_.simple_quals.empty();
282  if (has_filters &&
283  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
284  // if filters are present, we can use the filter to narrow the cardinality of the
285  // group by in the case of ranges too big for perfect hash. Otherwise, we are better
286  // off attempting perfect hash (since we know the range will be made of
287  // monotonically increasing numbers from min to max for dictionary encoded strings)
288  // and failing later due to excessive memory use.
289  // Check the conditions where baseline hash can provide a performance increase and
290  // return baseline hash (potentially forcing an estimator query) as the range type.
291  // Otherwise, return col_range_info which will likely be perfect hash, though could
292  // be baseline from a previous call of this function prior to the estimator query.
293  if (!ra_exe_unit_.sort_info.order_entries.empty()) {
294  // TODO(adb): allow some sorts to pass through this block by centralizing sort
295  // algorithm decision making
296  if (has_count_distinct(ra_exe_unit_) &&
297  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
298  // always use baseline hash for column range too big for perfect hash with count
299  // distinct descriptors. We will need 8GB of CPU memory minimum for the perfect
300  // hash group by in this case.
301  return {QueryDescriptionType::GroupByBaselineHash,
302  col_range_info.min,
303  col_range_info.max,
304  0,
305  col_range_info.has_nulls};
306  } else {
307  // use original col range for sort
308  return col_range_info;
309  }
310  }
311  // if filters are present and the filtered range is less than the cardinality of
312  // the column, consider baseline hash
313  if (group_cardinality_estimation_ &&
314  cardinality_estimate_less_than_column_range(*group_cardinality_estimation_,
315  col_range_info)) {
316  return {QueryDescriptionType::GroupByBaselineHash,
317  col_range_info.min,
318  col_range_info.max,
319  0,
320  col_range_info.has_nulls};
321  }
322  }
323  } else if ((!expr_is_rowid(ra_exe_unit_.groupby_exprs.front().get(),
324  *executor_->catalog_)) &&
325  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count) &&
326  !col_range_info.bucket) {
327  return {QueryDescriptionType::GroupByBaselineHash,
328  col_range_info.min,
329  col_range_info.max,
330  0,
331  col_range_info.has_nulls};
332  }
333  return col_range_info;
334 }
335 
336 int64_t GroupByAndAggregate::getBucketedCardinality(const ColRangeInfo& col_range_info) {
337  checked_int64_t crt_col_cardinality =
338  checked_int64_t(col_range_info.max) - checked_int64_t(col_range_info.min);
339  if (col_range_info.bucket) {
340  crt_col_cardinality /= col_range_info.bucket;
341  }
342  return static_cast<int64_t>(crt_col_cardinality +
343  (1 + (col_range_info.has_nulls ? 1 : 0)));
344 }
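// --- Editor's note: illustrative sketch, not part of the original source. ---
// The formula above is (max - min) / bucket, plus one for the inclusive range
// and one more for a null slot when the column has nulls. A self-contained
// sketch of the same arithmetic (without the overflow-checked arithmetic used
// by the real function):
//
//   int64_t bucketed_cardinality_sketch(int64_t min, int64_t max,
//                                       int64_t bucket, bool has_nulls) {
//     int64_t card = max - min;               // raw range
//     if (bucket) { card /= bucket; }         // collapse bucketed values
//     return card + 1 + (has_nulls ? 1 : 0);  // inclusive range + null slot
//   }
//
// e.g. min = 10, max = 70, bucket = 10, has_nulls = true gives 6 + 1 + 1 = 8.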
345 
346 namespace {
347 // Like getBucketedCardinality() without counting nulls.
348 int64_t get_bucketed_cardinality_without_nulls(const ColRangeInfo& col_range_info) {
349  if (col_range_info.min <= col_range_info.max) {
350  size_t size = col_range_info.max - col_range_info.min;
351  if (col_range_info.bucket) {
352  size /= col_range_info.bucket;
353  }
354  if (size >= static_cast<size_t>(std::numeric_limits<int64_t>::max())) {
355  // try to use unordered_set instead of crashing due to CHECK failure
356  // i.e., CHECK_LT(size, std::numeric_limits<int64_t>::max());
357  return 0;
358  }
359  return static_cast<int64_t>(size + 1);
360  } else {
361  return 0;
362  }
363 }
364 } // namespace
365 
366 #define LL_CONTEXT executor_->cgen_state_->context_
367 #define LL_BUILDER executor_->cgen_state_->ir_builder_
368 #define LL_BOOL(v) executor_->cgen_state_->llBool(v)
369 #define LL_INT(v) executor_->cgen_state_->llInt(v)
370 #define LL_FP(v) executor_->cgen_state_->llFp(v)
371 #define ROW_FUNC executor_->cgen_state_->row_func_
372 #define CUR_FUNC executor_->cgen_state_->current_func_
373 
374 GroupByAndAggregate::GroupByAndAggregate(
375  Executor* executor,
376  const ExecutorDeviceType device_type,
377  const RelAlgExecutionUnit& ra_exe_unit,
378  const std::vector<InputTableInfo>& query_infos,
379  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
380  const std::optional<int64_t>& group_cardinality_estimation)
381  : executor_(executor)
382  , ra_exe_unit_(ra_exe_unit)
383  , query_infos_(query_infos)
384  , row_set_mem_owner_(row_set_mem_owner)
385  , device_type_(device_type)
386  , group_cardinality_estimation_(group_cardinality_estimation) {
387  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
388  if (!groupby_expr) {
389  continue;
390  }
391  const auto& groupby_ti = groupby_expr->get_type_info();
392  if (groupby_ti.is_bytes()) {
393  throw std::runtime_error(
394  "Cannot group by string columns which are not dictionary encoded.");
395  }
396  if (groupby_ti.is_buffer()) {
397  throw std::runtime_error("Group by buffer not supported");
398  }
399  if (groupby_ti.is_geometry()) {
400  throw std::runtime_error("Group by geometry not supported");
401  }
402  }
403 }
404 
405 int64_t GroupByAndAggregate::getShardedTopBucket(const ColRangeInfo& col_range_info,
406  const size_t shard_count) const {
407  size_t device_count{0};
408  if (device_type_ == ExecutorDeviceType::GPU) {
409  device_count = executor_->cudaMgr()->getDeviceCount();
410  CHECK_GT(device_count, 0u);
411  }
412 
413  int64_t bucket{col_range_info.bucket};
414 
415  if (shard_count) {
416  CHECK(!col_range_info.bucket);
417  /*
418  When a node has fewer devices than the shard count:
419  a) In a distributed setup, the minimum distance between two keys is
420  device_count, because shards are stored consecutively across the physical tables.
421  For example, if a shard column has values 0 to 9 and there are 3 shards on each leaf,
422  node 1 would have values 0,1,2,6,7,8 and node 2 would have values 3,4,5,9. If each
423  leaf node has only one device, all of that node's keys are loaded onto that single
424  device.
425 
426  b) In a single node setup, the distance is the minimum of device_count and
427  (shard_count - device_count). For example, on a single node server with 3 devices,
428  a shard column with values 0 to 9 in a table with 4 shards maps device to fragment
429  keys as: device 1 - 4,8,3,7; device 2 - 1,5,9;
430  device 3 - 2,6. The bucket value would be 4 (shards) - 3 (devices) = 1, i.e. the
431  minimum of device_count and the difference.
432 
433  When a node has a device count equal to or greater than the shard count, the
434  minimum distance is always at least shard_count * number of leaf nodes.
435  */
436  if (device_count < shard_count) {
437  bucket = g_leaf_count ? std::max(device_count, static_cast<size_t>(1))
438  : std::min(device_count, shard_count - device_count);
439  } else {
440  bucket = shard_count * std::max(g_leaf_count, static_cast<size_t>(1));
441  }
442  }
443 
444  return bucket;
445 }
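// --- Editor's note: illustrative example, not part of the original source. ---
// Worked example of the sharded bucket logic above, assuming a single-node
// setup (g_leaf_count == 0), shard_count = 4 and device_count = 3:
// device_count < shard_count, so
//   bucket = std::min(device_count, shard_count - device_count)
//          = std::min(3, 4 - 3) = 1.
// With device_count = 8 and the same 4 shards, the else branch applies and
//   bucket = shard_count * std::max(g_leaf_count, 1) = 4 * 1 = 4.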
446 
447 namespace {
448 
458 KeylessInfo get_keyless_info(const RelAlgExecutionUnit& ra_exe_unit,
459  const std::vector<InputTableInfo>& query_infos,
460  const bool is_group_by,
461  Executor* executor) {
462  bool keyless{true}, found{false};
463  int32_t num_agg_expr{0};
464  int32_t index{0};
465  for (const auto target_expr : ra_exe_unit.target_exprs) {
466  const auto agg_info = get_target_info(target_expr, g_bigint_count);
467  const auto chosen_type = get_compact_type(agg_info);
468  if (agg_info.is_agg) {
469  num_agg_expr++;
470  }
471  if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
472  auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
473  CHECK(agg_expr);
474  const auto arg_expr = agg_arg(target_expr);
475  const bool float_argument_input = takes_float_argument(agg_info);
476  switch (agg_info.agg_kind) {
477  case kAVG:
478  ++index;
479  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
480  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
481  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
482  expr_range_info.hasNulls()) {
483  break;
484  }
485  }
486  found = true;
487  break;
488  case kCOUNT:
489  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
490  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
491  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
492  expr_range_info.hasNulls()) {
493  break;
494  }
495  }
496  found = true;
497  break;
498  case kSUM: {
499  auto arg_ti = arg_expr->get_type_info();
500  if (constrained_not_null(arg_expr, ra_exe_unit.quals)) {
501  arg_ti.set_notnull(true);
502  }
503  if (!arg_ti.get_notnull()) {
504  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
505  if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
506  !expr_range_info.hasNulls()) {
507  found = true;
508  }
509  } else {
510  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
511  switch (expr_range_info.getType()) {
512  case ExpressionRangeType::Float:
513  case ExpressionRangeType::Double:
514  if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
515  found = true;
516  }
517  break;
518  case ExpressionRangeType::Integer:
519  if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
520  found = true;
521  }
522  break;
523  default:
524  break;
525  }
526  }
527  break;
528  }
529  case kMIN: {
530  CHECK(agg_expr && agg_expr->get_arg());
531  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
532  if (arg_ti.is_string() || arg_ti.is_buffer()) {
533  break;
534  }
535  auto expr_range_info =
536  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
537  auto init_max = get_agg_initial_val(agg_info.agg_kind,
538  chosen_type,
539  is_group_by || float_argument_input,
540  float_argument_input ? sizeof(float) : 8);
541  switch (expr_range_info.getType()) {
542  case ExpressionRangeType::Float:
543  case ExpressionRangeType::Double: {
544  auto double_max =
545  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
546  if (expr_range_info.getFpMax() < double_max) {
547  found = true;
548  }
549  break;
550  }
551  case ExpressionRangeType::Integer:
552  if (expr_range_info.getIntMax() < init_max) {
553  found = true;
554  }
555  break;
556  default:
557  break;
558  }
559  break;
560  }
561  case kMAX: {
562  CHECK(agg_expr && agg_expr->get_arg());
563  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
564  if (arg_ti.is_string() || arg_ti.is_buffer()) {
565  break;
566  }
567  auto expr_range_info =
568  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
569  // NULL sentinel and init value for kMAX are identical, which results in
570  // ambiguity in detecting empty keys in presence of nulls.
571  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
572  expr_range_info.hasNulls()) {
573  break;
574  }
575  auto init_min = get_agg_initial_val(agg_info.agg_kind,
576  chosen_type,
577  is_group_by || float_argument_input,
578  float_argument_input ? sizeof(float) : 8);
579  switch (expr_range_info.getType()) {
580  case ExpressionRangeType::Float:
581  case ExpressionRangeType::Double: {
582  auto double_min =
583  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
584  if (expr_range_info.getFpMin() > double_min) {
585  found = true;
586  }
587  break;
588  }
589  case ExpressionRangeType::Integer:
590  if (expr_range_info.getIntMin() > init_min) {
591  found = true;
592  }
593  break;
594  default:
595  break;
596  }
597  break;
598  }
599  default:
600  keyless = false;
601  break;
602  }
603  }
604  if (!keyless) {
605  break;
606  }
607  if (!found) {
608  ++index;
609  }
610  }
611 
612  // shouldn't use keyless for projection only
613  return {
614  keyless && found,
615  index,
616  };
617 }
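// --- Editor's note: illustrative example, not part of the original source. ---
// "Keyless" means the group key can be reconstructed from the bin position, so
// no key column has to be materialized; `index` is the slot of the aggregate
// whose initial value can double as the empty-bin sentinel. For example,
// assuming the targets are [SUM(x), COUNT(*)] and x is constrained NOT NULL
// with a range that excludes 0, the kSUM case above sets found = true while
// index is still 0, and the query can use the keyless perfect-hash layout.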
618 
619 CountDistinctDescriptors init_count_distinct_descriptors(
620  const RelAlgExecutionUnit& ra_exe_unit,
621  const std::vector<InputTableInfo>& query_infos,
622  const ColRangeInfo& group_by_range_info,
623  const ExecutorDeviceType device_type,
624  Executor* executor) {
625  CountDistinctDescriptors count_distinct_descriptors;
626  auto compute_bytes_per_group =
627  [](size_t bitmap_sz, size_t sub_bitmap_count, ExecutorDeviceType device_type) {
628  size_t effective_size_bytes = (bitmap_sz + 7) / 8;
629  const auto padded_size =
630  (device_type == ExecutorDeviceType::GPU || sub_bitmap_count > 1)
631  ? align_to_int64(effective_size_bytes)
632  : effective_size_bytes;
633  return padded_size * sub_bitmap_count;
634  };
635  for (size_t i = 0; i < ra_exe_unit.target_exprs.size(); i++) {
636  const auto target_expr = ra_exe_unit.target_exprs[i];
637  auto agg_info = get_target_info(target_expr, g_bigint_count);
638  if (is_distinct_target(agg_info)) {
639  CHECK(agg_info.is_agg);
640  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
641  const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
642  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
643  if (arg_ti.is_bytes()) {
644  throw std::runtime_error(
645  "Strings must be dictionary-encoded for COUNT(DISTINCT).");
646  }
647  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_buffer()) {
648  throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
649  }
650  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
651  throw std::runtime_error(
652  "APPROX_COUNT_DISTINCT on geometry columns not supported");
653  }
654  if (agg_info.is_distinct && arg_ti.is_geometry()) {
655  throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
656  }
657  ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
658  auto arg_range_info =
659  arg_ti.is_fp() ? no_range_info
660  : get_expr_range_info(
661  ra_exe_unit, query_infos, agg_expr->get_arg(), executor);
662  const auto it = ra_exe_unit.target_exprs_original_type_infos.find(i);
663  if (it != ra_exe_unit.target_exprs_original_type_infos.end()) {
664  const auto& original_target_expr_ti = it->second;
665  if (arg_ti.is_integer() && original_target_expr_ti.get_type() == kDATE &&
666  original_target_expr_ti.get_compression() == kENCODING_DATE_IN_DAYS) {
667  // manually encode the col range of date col if necessary
668  // (see conditionally_change_arg_to_int_type function in RelAlgExecutor.cpp)
669  auto is_date_value_not_encoded = [&original_target_expr_ti](int64_t date_val) {
670  if (original_target_expr_ti.get_comp_param() == 16) {
671  return date_val < INT16_MIN || date_val > INT16_MAX;
672  } else {
673  return date_val < INT32_MIN || date_val > INT32_MAX;
674  }
675  };
676  if (is_date_value_not_encoded(arg_range_info.min)) {
677  // chunk metadata of the date column contains decoded value
678  // so we manually encode it again here to represent its column range correctly
679  arg_range_info.min =
681  }
682  if (is_date_value_not_encoded(arg_range_info.max)) {
683  arg_range_info.max =
685  }
686  // now we manually encode the value, so we need to invalidate bucket value
687  // i.e., 86000 -> 0, to correctly calculate the size of bitmap
688  arg_range_info.bucket = 0;
689  }
690  }
691 
692  CountDistinctImplType count_distinct_impl_type{CountDistinctImplType::UnorderedSet};
693  int64_t bitmap_sz_bits{0};
694  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
695  const auto error_rate = agg_expr->get_arg1();
696  if (error_rate) {
697  CHECK(error_rate->get_type_info().get_type() == kINT);
698  CHECK_GE(error_rate->get_constval().intval, 1);
699  bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
700  } else {
701  bitmap_sz_bits = g_hll_precision_bits;
702  }
703  }
704  if (arg_range_info.isEmpty()) {
705  count_distinct_descriptors.emplace_back(
706  CountDistinctDescriptor{CountDistinctImplType::Bitmap,
707  0,
708  64,
709  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
710  device_type,
711  1});
712  continue;
713  }
714  const auto sub_bitmap_count =
715  get_count_distinct_sub_bitmap_count(bitmap_sz_bits, ra_exe_unit, device_type);
716  size_t worst_case_num_groups{1};
717  if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
718  !(arg_ti.is_buffer() || arg_ti.is_geometry())) { // TODO(alex): allow bitmap
719  // implementation for arrays
720  count_distinct_impl_type = CountDistinctImplType::Bitmap;
721  if (shared::is_any<kCOUNT, kCOUNT_IF>(agg_info.agg_kind)) {
722  bitmap_sz_bits = get_bucketed_cardinality_without_nulls(arg_range_info);
723  if (bitmap_sz_bits <= 0 || g_bitmap_memory_limit <= bitmap_sz_bits) {
724  count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
725  }
726  // check a potential OOM when using bitmap-based approach
727  const auto total_bytes_per_entry =
728  compute_bytes_per_group(bitmap_sz_bits, sub_bitmap_count, device_type);
729  const auto range_bucket = std::max(group_by_range_info.bucket, (int64_t)1);
730  const auto maximum_num_groups =
731  (group_by_range_info.max - group_by_range_info.min + 1) / range_bucket;
732  const auto total_bitmap_bytes_for_groups =
733  total_bytes_per_entry * maximum_num_groups;
734  // we can estimate a potential OOM of bitmap-based count-distinct operator
735  // by using the logic "check_total_bitmap_memory"
736  if (total_bitmap_bytes_for_groups >=
737  static_cast<size_t>(g_bitmap_memory_limit)) {
738  const auto agg_expr_max_entry_count =
739  arg_range_info.max - arg_range_info.min + 1;
740  int64_t max_agg_expr_table_cardinality{1};
741  std::set<const Analyzer::ColumnVar*,
742  bool (*)(const Analyzer::ColumnVar*, const Analyzer::ColumnVar*)>
743  colvar_set(Analyzer::ColumnVar::colvar_comp);
744  agg_expr->collect_column_var(colvar_set, true);
745  for (const auto cv : colvar_set) {
746  auto it =
747  std::find_if(query_infos.begin(),
748  query_infos.end(),
749  [&](const auto& input_table_info) {
750  return input_table_info.table_id == cv->get_table_id();
751  });
752  int64_t cur_table_cardinality =
753  it != query_infos.end()
754  ? static_cast<int64_t>(it->info.getNumTuplesUpperBound())
755  : -1;
756  max_agg_expr_table_cardinality =
757  std::max(max_agg_expr_table_cardinality, cur_table_cardinality);
758  worst_case_num_groups *= cur_table_cardinality;
759  }
760  auto has_valid_stat = [agg_expr_max_entry_count, maximum_num_groups]() {
761  return agg_expr_max_entry_count > 0 && maximum_num_groups > 0;
762  };
763  // if we have valid stats regarding input expr, we can try to relax the OOM
764  if (has_valid_stat()) {
765  // a threshold related to a ratio of a range of agg expr (let's say R)
766  // and table cardinality (C), i.e., use unordered_set if the # bits to build
767  // a bitmap based on R is four times larger than that of C
768  const size_t unordered_set_threshold{2};
769  // When we detect a potential OOM for the bitmap-based approach, we selectively
770  // switch to hash set-based processing logic if one of the following is satisfied:
771  // 1) the column range is too wide compared with the table cardinality, or
772  // 2) the column range is too wide compared with the avg of # unique values
773  // per group by entry
774  const auto bits_for_agg_entry = std::ceil(log(agg_expr_max_entry_count));
775  const auto bits_for_agg_table =
776  std::ceil(log(max_agg_expr_table_cardinality));
777  const auto avg_num_unique_entries_per_group =
778  std::ceil(max_agg_expr_table_cardinality / maximum_num_groups);
779  // Case a) given the entry-count range of agg_expr and the maximum cardinality
780  // among its source tables, we try to detect the misleading case of a column
781  // range that is too sparse, i.e., agg_expr has a 1M column range but only has
782  // two tuples {1 and 1M}. Case b) check whether using a bitmap is really
783  // beneficial when assuming a uniform distribution
784  // of (unique) keys.
785  if ((bits_for_agg_entry - bits_for_agg_table) >= unordered_set_threshold ||
786  agg_expr_max_entry_count >= avg_num_unique_entries_per_group) {
787  count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
788  } else {
789  throw std::runtime_error(
790  "Consider using approx_count_distinct operator instead of "
791  "count_distinct operator to lower the memory "
792  "requirements");
793  }
794  }
795  }
796  }
797  }
798  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
799  count_distinct_impl_type == CountDistinctImplType::UnorderedSet &&
800  !(arg_ti.is_array() || arg_ti.is_geometry())) {
801  count_distinct_impl_type = CountDistinctImplType::Bitmap;
802  }
803  const size_t too_many_entries{100000000};
804  if (g_enable_watchdog && !(arg_range_info.isEmpty()) &&
805  worst_case_num_groups > too_many_entries &&
806  count_distinct_impl_type == CountDistinctImplType::UnorderedSet) {
807  throw WatchdogException(
808  "Detect too many input entries for set-based count distinct operator under "
809  "the watchdog");
810  }
811  count_distinct_descriptors.emplace_back(
812  CountDistinctDescriptor{count_distinct_impl_type,
813  arg_range_info.min,
814  bitmap_sz_bits,
815  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
816  device_type,
817  sub_bitmap_count});
818  } else {
819  count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
820  CountDistinctImplType::Invalid, 0, 0, false, device_type, 0});
821  }
822  }
823  return count_distinct_descriptors;
824 }
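// --- Editor's note: illustrative example, not part of the original source. ---
// Worked example of the bitmap sizing logic above, assuming a
// COUNT(DISTINCT x) on CPU where x has a bucketed, null-free range of
// 1'000'000 values and sub_bitmap_count = 1: bitmap_sz_bits = 1'000'000, so
// compute_bytes_per_group yields (1'000'000 + 7) / 8 = 125'000 bytes per
// group. With 10'000 perfect-hash groups that is roughly 1.25 GB of bitmaps;
// if this exceeds g_bitmap_memory_limit, the code either falls back to
// CountDistinctImplType::UnorderedSet (when the range looks sparse relative to
// the table cardinality) or suggests APPROX_COUNT_DISTINCT instead.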
825 
826 } // namespace
827 
828 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptor(
829  const bool allow_multifrag,
830  const size_t max_groups_buffer_entry_count,
831  const int8_t crt_min_byte_width,
832  RenderInfo* render_info,
833  const bool output_columnar_hint) {
834  const auto shard_count =
837  : 0;
838  bool sort_on_gpu_hint =
839  device_type_ == ExecutorDeviceType::GPU && allow_multifrag &&
842  // must_use_baseline_sort is true iff we'd sort on GPU with the old algorithm
843  // but the total output buffer size would be too big or it's a sharded top query.
844  // For the sake of managing risk, use the new result set way very selectively for
845  // this case only (alongside the baseline layout we've enabled for a while now).
846  bool must_use_baseline_sort = shard_count;
847  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
848  while (true) {
849  query_mem_desc = initQueryMemoryDescriptorImpl(allow_multifrag,
850  max_groups_buffer_entry_count,
851  crt_min_byte_width,
852  sort_on_gpu_hint,
853  render_info,
854  must_use_baseline_sort,
855  output_columnar_hint);
856  CHECK(query_mem_desc);
857  if (query_mem_desc->sortOnGpu() &&
858  (query_mem_desc->getBufferSizeBytes(device_type_) +
859  align_to_int64(query_mem_desc->getEntryCount() * sizeof(int32_t))) >
860  2 * 1024 * 1024 * 1024LL) {
861  must_use_baseline_sort = true;
862  sort_on_gpu_hint = false;
863  } else {
864  break;
865  }
866  }
867  return query_mem_desc;
868 }
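// --- Editor's note: illustrative example, not part of the original source. ---
// The retry loop above demotes a GPU sort to baseline sort when the result
// buffer plus the int32 index array would exceed 2 GB. For example, assuming a
// descriptor with 50'000'000 entries at 48 bytes per row, the buffer alone is
// 2.4 GB > 2 * 1024 * 1024 * 1024, so the descriptor is rebuilt with
// sort_on_gpu_hint = false and must_use_baseline_sort = true.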
869 
870 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptorImpl(
871  const bool allow_multifrag,
872  const size_t max_groups_buffer_entry_count,
873  const int8_t crt_min_byte_width,
874  const bool sort_on_gpu_hint,
875  RenderInfo* render_info,
876  const bool must_use_baseline_sort,
877  const bool output_columnar_hint) {
878  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
879 
880  auto col_range_info_nosharding = getColRangeInfo();
881 
882  const auto shard_count =
885  : 0;
886 
887  const auto col_range_info =
888  ColRangeInfo{col_range_info_nosharding.hash_type_,
889  col_range_info_nosharding.min,
890  col_range_info_nosharding.max,
891  getShardedTopBucket(col_range_info_nosharding, shard_count),
892  col_range_info_nosharding.has_nulls};
893 
894  // Non-grouped aggregates do not support accessing aggregated ranges
895  // Keyless hash is currently only supported with single-column perfect hash
896  const auto keyless_info =
897  !(is_group_by &&
898  col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash)
899  ? KeylessInfo{false, -1}
900  : get_keyless_info(ra_exe_unit_, query_infos_, is_group_by, executor_);
901 
902  if (g_enable_watchdog &&
903  ((col_range_info.hash_type_ == QueryDescriptionType::GroupByBaselineHash &&
904  max_groups_buffer_entry_count > g_watchdog_baseline_max_groups) ||
905  (col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
906  ra_exe_unit_.groupby_exprs.size() == 1 &&
907  (col_range_info.max - col_range_info.min) /
908  std::max(col_range_info.bucket, int64_t(1)) >
909  130000000))) {
910  throw WatchdogException("Query would use too much memory");
911  }
912 
913  const auto count_distinct_descriptors = init_count_distinct_descriptors(
914  ra_exe_unit_, query_infos_, col_range_info, device_type_, executor_);
915  try {
916  return QueryMemoryDescriptor::init(executor_,
917  ra_exe_unit_,
918  query_infos_,
919  col_range_info,
920  keyless_info,
921  allow_multifrag,
922  device_type_,
923  crt_min_byte_width,
924  sort_on_gpu_hint,
925  shard_count,
926  max_groups_buffer_entry_count,
927  render_info,
928  count_distinct_descriptors,
929  must_use_baseline_sort,
930  output_columnar_hint,
931  /*streaming_top_n_hint=*/true);
932  } catch (const StreamingTopNOOM& e) {
933  LOG(WARNING) << e.what() << " Disabling Streaming Top N.";
934  return QueryMemoryDescriptor::init(executor_,
935  ra_exe_unit_,
936  query_infos_,
937  col_range_info,
938  keyless_info,
939  allow_multifrag,
940  device_type_,
941  crt_min_byte_width,
942  sort_on_gpu_hint,
943  shard_count,
944  max_groups_buffer_entry_count,
945  render_info,
946  count_distinct_descriptors,
947  must_use_baseline_sort,
948  output_columnar_hint,
949  /*streaming_top_n_hint=*/false);
950  }
951 }
952 
953 bool GroupByAndAggregate::gpuCanHandleOrderEntries(
954  const std::list<Analyzer::OrderEntry>& order_entries) {
955  if (order_entries.size() > 1) { // TODO(alex): lift this restriction
956  return false;
957  }
958  for (const auto& order_entry : order_entries) {
959  CHECK_GE(order_entry.tle_no, 1);
960  CHECK_LE(static_cast<size_t>(order_entry.tle_no), ra_exe_unit_.target_exprs.size());
961  const auto target_expr = ra_exe_unit_.target_exprs[order_entry.tle_no - 1];
962  if (!dynamic_cast<Analyzer::AggExpr*>(target_expr)) {
963  return false;
964  }
965  // TODO(alex): relax the restrictions
966  auto agg_expr = static_cast<Analyzer::AggExpr*>(target_expr);
967  if (agg_expr->get_is_distinct() || agg_expr->get_aggtype() == kAVG ||
968  agg_expr->get_aggtype() == kMIN || agg_expr->get_aggtype() == kMAX ||
969  agg_expr->get_aggtype() == kAPPROX_COUNT_DISTINCT) {
970  return false;
971  }
972  if (agg_expr->get_arg()) {
973  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
974  if (arg_ti.is_fp()) {
975  return false;
976  }
977  auto expr_range_info =
978  get_expr_range_info(ra_exe_unit_, query_infos_, agg_expr->get_arg(), executor_);
979  // TODO(adb): QMD not actually initialized here?
980  if ((!(expr_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
981  /* query_mem_desc.getGroupbyColCount() == 1 */ false) ||
982  expr_range_info.has_nulls) &&
983  order_entry.is_desc == order_entry.nulls_first) {
984  return false;
985  }
986  }
987  const auto& target_ti = target_expr->get_type_info();
988  CHECK(!target_ti.is_buffer());
989  if (!target_ti.is_integer()) {
990  return false;
991  }
992  }
993  return true;
994 }
995 
996 bool GroupByAndAggregate::codegen(llvm::Value* filter_result,
997  llvm::BasicBlock* sc_false,
998  QueryMemoryDescriptor& query_mem_desc,
999  const CompilationOptions& co,
1000  const GpuSharedMemoryContext& gpu_smem_context) {
1001  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1002  CHECK(filter_result);
1003 
1004  bool can_return_error = false;
1005  llvm::BasicBlock* filter_false{nullptr};
1006 
1007  {
1008  const bool is_group_by = !ra_exe_unit_.groupby_exprs.empty();
1009 
1010  if (executor_->isArchMaxwell(co.device_type)) {
1011  prependForceSync();
1012  }
1013  DiamondCodegen filter_cfg(filter_result,
1014  executor_,
1015  !is_group_by || query_mem_desc.usesGetGroupValueFast(),
1016  "filter", // filter_true and filter_false basic blocks
1017  nullptr,
1018  false);
1019  filter_false = filter_cfg.cond_false_;
1020 
1021  if (is_group_by) {
1022  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection &&
1023  !query_mem_desc.useStreamingTopN()) {
1024  const auto crt_matched = get_arg_by_name(ROW_FUNC, "crt_matched");
1025  LL_BUILDER.CreateStore(LL_INT(int32_t(1)), crt_matched);
1026  auto total_matched_ptr = get_arg_by_name(ROW_FUNC, "total_matched");
1027  llvm::Value* old_total_matched_val{nullptr};
1028  if (query_mem_desc.threadsShareMemory()) {
1029  old_total_matched_val =
1030  LL_BUILDER.CreateAtomicRMW(llvm::AtomicRMWInst::Add,
1031  total_matched_ptr,
1032  LL_INT(int32_t(1)),
1033 #if LLVM_VERSION_MAJOR > 12
1034  LLVM_ALIGN(8),
1035 #endif
1036  llvm::AtomicOrdering::Monotonic);
1037  } else {
1038  old_total_matched_val = LL_BUILDER.CreateLoad(
1039  total_matched_ptr->getType()->getPointerElementType(), total_matched_ptr);
1040  LL_BUILDER.CreateStore(
1041  LL_BUILDER.CreateAdd(old_total_matched_val, LL_INT(int32_t(1))),
1042  total_matched_ptr);
1043  }
1044  auto old_total_matched_ptr = get_arg_by_name(ROW_FUNC, "old_total_matched");
1045  LL_BUILDER.CreateStore(old_total_matched_val, old_total_matched_ptr);
1046  }
1047 
1048  auto agg_out_ptr_w_idx = codegenGroupBy(query_mem_desc, co, filter_cfg);
1049  auto varlen_output_buffer = codegenVarlenOutputBuffer(query_mem_desc);
1050  if (query_mem_desc.usesGetGroupValueFast() ||
1051  query_mem_desc.getQueryDescriptionType() ==
1052  QueryDescriptionType::GroupByPerfectHash) {
1053  if (query_mem_desc.getGroupbyColCount() > 1) {
1054  filter_cfg.setChainToNext();
1055  }
1056  // Don't generate null checks if the group slot is guaranteed to be non-null,
1057  // as it's the case for get_group_value_fast* family.
1058  can_return_error = codegenAggCalls(agg_out_ptr_w_idx,
1059  varlen_output_buffer,
1060  {},
1061  query_mem_desc,
1062  co,
1063  gpu_smem_context,
1064  filter_cfg);
1065  } else {
1066  {
1067  llvm::Value* nullcheck_cond{nullptr};
1068  if (query_mem_desc.didOutputColumnar()) {
1069  nullcheck_cond = LL_BUILDER.CreateICmpSGE(std::get<1>(agg_out_ptr_w_idx),
1070  LL_INT(int32_t(0)));
1071  } else {
1072  nullcheck_cond = LL_BUILDER.CreateICmpNE(
1073  std::get<0>(agg_out_ptr_w_idx),
1074  llvm::ConstantPointerNull::get(
1075  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)));
1076  }
1077  DiamondCodegen nullcheck_cfg(
1078  nullcheck_cond, executor_, false, "groupby_nullcheck", &filter_cfg, false);
1079  codegenAggCalls(agg_out_ptr_w_idx,
1080  varlen_output_buffer,
1081  {},
1082  query_mem_desc,
1083  co,
1084  gpu_smem_context,
1085  filter_cfg);
1086  }
1087  can_return_error = true;
1088  if (query_mem_desc.getQueryDescriptionType() ==
1089  QueryDescriptionType::Projection &&
1090  query_mem_desc.useStreamingTopN()) {
1091  // Ignore rejection on pushing current row to top-K heap.
1092  LL_BUILDER.CreateRet(LL_INT(int32_t(0)));
1093  } else {
1094  CodeGenerator code_generator(executor_);
1095  LL_BUILDER.CreateRet(LL_BUILDER.CreateNeg(LL_BUILDER.CreateTrunc(
1096  // TODO(alex): remove the trunc once pos is converted to 32 bits
1097  code_generator.posArg(nullptr),
1098  get_int_type(32, LL_CONTEXT))));
1099  }
1100  }
1101  } else {
1102  if (ra_exe_unit_.estimator) {
1103  std::stack<llvm::BasicBlock*> array_loops;
1104  codegenEstimator(array_loops, filter_cfg, query_mem_desc, co);
1105  } else {
1106  auto arg_it = ROW_FUNC->arg_begin();
1107  std::vector<llvm::Value*> agg_out_vec;
1108  for (int32_t i = 0; i < get_agg_count(ra_exe_unit_.target_exprs); ++i) {
1109  agg_out_vec.push_back(&*arg_it++);
1110  }
1111  can_return_error = codegenAggCalls(std::make_tuple(nullptr, nullptr),
1112  /*varlen_output_buffer=*/nullptr,
1113  agg_out_vec,
1114  query_mem_desc,
1115  co,
1116  gpu_smem_context,
1117  filter_cfg);
1118  }
1119  }
1120  }
1121 
1122  if (ra_exe_unit_.join_quals.empty()) {
1123  executor_->cgen_state_->ir_builder_.CreateRet(LL_INT(int32_t(0)));
1124  } else if (sc_false) {
1125  const auto saved_insert_block = LL_BUILDER.GetInsertBlock();
1126  LL_BUILDER.SetInsertPoint(sc_false);
1127  LL_BUILDER.CreateBr(filter_false);
1128  LL_BUILDER.SetInsertPoint(saved_insert_block);
1129  }
1130 
1131  return can_return_error;
1132 }
1133 
1134 llvm::Value* GroupByAndAggregate::codegenOutputSlot(
1135  llvm::Value* groups_buffer,
1136  const QueryMemoryDescriptor& query_mem_desc,
1137  const CompilationOptions& co,
1138  DiamondCodegen& diamond_codegen) {
1139  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1141  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1142  const auto group_expr = ra_exe_unit_.groupby_exprs.front();
1143  CHECK(!group_expr);
1144  if (!query_mem_desc.didOutputColumnar()) {
1145  CHECK_EQ(size_t(0), query_mem_desc.getRowSize() % sizeof(int64_t));
1146  }
1147  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1148  ? 0
1149  : query_mem_desc.getRowSize() / sizeof(int64_t);
1150  CodeGenerator code_generator(executor_);
1151  if (query_mem_desc.useStreamingTopN()) {
1152  const auto& only_order_entry = ra_exe_unit_.sort_info.order_entries.front();
1153  CHECK_GE(only_order_entry.tle_no, int(1));
1154  const size_t target_idx = only_order_entry.tle_no - 1;
1155  CHECK_LT(target_idx, ra_exe_unit_.target_exprs.size());
1156  const auto order_entry_expr = ra_exe_unit_.target_exprs[target_idx];
1157  const auto chosen_bytes =
1158  static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(target_idx));
1159  auto order_entry_lv = executor_->cgen_state_->castToTypeIn(
1160  code_generator.codegen(order_entry_expr, true, co).front(), chosen_bytes * 8);
1162  std::string fname = "get_bin_from_k_heap";
1163  const auto& oe_ti = order_entry_expr->get_type_info();
1164  llvm::Value* null_key_lv = nullptr;
1165  if (oe_ti.is_integer() || oe_ti.is_decimal() || oe_ti.is_time()) {
1166  const size_t bit_width = order_entry_lv->getType()->getIntegerBitWidth();
1167  switch (bit_width) {
1168  case 32:
1169  null_key_lv = LL_INT(static_cast<int32_t>(inline_int_null_val(oe_ti)));
1170  break;
1171  case 64:
1172  null_key_lv = LL_INT(static_cast<int64_t>(inline_int_null_val(oe_ti)));
1173  break;
1174  default:
1175  CHECK(false);
1176  }
1177  fname += "_int" + std::to_string(bit_width) + "_t";
1178  } else {
1179  CHECK(oe_ti.is_fp());
1180  if (order_entry_lv->getType()->isDoubleTy()) {
1181  null_key_lv = LL_FP(static_cast<double>(inline_fp_null_val(oe_ti)));
1182  } else {
1183  null_key_lv = LL_FP(static_cast<float>(inline_fp_null_val(oe_ti)));
1184  }
1185  fname += order_entry_lv->getType()->isDoubleTy() ? "_double" : "_float";
1186  }
1187  const auto key_slot_idx =
1188  get_heap_key_slot_index(ra_exe_unit_.target_exprs, target_idx);
1189  return emitCall(
1190  fname,
1191  {groups_buffer,
1192  LL_INT(n),
1193  LL_INT(row_size_quad),
1194  LL_INT(static_cast<uint32_t>(query_mem_desc.getColOffInBytes(key_slot_idx))),
1195  LL_BOOL(only_order_entry.is_desc),
1196  LL_BOOL(!order_entry_expr->get_type_info().get_notnull()),
1197  LL_BOOL(only_order_entry.nulls_first),
1198  null_key_lv,
1199  order_entry_lv});
1200  } else {
1201  auto* arg = get_arg_by_name(ROW_FUNC, "max_matched");
1202  const auto output_buffer_entry_count_lv =
1203  LL_BUILDER.CreateLoad(arg->getType()->getPointerElementType(), arg);
1204  arg = get_arg_by_name(ROW_FUNC, "old_total_matched");
1205  const auto group_expr_lv =
1206  LL_BUILDER.CreateLoad(arg->getType()->getPointerElementType(), arg);
1207  std::vector<llvm::Value*> args{groups_buffer,
1208  output_buffer_entry_count_lv,
1209  group_expr_lv,
1210  code_generator.posArg(nullptr)};
1211  if (query_mem_desc.didOutputColumnar()) {
1212  const auto columnar_output_offset =
1213  emitCall("get_columnar_scan_output_offset", args);
1214  return columnar_output_offset;
1215  }
1216  args.push_back(LL_INT(row_size_quad));
1217  return emitCall("get_scan_output_slot", args);
1218  }
1219 }
1220 
1221 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenGroupBy(
1223  const CompilationOptions& co,
1224  DiamondCodegen& diamond_codegen) {
1225  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1226  auto arg_it = ROW_FUNC->arg_begin();
1227  auto groups_buffer = arg_it++;
1228 
1229  std::stack<llvm::BasicBlock*> array_loops;
1230 
1231  // TODO(Saman): move this logic outside of this function.
1232  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1233  if (query_mem_desc.didOutputColumnar()) {
1234  return std::make_tuple(
1235  &*groups_buffer,
1236  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen));
1237  } else {
1238  return std::make_tuple(
1239  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen),
1240  nullptr);
1241  }
1242  }
1243 
1244  CHECK(query_mem_desc.getQueryDescriptionType() ==
1245  QueryDescriptionType::GroupByBaselineHash ||
1246  query_mem_desc.getQueryDescriptionType() ==
1247  QueryDescriptionType::GroupByPerfectHash);
1248 
1249  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1250  ? 0
1251  : query_mem_desc.getRowSize() / sizeof(int64_t);
1252 
1253  const auto col_width_size = query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1254  ? sizeof(int64_t)
1255  : query_mem_desc.getEffectiveKeyWidth();
1256  // for multi-column group by
1257  llvm::Value* group_key = nullptr;
1258  llvm::Value* key_size_lv = nullptr;
1259 
1260  if (!query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1261  key_size_lv = LL_INT(static_cast<int32_t>(query_mem_desc.getGroupbyColCount()));
1262  if (query_mem_desc.getQueryDescriptionType() ==
1263  QueryDescriptionType::GroupByPerfectHash) {
1264  group_key =
1265  LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1266  } else if (query_mem_desc.getQueryDescriptionType() ==
1267  QueryDescriptionType::GroupByBaselineHash) {
1268  group_key =
1269  col_width_size == sizeof(int32_t)
1270  ? LL_BUILDER.CreateAlloca(llvm::Type::getInt32Ty(LL_CONTEXT), key_size_lv)
1271  : LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1272  }
1273  CHECK(group_key);
1274  CHECK(key_size_lv);
1275  }
1276 
1277  int32_t subkey_idx = 0;
1278  CHECK(query_mem_desc.getGroupbyColCount() == ra_exe_unit_.groupby_exprs.size());
1279  for (const auto& group_expr : ra_exe_unit_.groupby_exprs) {
1280  const auto col_range_info =
1281  get_expr_range_info(ra_exe_unit_, query_infos_, group_expr.get(), executor_);
1282  const auto translated_null_value = static_cast<int64_t>(
1283  query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1284  ? checked_int64_t(query_mem_desc.getMaxVal()) +
1285  (query_mem_desc.getBucket() ? query_mem_desc.getBucket() : 1)
1286  : checked_int64_t(col_range_info.max) +
1287  (col_range_info.bucket ? col_range_info.bucket : 1));
1288 
1289  const bool col_has_nulls =
1290  query_mem_desc.getQueryDescriptionType() ==
1291  QueryDescriptionType::GroupByPerfectHash
1292  ? (query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1293  ? query_mem_desc.hasNulls()
1294  : col_range_info.has_nulls)
1295  : false;
1296 
1297  const auto group_expr_lvs =
1298  executor_->groupByColumnCodegen(group_expr.get(),
1299  col_width_size,
1300  co,
1301  col_has_nulls,
1302  translated_null_value,
1303  diamond_codegen,
1304  array_loops,
1305  query_mem_desc.threadsShareMemory());
1306  const auto group_expr_lv = group_expr_lvs.translated_value;
1307  if (query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1308  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1309  return codegenSingleColumnPerfectHash(query_mem_desc,
1310  co,
1311  &*groups_buffer,
1312  group_expr_lv,
1313  group_expr_lvs.original_value,
1314  row_size_quad);
1315  } else {
1316  // store the sub-key to the buffer
1317  LL_BUILDER.CreateStore(
1318  group_expr_lv,
1319  LL_BUILDER.CreateGEP(
1320  group_key->getType()->getScalarType()->getPointerElementType(),
1321  group_key,
1322  LL_INT(subkey_idx++)));
1323  }
1324  }
1325  if (query_mem_desc.getQueryDescriptionType() ==
1326  QueryDescriptionType::GroupByPerfectHash) {
1327  CHECK(ra_exe_unit_.groupby_exprs.size() != 1);
1328  return codegenMultiColumnPerfectHash(
1329  &*groups_buffer, group_key, key_size_lv, query_mem_desc, row_size_quad);
1330  } else if (query_mem_desc.getQueryDescriptionType() ==
1331  QueryDescriptionType::GroupByBaselineHash) {
1332  return codegenMultiColumnBaselineHash(co,
1333  &*groups_buffer,
1334  group_key,
1335  key_size_lv,
1336  query_mem_desc,
1337  col_width_size,
1338  row_size_quad);
1339  }
1340  CHECK(false);
1341  return std::make_tuple(nullptr, nullptr);
1342 }
1343 
1344 llvm::Value* GroupByAndAggregate::codegenVarlenOutputBuffer(
1345  const QueryMemoryDescriptor& query_mem_desc) {
1346  if (!query_mem_desc.hasVarlenOutput()) {
1347  return nullptr;
1348  }
1349 
1350  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1351  auto arg_it = ROW_FUNC->arg_begin();
1352  arg_it++; /* groups_buffer */
1353  auto varlen_output_buffer = arg_it++;
1354  CHECK(varlen_output_buffer->getType() == llvm::Type::getInt64PtrTy(LL_CONTEXT));
1355  return varlen_output_buffer;
1356 }
1357 
1358 std::tuple<llvm::Value*, llvm::Value*>
1359 GroupByAndAggregate::codegenSingleColumnPerfectHash(
1360  const QueryMemoryDescriptor& query_mem_desc,
1361  const CompilationOptions& co,
1362  llvm::Value* groups_buffer,
1363  llvm::Value* group_expr_lv_translated,
1364  llvm::Value* group_expr_lv_original,
1365  const int32_t row_size_quad) {
1366  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1367  CHECK(query_mem_desc.usesGetGroupValueFast());
1368  std::string get_group_fn_name{query_mem_desc.didOutputColumnar()
1369  ? "get_columnar_group_bin_offset"
1370  : "get_group_value_fast"};
1371  if (!query_mem_desc.didOutputColumnar() && query_mem_desc.hasKeylessHash()) {
1372  get_group_fn_name += "_keyless";
1373  }
1374  if (query_mem_desc.interleavedBins(co.device_type)) {
1375  CHECK(!query_mem_desc.didOutputColumnar());
1376  CHECK(query_mem_desc.hasKeylessHash());
1377  get_group_fn_name += "_semiprivate";
1378  }
1379  std::vector<llvm::Value*> get_group_fn_args{&*groups_buffer,
1380  &*group_expr_lv_translated};
1381  if (group_expr_lv_original && get_group_fn_name == "get_group_value_fast" &&
1382  query_mem_desc.mustUseBaselineSort()) {
1383  get_group_fn_name += "_with_original_key";
1384  get_group_fn_args.push_back(group_expr_lv_original);
1385  }
1386  get_group_fn_args.push_back(LL_INT(query_mem_desc.getMinVal()));
1387  get_group_fn_args.push_back(LL_INT(query_mem_desc.getBucket()));
1388  if (!query_mem_desc.hasKeylessHash()) {
1389  if (!query_mem_desc.didOutputColumnar()) {
1390  get_group_fn_args.push_back(LL_INT(row_size_quad));
1391  }
1392  } else {
1393  if (!query_mem_desc.didOutputColumnar()) {
1394  get_group_fn_args.push_back(LL_INT(row_size_quad));
1395  }
1396  if (query_mem_desc.interleavedBins(co.device_type)) {
1397  auto warp_idx = emitCall("thread_warp_idx", {LL_INT(executor_->warpSize())});
1398  get_group_fn_args.push_back(warp_idx);
1399  get_group_fn_args.push_back(LL_INT(executor_->warpSize()));
1400  }
1401  }
1402  if (get_group_fn_name == "get_columnar_group_bin_offset") {
1403  return std::make_tuple(&*groups_buffer,
1404  emitCall(get_group_fn_name, get_group_fn_args));
1405  }
1406  return std::make_tuple(emitCall(get_group_fn_name, get_group_fn_args), nullptr);
1407 }
1408 
1409 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenMultiColumnPerfectHash(
1410  llvm::Value* groups_buffer,
1411  llvm::Value* group_key,
1412  llvm::Value* key_size_lv,
1413  const QueryMemoryDescriptor& query_mem_desc,
1414  const int32_t row_size_quad) {
1415  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1416  CHECK(query_mem_desc.getQueryDescriptionType() ==
1418  // compute the index (perfect hash)
1419  auto perfect_hash_func = codegenPerfectHashFunction();
1420  auto hash_lv =
1421  LL_BUILDER.CreateCall(perfect_hash_func, std::vector<llvm::Value*>{group_key});
1422 
1423  if (query_mem_desc.didOutputColumnar()) {
1424  if (!query_mem_desc.hasKeylessHash()) {
1425  const std::string set_matching_func_name{
1426  "set_matching_group_value_perfect_hash_columnar"};
1427  const std::vector<llvm::Value*> set_matching_func_arg{
1428  groups_buffer,
1429  hash_lv,
1430  group_key,
1431  key_size_lv,
1432  llvm::ConstantInt::get(get_int_type(32, LL_CONTEXT),
1433  query_mem_desc.getEntryCount())};
1434  emitCall(set_matching_func_name, set_matching_func_arg);
1435  }
1436  return std::make_tuple(groups_buffer, hash_lv);
1437  } else {
1438  if (query_mem_desc.hasKeylessHash()) {
1439  return std::make_tuple(emitCall("get_matching_group_value_perfect_hash_keyless",
1440  {groups_buffer, hash_lv, LL_INT(row_size_quad)}),
1441  nullptr);
1442  } else {
1443  return std::make_tuple(
1444  emitCall(
1445  "get_matching_group_value_perfect_hash",
1446  {groups_buffer, hash_lv, group_key, key_size_lv, LL_INT(row_size_quad)}),
1447  nullptr);
1448  }
1449  }
1450 }
1451 
1452 std::tuple<llvm::Value*, llvm::Value*>
1453 GroupByAndAggregate::codegenMultiColumnBaselineHash(
1454  const CompilationOptions& co,
1455  llvm::Value* groups_buffer,
1456  llvm::Value* group_key,
1457  llvm::Value* key_size_lv,
1458  const QueryMemoryDescriptor& query_mem_desc,
1459  const size_t key_width,
1460  const int32_t row_size_quad) {
1461  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1462  if (group_key->getType() != llvm::Type::getInt64PtrTy(LL_CONTEXT)) {
1463  CHECK(key_width == sizeof(int32_t));
1464  group_key =
1465  LL_BUILDER.CreatePointerCast(group_key, llvm::Type::getInt64PtrTy(LL_CONTEXT));
1466  }
1467  std::vector<llvm::Value*> func_args{
1468  groups_buffer,
1469  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1470  &*group_key,
1471  &*key_size_lv,
1472  LL_INT(static_cast<int32_t>(key_width))};
1473  std::string func_name{"get_group_value"};
1474  if (query_mem_desc.didOutputColumnar()) {
1475  func_name += "_columnar_slot";
1476  } else {
1477  func_args.push_back(LL_INT(row_size_quad));
1478  }
1479  if (co.with_dynamic_watchdog) {
1480  func_name += "_with_watchdog";
1481  }
1482  if (query_mem_desc.didOutputColumnar()) {
1483  return std::make_tuple(groups_buffer, emitCall(func_name, func_args));
1484  } else {
1485  return std::make_tuple(emitCall(func_name, func_args), nullptr);
1486  }
1487 }
1488 
1489 llvm::Function* GroupByAndAggregate::codegenPerfectHashFunction() {
1490  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1491  CHECK_GT(ra_exe_unit_.groupby_exprs.size(), size_t(1));
1492  auto ft = llvm::FunctionType::get(
1493  get_int_type(32, LL_CONTEXT),
1494  std::vector<llvm::Type*>{llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)},
1495  false);
1496  auto key_hash_func = llvm::Function::Create(ft,
1497  llvm::Function::ExternalLinkage,
1498  "perfect_key_hash",
1499  executor_->cgen_state_->module_);
1500  executor_->cgen_state_->helper_functions_.push_back(key_hash_func);
1501  mark_function_always_inline(key_hash_func);
1502  auto& key_buff_arg = *key_hash_func->args().begin();
1503  llvm::Value* key_buff_lv = &key_buff_arg;
1504  auto bb = llvm::BasicBlock::Create(LL_CONTEXT, "entry", key_hash_func);
1505  llvm::IRBuilder<> key_hash_func_builder(bb);
1506  llvm::Value* hash_lv{llvm::ConstantInt::get(get_int_type(64, LL_CONTEXT), 0)};
1507  std::vector<int64_t> cardinalities;
1508  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1509  auto col_range_info =
1510  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1511  CHECK(col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash);
1512  cardinalities.push_back(getBucketedCardinality(col_range_info));
1513  }
1514  size_t dim_idx = 0;
1515  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1516  auto* gep = key_hash_func_builder.CreateGEP(
1517  key_buff_lv->getType()->getScalarType()->getPointerElementType(),
1518  key_buff_lv,
1519  LL_INT(dim_idx));
1520  auto key_comp_lv =
1521  key_hash_func_builder.CreateLoad(gep->getType()->getPointerElementType(), gep);
1522  auto col_range_info =
1523  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1524  auto crt_term_lv =
1525  key_hash_func_builder.CreateSub(key_comp_lv, LL_INT(col_range_info.min));
1526  if (col_range_info.bucket) {
1527  crt_term_lv =
1528  key_hash_func_builder.CreateSDiv(crt_term_lv, LL_INT(col_range_info.bucket));
1529  }
1530  for (size_t prev_dim_idx = 0; prev_dim_idx < dim_idx; ++prev_dim_idx) {
1531  crt_term_lv = key_hash_func_builder.CreateMul(crt_term_lv,
1532  LL_INT(cardinalities[prev_dim_idx]));
1533  }
1534  hash_lv = key_hash_func_builder.CreateAdd(hash_lv, crt_term_lv);
1535  ++dim_idx;
1536  }
1537  key_hash_func_builder.CreateRet(
1538  key_hash_func_builder.CreateTrunc(hash_lv, get_int_type(32, LL_CONTEXT)));
1539  return key_hash_func;
1540 }
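// --- Editor's note: illustrative sketch, not part of the original source. ---
// The generated perfect_key_hash function computes a mixed-radix index over the
// group-by key: each component is shifted by its column's min, divided by its
// bucket, and scaled by the cardinalities of all previous dimensions. A host-side
// sketch of the same arithmetic, assuming two columns a in [10, 19]
// (cardinality 10) and b in [0, 4]:
//
//   // hash(a, b) = (a - 10) + (b - 0) * 10, a unique bin in [0, 50)
//   int32_t perfect_hash_sketch(int64_t a, int64_t b) {
//     int64_t hash = a - 10;         // dim 0 term
//     hash += (b - 0) * 10;          // dim 1 term, scaled by card(dim 0)
//     return static_cast<int32_t>(hash);
//   }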
1541 
1542 llvm::Value* GroupByAndAggregate::convertNullIfAny(const SQLTypeInfo& arg_type,
1543  const TargetInfo& agg_info,
1544  llvm::Value* target) {
1545  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1546  const auto& agg_type = agg_info.sql_type;
1547  const size_t chosen_bytes = agg_type.get_size();
1548 
1549  bool need_conversion{false};
1550  llvm::Value* arg_null{nullptr};
1551  llvm::Value* agg_null{nullptr};
1552  llvm::Value* target_to_cast{target};
1553  if (arg_type.is_fp()) {
1554  arg_null = executor_->cgen_state_->inlineFpNull(arg_type);
1555  if (agg_type.is_fp()) {
1556  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1557  if (!static_cast<llvm::ConstantFP*>(arg_null)->isExactlyValue(
1558  static_cast<llvm::ConstantFP*>(agg_null)->getValueAPF())) {
1559  need_conversion = true;
1560  }
1561  } else {
1562  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
1563  return target;
1564  }
1565  } else {
1566  arg_null = executor_->cgen_state_->inlineIntNull(arg_type);
1567  if (agg_type.is_fp()) {
1568  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1569  need_conversion = true;
1570  target_to_cast = executor_->castToFP(target, arg_type, agg_type);
1571  } else {
1572  agg_null = executor_->cgen_state_->inlineIntNull(agg_type);
1573  if ((static_cast<llvm::ConstantInt*>(arg_null)->getBitWidth() !=
1574  static_cast<llvm::ConstantInt*>(agg_null)->getBitWidth()) ||
1575  (static_cast<llvm::ConstantInt*>(arg_null)->getValue() !=
1576  static_cast<llvm::ConstantInt*>(agg_null)->getValue())) {
1577  need_conversion = true;
1578  }
1579  }
1580  }
1581  if (need_conversion) {
1582  auto cmp = arg_type.is_fp() ? LL_BUILDER.CreateFCmpOEQ(target, arg_null)
1583  : LL_BUILDER.CreateICmpEQ(target, arg_null);
1584  return LL_BUILDER.CreateSelect(
1585  cmp,
1586  agg_null,
1587  executor_->cgen_state_->castToTypeIn(target_to_cast, chosen_bytes << 3));
1588  } else {
1589  return target;
1590  }
1591 }
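// Illustrative sketch (not part of this file) of the select emitted above when a
// float argument feeds a double aggregate slot, i.e. the argument's NULL sentinel
// must be swapped for the aggregate type's sentinel before widening:
//   double out = (v == FLT_NULL_SENTINEL) ? DBL_NULL_SENTINEL
//                                         : static_cast<double>(v);
// FLT_NULL_SENTINEL / DBL_NULL_SENTINEL stand in for the values returned by
// inlineFpNull() and are hypothetical names.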
1592 
1593 llvm::Value* GroupByAndAggregate::codegenWindowRowPointer(
1594  const Analyzer::WindowFunction* window_func,
1595  const QueryMemoryDescriptor& query_mem_desc,
1596  const CompilationOptions& co,
1597  DiamondCodegen& diamond_codegen) {
1598  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1599  const auto window_func_context =
1601  if (window_func_context && window_function_is_aggregate(window_func->getKind())) {
1602  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1603  ? 0
1604  : query_mem_desc.getRowSize() / sizeof(int64_t);
1605  auto arg_it = ROW_FUNC->arg_begin();
1606  auto groups_buffer = arg_it++;
1607  CodeGenerator code_generator(executor_);
1608  auto window_pos_lv = code_generator.codegenWindowPosition(
1609  window_func_context, code_generator.posArg(nullptr));
1610  const auto pos_in_window =
1611  LL_BUILDER.CreateTrunc(window_pos_lv, get_int_type(32, LL_CONTEXT));
1612  llvm::Value* entry_count_lv =
1613  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount()));
1614  std::vector<llvm::Value*> args{
1615  &*groups_buffer, entry_count_lv, pos_in_window, code_generator.posArg(nullptr)};
1616  if (query_mem_desc.didOutputColumnar()) {
1617  const auto columnar_output_offset =
1618  emitCall("get_columnar_scan_output_offset", args);
1619  return LL_BUILDER.CreateSExt(columnar_output_offset, get_int_type(64, LL_CONTEXT));
1620  }
1621  args.push_back(LL_INT(row_size_quad));
1622  return emitCall("get_scan_output_slot", args);
1623  }
1624  auto arg_it = ROW_FUNC->arg_begin();
1625  auto groups_buffer = arg_it++;
1626  return codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen);
1627 }
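// Note on the branches above: aggregate window functions address their output slot by
// the window position rather than the scan position; columnar layouts go through
// get_columnar_scan_output_offset, row-wise layouts through get_scan_output_slot with
// the row size appended, and non-aggregate window functions fall back to the regular
// codegenOutputSlot() path.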
1628 
1629 bool GroupByAndAggregate::codegenAggCalls(
1630  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx_in,
1631  llvm::Value* varlen_output_buffer,
1632  const std::vector<llvm::Value*>& agg_out_vec,
1633  QueryMemoryDescriptor& query_mem_desc,
1634  const CompilationOptions& co,
1635  const GpuSharedMemoryContext& gpu_smem_context,
1636  DiamondCodegen& diamond_codegen) {
1637  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1638  auto agg_out_ptr_w_idx = agg_out_ptr_w_idx_in;
1639  // TODO(alex): unify the two cases; the output for non-group-by queries
1640  // should be a contiguous buffer
1641  const bool is_group_by = std::get<0>(agg_out_ptr_w_idx);
1642  bool can_return_error = false;
1643  if (is_group_by) {
1644  CHECK(agg_out_vec.empty());
1645  } else {
1646  CHECK(!agg_out_vec.empty());
1647  }
1648 
1649  // The output buffer is cast to a byte stream so that data elements of
1650  // different sizes can be handled (only used when actual column widths are used)
1651  llvm::Value* output_buffer_byte_stream{nullptr};
1652  llvm::Value* out_row_idx{nullptr};
1653  if (query_mem_desc.didOutputColumnar() && !g_cluster &&
1654  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1655  output_buffer_byte_stream = LL_BUILDER.CreateBitCast(
1656  std::get<0>(agg_out_ptr_w_idx),
1657  llvm::PointerType::get(llvm::Type::getInt8Ty(LL_CONTEXT), 0));
1658  output_buffer_byte_stream->setName("out_buff_b_stream");
1659  CHECK(std::get<1>(agg_out_ptr_w_idx));
1660  out_row_idx = LL_BUILDER.CreateZExt(std::get<1>(agg_out_ptr_w_idx),
1661  llvm::Type::getInt64Ty(LL_CONTEXT));
1662  out_row_idx->setName("out_row_idx");
1663  }
1664 
1665  TargetExprCodegenBuilder target_builder(ra_exe_unit_, is_group_by);
1666  for (size_t target_idx = 0; target_idx < ra_exe_unit_.target_exprs.size();
1667  ++target_idx) {
1668  auto target_expr = ra_exe_unit_.target_exprs[target_idx];
1669  CHECK(target_expr);
1670 
1671  target_builder(target_expr, executor_, query_mem_desc, co);
1672  }
1673 
1674  target_builder.codegen(this,
1675  executor_,
1676  query_mem_desc,
1677  co,
1678  gpu_smem_context,
1679  agg_out_ptr_w_idx,
1680  agg_out_vec,
1681  output_buffer_byte_stream,
1682  out_row_idx,
1683  varlen_output_buffer,
1684  diamond_codegen);
1685 
1686  for (auto target_expr : ra_exe_unit_.target_exprs) {
1687  CHECK(target_expr);
1688  executor_->plan_state_->isLazyFetchColumn(target_expr);
1689  }
1690 
1691  return can_return_error;
1692 }
1693 
1694 /**
1695  * @brief: returns the pointer to where the aggregation should be stored.
1696  */
1697 llvm::Value* GroupByAndAggregate::codegenAggColumnPtr(
1698  llvm::Value* output_buffer_byte_stream,
1699  llvm::Value* out_row_idx,
1700  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx,
1701  const QueryMemoryDescriptor& query_mem_desc,
1702  const size_t chosen_bytes,
1703  const size_t agg_out_off,
1704  const size_t target_idx) {
1705  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1706  llvm::Value* agg_col_ptr{nullptr};
1707  if (query_mem_desc.didOutputColumnar()) {
1708  // TODO(Saman): remove the second columnar branch, and support all query description
1709  // types through the first branch. Then, input arguments should also be cleaned up
1710  if (!g_cluster &&
1711  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1712  CHECK(chosen_bytes == 1 || chosen_bytes == 2 || chosen_bytes == 4 ||
1713  chosen_bytes == 8);
1714  CHECK(output_buffer_byte_stream);
1715  CHECK(out_row_idx);
1716  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1717  // multiplying by chosen_bytes, i.e., << log2(chosen_bytes)
1718  auto out_per_col_byte_idx =
1719 #ifdef _WIN32
1720  LL_BUILDER.CreateShl(out_row_idx, __lzcnt(chosen_bytes) - 1);
1721 #else
1722  LL_BUILDER.CreateShl(out_row_idx, __builtin_ffs(chosen_bytes) - 1);
1723 #endif
1724  auto byte_offset = LL_BUILDER.CreateAdd(out_per_col_byte_idx,
1725  LL_INT(static_cast<int64_t>(col_off)));
1726  byte_offset->setName("out_byte_off_target_" + std::to_string(target_idx));
1727  auto output_ptr = LL_BUILDER.CreateGEP(
1728  output_buffer_byte_stream->getType()->getScalarType()->getPointerElementType(),
1729  output_buffer_byte_stream,
1730  byte_offset);
1731  agg_col_ptr = LL_BUILDER.CreateBitCast(
1732  output_ptr,
1733  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1734  agg_col_ptr->setName("out_ptr_target_" + std::to_string(target_idx));
1735  } else {
1736  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1737  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1738  col_off /= chosen_bytes;
1739  CHECK(std::get<1>(agg_out_ptr_w_idx));
1740  auto offset = LL_BUILDER.CreateAdd(std::get<1>(agg_out_ptr_w_idx), LL_INT(col_off));
1741  auto* bit_cast = LL_BUILDER.CreateBitCast(
1742  std::get<0>(agg_out_ptr_w_idx),
1743  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1744  agg_col_ptr = LL_BUILDER.CreateGEP(
1745  bit_cast->getType()->getScalarType()->getPointerElementType(),
1746  bit_cast,
1747  offset);
1748  }
1749  } else {
1750  uint32_t col_off = query_mem_desc.getColOnlyOffInBytes(agg_out_off);
1751  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1752  col_off /= chosen_bytes;
1753  auto* bit_cast = LL_BUILDER.CreateBitCast(
1754  std::get<0>(agg_out_ptr_w_idx),
1755  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1756  agg_col_ptr = LL_BUILDER.CreateGEP(
1757  bit_cast->getType()->getScalarType()->getPointerElementType(),
1758  bit_cast,
1759  LL_INT(col_off));
1760  }
1761  CHECK(agg_col_ptr);
1762  return agg_col_ptr;
1763 }
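// Illustrative arithmetic for the columnar projection fast path above (chosen_bytes
// is a power of two, as the CHECK enforces; the names are the locals above):
//   byte_offset = col_off + (out_row_idx << log2(chosen_bytes));
//   agg_col_ptr  = (intN*)(output_buffer_byte_stream + byte_offset);
// i.e. each row's slot lives chosen_bytes * row_index bytes past the column's base
// offset in the byte stream; the other branches index a typed pointer instead.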
1764 
1765 void GroupByAndAggregate::codegenEstimator(std::stack<llvm::BasicBlock*>& array_loops,
1766  DiamondCodegen& diamond_codegen,
1767  const QueryMemoryDescriptor& query_mem_desc,
1768  const CompilationOptions& co) {
1769  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1770  const auto& estimator_arg = ra_exe_unit_.estimator->getArgument();
1771  auto estimator_comp_count_lv = LL_INT(static_cast<int32_t>(estimator_arg.size()));
1772  auto estimator_key_lv = LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT),
1773  estimator_comp_count_lv);
1774  int32_t subkey_idx = 0;
1775  for (const auto& estimator_arg_comp : estimator_arg) {
1776  const auto estimator_arg_comp_lvs =
1777  executor_->groupByColumnCodegen(estimator_arg_comp.get(),
1778  query_mem_desc.getEffectiveKeyWidth(),
1779  co,
1780  false,
1781  0,
1782  diamond_codegen,
1783  array_loops,
1784  true);
1785  CHECK(!estimator_arg_comp_lvs.original_value);
1786  const auto estimator_arg_comp_lv = estimator_arg_comp_lvs.translated_value;
1787  // store the sub-key to the buffer
1788  LL_BUILDER.CreateStore(
1789  estimator_arg_comp_lv,
1790  LL_BUILDER.CreateGEP(
1791  estimator_key_lv->getType()->getScalarType()->getPointerElementType(),
1792  estimator_key_lv,
1793  LL_INT(subkey_idx++)));
1794  }
1795  const auto int8_ptr_ty = llvm::PointerType::get(get_int_type(8, LL_CONTEXT), 0);
1796  const auto bitmap = LL_BUILDER.CreateBitCast(&*ROW_FUNC->arg_begin(), int8_ptr_ty);
1797  const auto key_bytes = LL_BUILDER.CreateBitCast(estimator_key_lv, int8_ptr_ty);
1798  const auto estimator_comp_bytes_lv =
1799  LL_INT(static_cast<int32_t>(estimator_arg.size() * sizeof(int64_t)));
1800  const auto bitmap_size_lv =
1801  LL_INT(static_cast<uint32_t>(ra_exe_unit_.estimator->getBufferSize()));
1802  emitCall(ra_exe_unit_.estimator->getRuntimeFunctionName(),
1803  {bitmap, &*bitmap_size_lv, key_bytes, &*estimator_comp_bytes_lv});
1804 }
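// Sketch of the call emitted above (illustrative only): the group-by sub-keys are
// packed into a contiguous int64_t buffer and handed, as raw bytes, to the
// estimator's runtime function along with its bitmap, roughly
//   estimator_fn(bitmap, bitmap_size, key_bytes, estimator_arg.size() * 8);
// where estimator_fn stands for the name returned by getRuntimeFunctionName().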
1805 
1806 extern "C" RUNTIME_EXPORT void agg_count_distinct(int64_t* agg, const int64_t val) {
1807  reinterpret_cast<CountDistinctSet*>(*agg)->insert(val);
1808 }
1809 
1810 extern "C" RUNTIME_EXPORT void agg_count_distinct_skip_val(int64_t* agg,
1811  const int64_t val,
1812  const int64_t skip_val) {
1813  if (val != skip_val) {
1814  agg_count_distinct(agg, val);
1815  }
1816 }
1817 
1818 extern "C" RUNTIME_EXPORT void agg_approx_quantile(int64_t* agg, const double val) {
1819  auto* t_digest = reinterpret_cast<quantile::TDigest*>(*agg);
1820  t_digest->allocate();
1821  t_digest->add(val);
1822 }
1823 
1824 extern "C" RUNTIME_EXPORT void agg_mode_func(int64_t* agg, const int64_t val) {
1825  auto* mode_map = reinterpret_cast<AggMode*>(*agg);
1826  mode_map->add(val);
1827 }
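// Usage sketch for the runtime helpers above (illustrative only): the int64_t slot
// holds a pointer to the backing structure, so a host-side caller would do e.g.
//   CountDistinctSet vals;
//   int64_t slot = reinterpret_cast<int64_t>(&vals);
//   agg_count_distinct_skip_val(&slot, value, null_sentinel);
// where value / null_sentinel are placeholders; analogous slots point at
// quantile::TDigest and AggMode instances for agg_approx_quantile() and
// agg_mode_func().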
1828 
1829 void GroupByAndAggregate::codegenCountDistinct(
1830  const size_t target_idx,
1831  const Analyzer::Expr* target_expr,
1832  std::vector<llvm::Value*>& agg_args,
1833  const QueryMemoryDescriptor& query_mem_desc,
1834  const ExecutorDeviceType device_type) {
1835  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1836  const auto agg_info = get_target_info(target_expr, g_bigint_count);
1837  const auto& arg_ti =
1838  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1839  if (arg_ti.is_fp()) {
1840  agg_args.back() = executor_->cgen_state_->ir_builder_.CreateBitCast(
1841  agg_args.back(), get_int_type(64, executor_->cgen_state_->context_));
1842  }
1843  const auto& count_distinct_descriptor =
1844  query_mem_desc.getCountDistinctDescriptor(target_idx);
1845  CHECK(count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid);
1846  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
1847  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1848  agg_args.push_back(LL_INT(int32_t(count_distinct_descriptor.bitmap_sz_bits)));
1849  if (device_type == ExecutorDeviceType::GPU) {
1850  const auto base_dev_addr = getAdditionalLiteral(-1);
1851  const auto base_host_addr = getAdditionalLiteral(-2);
1852  agg_args.push_back(base_dev_addr);
1853  agg_args.push_back(base_host_addr);
1854  emitCall("agg_approximate_count_distinct_gpu", agg_args);
1855  } else {
1856  emitCall("agg_approximate_count_distinct", agg_args);
1857  }
1858  return;
1859  }
1860  std::string agg_fname{"agg_count_distinct"};
1861  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1862  agg_fname += "_bitmap";
1863  agg_args.push_back(LL_INT(static_cast<int64_t>(count_distinct_descriptor.min_val)));
1864  }
1865  if (agg_info.skip_null_val) {
1866  auto null_lv = executor_->cgen_state_->castToTypeIn(
1867  (arg_ti.is_fp()
1868  ? static_cast<llvm::Value*>(executor_->cgen_state_->inlineFpNull(arg_ti))
1869  : static_cast<llvm::Value*>(executor_->cgen_state_->inlineIntNull(arg_ti))),
1870  64);
1871  null_lv = executor_->cgen_state_->ir_builder_.CreateBitCast(
1872  null_lv, get_int_type(64, executor_->cgen_state_->context_));
1873  agg_fname += "_skip_val";
1874  agg_args.push_back(null_lv);
1875  }
1876  if (device_type == ExecutorDeviceType::GPU) {
1877  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1878  agg_fname += "_gpu";
1879  const auto base_dev_addr = getAdditionalLiteral(-1);
1880  const auto base_host_addr = getAdditionalLiteral(-2);
1881  agg_args.push_back(base_dev_addr);
1882  agg_args.push_back(base_host_addr);
1883  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.sub_bitmap_count)));
1884  CHECK_EQ(size_t(0),
1885  count_distinct_descriptor.bitmapPaddedSizeBytes() %
1886  count_distinct_descriptor.sub_bitmap_count);
1887  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.bitmapPaddedSizeBytes() /
1888  count_distinct_descriptor.sub_bitmap_count)));
1889  }
1890  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1891  emitCall(agg_fname, agg_args);
1892  } else {
1893  executor_->cgen_state_->emitExternalCall(
1894  agg_fname, llvm::Type::getVoidTy(LL_CONTEXT), agg_args);
1895  }
1896 }
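// The call emitted above is assembled from the descriptor and nullability, e.g.
// "agg_count_distinct_bitmap_skip_val_gpu" for a nullable bitmap-backed
// COUNT(DISTINCT) on GPU; non-bitmap (UnorderedSet) implementations take the
// emitExternalCall() path with the plain "agg_count_distinct" variants.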
1897 
1898 void GroupByAndAggregate::codegenApproxQuantile(
1899  const size_t target_idx,
1900  const Analyzer::Expr* target_expr,
1901  std::vector<llvm::Value*>& agg_args,
1902  const QueryMemoryDescriptor& query_mem_desc,
1903  const ExecutorDeviceType device_type) {
1904  if (device_type == ExecutorDeviceType::GPU) {
1905  throw QueryMustRunOnCpu();
1906  }
1907  llvm::BasicBlock *calc, *skip{nullptr};
1908  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1909  auto const arg_ti =
1910  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1911  bool const nullable = !arg_ti.get_notnull();
1912 
1913  auto* cs = executor_->cgen_state_.get();
1914  auto& irb = cs->ir_builder_;
1915  if (nullable) {
1916  auto* const null_value = cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
1917  auto* const skip_cond = arg_ti.is_fp()
1918  ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
1919  : irb.CreateICmpEQ(agg_args.back(), null_value);
1920  calc = llvm::BasicBlock::Create(cs->context_, "calc_approx_quantile");
1921  skip = llvm::BasicBlock::Create(cs->context_, "skip_approx_quantile");
1922  irb.CreateCondBr(skip_cond, skip, calc);
1923  cs->current_func_->getBasicBlockList().push_back(calc);
1924  irb.SetInsertPoint(calc);
1925  }
1926  if (!arg_ti.is_fp()) {
1927  auto const agg_info = get_target_info(target_expr, g_bigint_count);
1928  agg_args.back() = executor_->castToFP(agg_args.back(), arg_ti, agg_info.sql_type);
1929  }
1930  cs->emitExternalCall(
1931  "agg_approx_quantile", llvm::Type::getVoidTy(cs->context_), agg_args);
1932  if (nullable) {
1933  irb.CreateBr(skip);
1934  cs->current_func_->getBasicBlockList().push_back(skip);
1935  irb.SetInsertPoint(skip);
1936  }
1937 }
1938 
1939 void GroupByAndAggregate::codegenMode(const size_t target_idx,
1940  const Analyzer::Expr* target_expr,
1941  std::vector<llvm::Value*>& agg_args,
1942  const QueryMemoryDescriptor& query_mem_desc,
1943  const ExecutorDeviceType device_type) {
1944  if (device_type == ExecutorDeviceType::GPU) {
1945  throw QueryMustRunOnCpu();
1946  }
1947  llvm::BasicBlock *calc, *skip{nullptr};
1948  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1949  auto const arg_ti =
1950  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1951  bool const nullable = !arg_ti.get_notnull();
1952  bool const is_fp = arg_ti.is_fp();
1953  auto* cs = executor_->cgen_state_.get();
1954  auto& irb = cs->ir_builder_;
1955  if (nullable) {
1956  auto* const null_value =
1957  is_fp ? cs->inlineNull(arg_ti) : cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
1958  auto* const skip_cond = is_fp ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
1959  : irb.CreateICmpEQ(agg_args.back(), null_value);
1960  calc = llvm::BasicBlock::Create(cs->context_, "calc_mode");
1961  skip = llvm::BasicBlock::Create(cs->context_, "skip_mode");
1962  irb.CreateCondBr(skip_cond, skip, calc);
1963  cs->current_func_->getBasicBlockList().push_back(calc);
1964  irb.SetInsertPoint(calc);
1965  }
1966  if (is_fp) {
1967  auto* const int_type = get_int_type(8 * arg_ti.get_size(), cs->context_);
1968  agg_args.back() = irb.CreateBitCast(agg_args.back(), int_type);
1969  }
1970  // "agg_mode" collides with existing names, so the non-standard suffix "_func" is added.
1971  cs->emitExternalCall("agg_mode_func", llvm::Type::getVoidTy(cs->context_), agg_args);
1972  if (nullable) {
1973  irb.CreateBr(skip);
1974  cs->current_func_->getBasicBlockList().push_back(skip);
1975  irb.SetInsertPoint(skip);
1976  }
1977 }
1978 
1979 llvm::Value* GroupByAndAggregate::getAdditionalLiteral(const int32_t off) {
1980  CHECK_LT(off, 0);
1981  const auto lit_buff_lv = get_arg_by_name(ROW_FUNC, "literals");
1982  auto* bit_cast = LL_BUILDER.CreateBitCast(
1983  lit_buff_lv, llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0));
1984  auto* gep =
1985  LL_BUILDER.CreateGEP(bit_cast->getType()->getScalarType()->getPointerElementType(),
1986  bit_cast,
1987  LL_INT(off));
1988  return LL_BUILDER.CreateLoad(gep->getType()->getPointerElementType(), gep);
1989 }
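// The negative offset above indexes backwards into the "literals" buffer viewed as
// int64_t; codegenCountDistinct() uses off == -1 for the GPU bitmap's device base
// address and off == -2 for its host base address.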
1990 
1991 std::vector<llvm::Value*> GroupByAndAggregate::codegenAggArg(
1992  const Analyzer::Expr* target_expr,
1993  const CompilationOptions& co) {
1994  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1995  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
1996  const auto func_expr = dynamic_cast<const Analyzer::FunctionOper*>(target_expr);
1997  const auto arr_expr = dynamic_cast<const Analyzer::ArrayExpr*>(target_expr);
1998 
1999  // TODO(alex): handle arrays uniformly?
2000  CodeGenerator code_generator(executor_);
2001  if (target_expr) {
2002  const auto& target_ti = target_expr->get_type_info();
2003  if (target_ti.is_buffer() &&
2004  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
2005  const auto target_lvs =
2006  agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
2007  : code_generator.codegen(
2008  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
2009  if (!func_expr && !arr_expr) {
2010  // Anything coming through the chunk transport here was generated from a source
2011  // other than an ARRAY[] expression
2012  if (target_ti.is_bytes()) {
2013  CHECK_EQ(size_t(3), target_lvs.size());
2014  return {target_lvs[1], target_lvs[2]};
2015  }
2016  CHECK(target_ti.is_array());
2017  CHECK_EQ(size_t(1), target_lvs.size());
2018  CHECK(!agg_expr || agg_expr->get_aggtype() == kSAMPLE);
2019  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
2020  const auto i8p_ty =
2021  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
2022  const auto& elem_ti = target_ti.get_elem_type();
2023  return {
2024  executor_->cgen_state_->emitExternalCall(
2025  "array_buff",
2026  i8p_ty,
2027  {target_lvs.front(), code_generator.posArg(target_expr)}),
2028  executor_->cgen_state_->emitExternalCall(
2029  "array_size",
2030  i32_ty,
2031  {target_lvs.front(),
2032  code_generator.posArg(target_expr),
2033  executor_->cgen_state_->llInt(log2_bytes(elem_ti.get_logical_size()))})};
2034  } else {
2035  if (agg_expr) {
2036  throw std::runtime_error(
2037  "Using array[] operator as argument to an aggregate operator is not "
2038  "supported");
2039  }
2040  CHECK(func_expr || arr_expr);
2041  if (dynamic_cast<const Analyzer::FunctionOper*>(target_expr)) {
2042  CHECK_EQ(size_t(1), target_lvs.size());
2043  const auto prefix = target_ti.get_buffer_name();
2044  CHECK(target_ti.is_array() || target_ti.is_bytes());
2045  const auto target_lv = LL_BUILDER.CreateLoad(
2046  target_lvs[0]->getType()->getPointerElementType(), target_lvs[0]);
2047  // const auto target_lv_type = target_lvs[0]->getType();
2048  // CHECK(target_lv_type->isStructTy());
2049  // CHECK_EQ(target_lv_type->getNumContainedTypes(), 3u);
2050  const auto i8p_ty = llvm::PointerType::get(
2051  get_int_type(8, executor_->cgen_state_->context_), 0);
2052  const auto ptr = LL_BUILDER.CreatePointerCast(
2053  LL_BUILDER.CreateExtractValue(target_lv, 0), i8p_ty);
2054  const auto size = LL_BUILDER.CreateExtractValue(target_lv, 1);
2055  const auto null_flag = LL_BUILDER.CreateExtractValue(target_lv, 2);
2056  const auto nullcheck_ok_bb =
2057  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_nullcheck_ok_bb", CUR_FUNC);
2058  const auto nullcheck_fail_bb = llvm::BasicBlock::Create(
2059  LL_CONTEXT, prefix + "_nullcheck_fail_bb", CUR_FUNC);
2060 
2061  // TODO(adb): probably better to zext the bool
2062  const auto nullcheck = LL_BUILDER.CreateICmpEQ(
2063  null_flag, executor_->cgen_state_->llInt(static_cast<int8_t>(1)));
2064  LL_BUILDER.CreateCondBr(nullcheck, nullcheck_fail_bb, nullcheck_ok_bb);
2065 
2066  const auto ret_bb =
2067  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_return", CUR_FUNC);
2068  LL_BUILDER.SetInsertPoint(ret_bb);
2069  auto result_phi = LL_BUILDER.CreatePHI(i8p_ty, 2, prefix + "_ptr_return");
2070  result_phi->addIncoming(ptr, nullcheck_ok_bb);
2071  const auto null_arr_sentinel = LL_BUILDER.CreateIntToPtr(
2072  executor_->cgen_state_->llInt(static_cast<int8_t>(0)), i8p_ty);
2073  result_phi->addIncoming(null_arr_sentinel, nullcheck_fail_bb);
2074  LL_BUILDER.SetInsertPoint(nullcheck_ok_bb);
2075  executor_->cgen_state_->emitExternalCall(
2076  "register_buffer_with_executor_rsm",
2077  llvm::Type::getVoidTy(executor_->cgen_state_->context_),
2078  {executor_->cgen_state_->llInt(reinterpret_cast<int64_t>(executor_)), ptr});
2079  LL_BUILDER.CreateBr(ret_bb);
2080  LL_BUILDER.SetInsertPoint(nullcheck_fail_bb);
2081  LL_BUILDER.CreateBr(ret_bb);
2082 
2083  LL_BUILDER.SetInsertPoint(ret_bb);
2084  return {result_phi, size};
2085  }
2086  CHECK_EQ(size_t(2), target_lvs.size());
2087  return {target_lvs[0], target_lvs[1]};
2088  }
2089  }
2090  if (target_ti.is_geometry() &&
2091  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
2092  auto generate_coord_lvs =
2093  [&](auto* selected_target_expr,
2094  bool const fetch_columns) -> std::vector<llvm::Value*> {
2095  const auto target_lvs =
2096  code_generator.codegen(selected_target_expr, fetch_columns, co);
2097  if (dynamic_cast<const Analyzer::GeoOperator*>(target_expr) &&
2098  target_expr->get_type_info().is_geometry()) {
2099  // return a pointer to the temporary alloca
2100  return target_lvs;
2101  }
2102  const auto geo_uoper = dynamic_cast<const Analyzer::GeoUOper*>(target_expr);
2103  const auto geo_binoper = dynamic_cast<const Analyzer::GeoBinOper*>(target_expr);
2104  if (geo_uoper || geo_binoper) {
2105  CHECK(target_expr->get_type_info().is_geometry());
2106  CHECK_EQ(2 * static_cast<size_t>(target_ti.get_physical_coord_cols()),
2107  target_lvs.size());
2108  return target_lvs;
2109  }
2110  CHECK_EQ(static_cast<size_t>(target_ti.get_physical_coord_cols()),
2111  target_lvs.size());
2112 
2113  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
2114  const auto i8p_ty =
2115  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
2116  std::vector<llvm::Value*> coords;
2117  size_t ctr = 0;
2118  for (const auto& target_lv : target_lvs) {
2119  // TODO(adb): consider adding a utility to sqltypes so we can get the types of
2120  // the physical coords cols based on the sqltype (e.g. TINYINT for col 0, INT
2121  // for col 1 for pols / mpolys, etc). Hardcoding for now. The first array is the
2122  // coords array (TINYINT); subsequent arrays are regular INT.
2123 
2124  const size_t elem_sz = ctr == 0 ? 1 : 4;
2125  ctr++;
2126  int32_t fixlen = -1;
2127  if (target_ti.get_type() == kPOINT) {
2128  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
2129  if (col_var) {
2130  const auto coords_cd = executor_->getPhysicalColumnDescriptor(col_var, 1);
2131  if (coords_cd && coords_cd->columnType.get_type() == kARRAY) {
2132  fixlen = coords_cd->columnType.get_size();
2133  }
2134  }
2135  }
2136  if (fixlen > 0) {
2137  coords.push_back(executor_->cgen_state_->emitExternalCall(
2138  "fast_fixlen_array_buff",
2139  i8p_ty,
2140  {target_lv, code_generator.posArg(selected_target_expr)}));
2141  coords.push_back(executor_->cgen_state_->llInt(int64_t(fixlen)));
2142  continue;
2143  }
2144  coords.push_back(executor_->cgen_state_->emitExternalCall(
2145  "array_buff",
2146  i8p_ty,
2147  {target_lv, code_generator.posArg(selected_target_expr)}));
2148  coords.push_back(executor_->cgen_state_->emitExternalCall(
2149  "array_size",
2150  i32_ty,
2151  {target_lv,
2152  code_generator.posArg(selected_target_expr),
2153  executor_->cgen_state_->llInt(log2_bytes(elem_sz))}));
2154  }
2155  return coords;
2156  };
2157 
2158  if (agg_expr) {
2159  return generate_coord_lvs(agg_expr->get_arg(), true);
2160  } else {
2161  return generate_coord_lvs(target_expr,
2162  !executor_->plan_state_->allow_lazy_fetch_);
2163  }
2164  }
2165  }
2166  return agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
2167  : code_generator.codegen(
2168  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
2169 }
2170 
2171 llvm::Value* GroupByAndAggregate::emitCall(const std::string& fname,
2172  const std::vector<llvm::Value*>& args) {
2173  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2174  return executor_->cgen_state_->emitCall(fname, args);
2175 }
2176 
2177 void GroupByAndAggregate::checkErrorCode(llvm::Value* retCode) {
2178  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2179  auto zero_const = llvm::ConstantInt::get(retCode->getType(), 0, true);
2180  auto rc_check_condition = executor_->cgen_state_->ir_builder_.CreateICmp(
2181  llvm::ICmpInst::ICMP_EQ, retCode, zero_const);
2182 
2183  executor_->cgen_state_->emitErrorCheck(rc_check_condition, retCode, "rc");
2184 }
2185 
2186 #undef CUR_FUNC
2187 #undef ROW_FUNC
2188 #undef LL_FP
2189 #undef LL_INT
2190 #undef LL_BOOL
2191 #undef LL_BUILDER
2192 #undef LL_CONTEXT
2193 
2194 size_t GroupByAndAggregate::shard_count_for_top_groups(
2195  const RelAlgExecutionUnit& ra_exe_unit,
2196  const Catalog_Namespace::Catalog& catalog) {
2197  if (ra_exe_unit.sort_info.order_entries.size() != 1 || !ra_exe_unit.sort_info.limit) {
2198  return 0;
2199  }
2200  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
2201  const auto grouped_col_expr =
2202  dynamic_cast<const Analyzer::ColumnVar*>(group_expr.get());
2203  if (!grouped_col_expr) {
2204  continue;
2205  }
2206  if (grouped_col_expr->get_table_id() <= 0) {
2207  return 0;
2208  }
2209  const auto td = catalog.getMetadataForTable(grouped_col_expr->get_table_id());
2210  if (td->shardedColumnId == grouped_col_expr->get_column_id()) {
2211  return td->nShards;
2212  }
2213  }
2214  return 0;
2215 }
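// Note on the function above: a non-zero shard count is returned only for a single
// ORDER BY entry with a LIMIT, when one of the group-by columns is its table's shard
// column; every other shape falls back to 0 (no per-shard top-k handling).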