GroupByAndAggregate.cpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "GroupByAndAggregate.h"
18 #include "AggregateUtils.h"
19 
20 #include "CardinalityEstimator.h"
21 #include "CodeGenerator.h"
23 #include "ExpressionRange.h"
24 #include "ExpressionRewrite.h"
25 #include "GpuInitGroups.h"
26 #include "InPlaceSort.h"
28 #include "MaxwellCodegenPatch.h"
30 #include "TargetExprBuilder.h"
31 
32 #include "../CudaMgr/CudaMgr.h"
33 #include "../Shared/checked_alloc.h"
34 #include "../Shared/funcannotations.h"
35 #include "../Utils/ChunkIter.h"
37 #include "Execute.h"
38 #include "QueryTemplateGenerator.h"
39 #include "RuntimeFunctions.h"
40 #include "Shared/misc.h"
41 #include "StreamingTopN.h"
42 #include "TopKSort.h"
43 #include "WindowContext.h"
44 
45 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
46 
47 #include <cstring> // strcat()
48 #include <limits>
49 #include <numeric>
50 #include <string_view>
51 #include <thread>
52 
53 bool g_cluster{false};
54 bool g_bigint_count{false};
57 extern int64_t g_bitmap_memory_limit;
58 extern size_t g_leaf_count;
59 
60 bool ColRangeInfo::isEmpty() const {
61  return min == 0 && max == -1;
62 }
63 
64 std::ostream& operator<<(std::ostream& out, const ColRangeInfo& info) {
65  out << "Hash Type = " << info.hash_type_ << " min = " << info.min
66  << " max = " << info.max << " bucket = " << info.bucket
67  << " has_nulls = " << info.has_nulls << "\n";
68  return out;
69 }
70 
71 std::ostream& operator<<(std::ostream& out, const CountDistinctImplType& type) {
72  switch (type) {
73  case CountDistinctImplType::Invalid:
74  out << "Invalid";
75  break;
76  case CountDistinctImplType::Bitmap:
77  out << "Bitmap";
78  break;
79  case CountDistinctImplType::UnorderedSet:
80  out << "UnorderedSet";
81  break;
82  default:
83  out << "<Unknown Type>";
84  break;
85  }
86  return out;
87 }
88 
89 std::ostream& operator<<(std::ostream& out, const CountDistinctDescriptor& desc) {
90  out << "Type = " << desc.impl_type_ << " min val = " << desc.min_val
91  << " bitmap_sz_bits = " << desc.bitmap_sz_bits
92  << " bool approximate = " << desc.approximate
93  << " device_type = " << desc.device_type
94  << " sub_bitmap_count = " << desc.sub_bitmap_count;
95  return out;
96 }
97 
98 namespace {
99 
100 int32_t get_agg_count(const std::vector<Analyzer::Expr*>& target_exprs) {
101  int32_t agg_count{0};
102  for (auto target_expr : target_exprs) {
103  CHECK(target_expr);
104  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
105  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
106  const auto& ti = target_expr->get_type_info();
107  if (ti.is_buffer()) {
108  agg_count += 2;
109  } else if (ti.is_geometry()) {
110  agg_count += ti.get_physical_coord_cols() * 2;
111  } else {
112  ++agg_count;
113  }
114  continue;
115  }
116  if (agg_expr && agg_expr->get_aggtype() == kAVG) {
117  agg_count += 2;
118  } else {
119  ++agg_count;
120  }
121  }
122  return agg_count;
123 }
124 
125 bool expr_is_rowid(const Analyzer::Expr* expr, const Catalog_Namespace::Catalog& cat) {
126  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
127  if (!col) {
128  return false;
129  }
130  const auto cd =
131  get_column_descriptor_maybe(col->get_column_id(), col->get_table_id(), cat);
132  if (!cd || !cd->isVirtualCol) {
133  return false;
134  }
135  CHECK_EQ("rowid", cd->columnName);
136  return true;
137 }
138 
139 bool has_count_distinct(const RelAlgExecutionUnit& ra_exe_unit) {
140  for (const auto& target_expr : ra_exe_unit.target_exprs) {
141  const auto agg_info = get_target_info(target_expr, g_bigint_count);
142  if (agg_info.is_agg && is_distinct_target(agg_info)) {
143  return true;
144  }
145  }
146  return false;
147 }
148 
149 bool is_column_range_too_big_for_perfect_hash(const ColRangeInfo& col_range_info,
150  const int64_t max_entry_count) {
151  try {
152  return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
153  checked_int64_t(col_range_info.min)) >= max_entry_count;
154  } catch (...) {
155  return true;
156  }
157 }
158 
159 bool cardinality_estimate_less_than_column_range(const int64_t cardinality_estimate,
160  const ColRangeInfo& col_range_info) {
161  try {
162  // the cardinality estimate is the size of the baseline hash table. further penalize
163  // the baseline hash table by a factor of 2x due to overhead in computing baseline
164  // hash. This has the overall effect of penalizing baseline hash over perfect hash by
165  // 4x; i.e. if the cardinality of the filtered data is less than 25% of the entry
166  // count of the column, we use baseline hash on the filtered set
167  return checked_int64_t(cardinality_estimate) * 2 <
168  static_cast<int64_t>(checked_int64_t(col_range_info.max) -
169  checked_int64_t(col_range_info.min));
170  } catch (...) {
171  return false;
172  }
173 }
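// Editor's illustration (not part of the original file): a quick worked example of
// the check above. For a column with range [0, 1000) the span is 1000; a filtered
// cardinality estimate of 400 gives 400 * 2 = 800 < 1000, so the baseline hash
// layout is considered, while an estimate of 600 gives 1200 >= 1000 and the
// perfect-hash sized layout is kept.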
174 
175 ColRangeInfo get_expr_range_info(const RelAlgExecutionUnit& ra_exe_unit,
176  const std::vector<InputTableInfo>& query_infos,
177  const Analyzer::Expr* expr,
178  Executor* executor) {
179  if (!expr) {
180  return {QueryDescriptionType::Projection, 0, 0, 0, false};
181  }
182 
183  const auto expr_range = getExpressionRange(
184  expr, query_infos, executor, boost::make_optional(ra_exe_unit.simple_quals));
185  switch (expr_range.getType()) {
186  case ExpressionRangeType::Integer: {
187  if (expr_range.getIntMin() > expr_range.getIntMax()) {
188  return {
189  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
190  }
191  return {QueryDescriptionType::GroupByPerfectHash,
192  expr_range.getIntMin(),
193  expr_range.getIntMax(),
194  expr_range.getBucket(),
195  expr_range.hasNulls()};
196  }
197  case ExpressionRangeType::Float:
198  case ExpressionRangeType::Double: {
199  if (expr_range.getFpMin() > expr_range.getFpMax()) {
200  return {
201  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
202  }
203  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
204  }
205  case ExpressionRangeType::Invalid:
206  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
207  default:
208  CHECK(false);
209  }
210  CHECK(false);
211  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
212 }
213 
214 } // namespace
215 
216 ColRangeInfo GroupByAndAggregate::getColRangeInfo() {
217  // Use baseline layout more eagerly on the GPU if the query uses count distinct,
218  // because our HyperLogLog implementation is 4x less memory efficient on GPU.
219  // Technically, this only applies to APPROX_COUNT_DISTINCT, but in practice we
220  // can expect this to be true anyway for grouped queries since the precise version
221  // uses significantly more memory.
222  const int64_t baseline_threshold =
224  if (ra_exe_unit_.groupby_exprs.size() != 1) {
225  try {
226  checked_int64_t cardinality{1};
227  bool has_nulls{false};
228  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
229  auto col_range_info = get_expr_range_info(
230  ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
231  if (col_range_info.hash_type_ != QueryDescriptionType::GroupByPerfectHash) {
232  // going through baseline hash if a non-integer type is encountered
233  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
234  }
235  auto crt_col_cardinality = getBucketedCardinality(col_range_info);
236  CHECK_GE(crt_col_cardinality, 0);
237  cardinality *= crt_col_cardinality;
238  if (col_range_info.has_nulls) {
239  has_nulls = true;
240  }
241  }
242  // For zero or high cardinalities, use baseline layout.
243  if (!cardinality || cardinality > baseline_threshold) {
244  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
245  }
246  return {QueryDescriptionType::GroupByPerfectHash,
247  0,
248  int64_t(cardinality),
249  0,
250  has_nulls};
251  } catch (...) { // overflow when computing cardinality
252  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
253  }
254  }
255  // For single column groupby on high timestamps, force baseline hash due to wide ranges
256  // we are likely to encounter when applying quals to the expression range
257  // TODO: consider allowing TIMESTAMP(9) (nanoseconds) with quals to use perfect hash if
258  // the range is small enough
259  if (ra_exe_unit_.groupby_exprs.front() &&
260  ra_exe_unit_.groupby_exprs.front()->get_type_info().is_high_precision_timestamp() &&
261  ra_exe_unit_.simple_quals.size() > 0) {
262  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
263  }
264  const auto col_range_info = get_expr_range_info(
265  ra_exe_unit_, query_infos_, ra_exe_unit_.groupby_exprs.front().get(), executor_);
266  if (!ra_exe_unit_.groupby_exprs.front()) {
267  return col_range_info;
268  }
269  static const int64_t MAX_BUFFER_SIZE = 1 << 30;
270  const int64_t col_count =
271  ra_exe_unit_.groupby_exprs.size() + ra_exe_unit_.target_exprs.size();
272  int64_t max_entry_count = MAX_BUFFER_SIZE / (col_count * sizeof(int64_t));
273  if (has_count_distinct(ra_exe_unit_)) {
274  max_entry_count = std::min(max_entry_count, baseline_threshold);
275  }
276  const auto& groupby_expr_ti = ra_exe_unit_.groupby_exprs.front()->get_type_info();
277  if (groupby_expr_ti.is_string() && !col_range_info.bucket) {
278  CHECK(groupby_expr_ti.get_compression() == kENCODING_DICT);
279 
280  const bool has_filters =
281  !ra_exe_unit_.quals.empty() || !ra_exe_unit_.simple_quals.empty();
282  if (has_filters &&
283  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
284  // if filters are present, we can use the filter to narrow the cardinality of the
285  // group by in the case of ranges too big for perfect hash. Otherwise, we are better
286  // off attempting perfect hash (since we know the range will be made of
287  // monotonically increasing numbers from min to max for dictionary encoded strings)
288  // and failing later due to excessive memory use.
289  // Check the conditions where baseline hash can provide a performance increase and
290  // return baseline hash (potentially forcing an estimator query) as the range type.
291  // Otherwise, return col_range_info which will likely be perfect hash, though could
292  // be baseline from a previous call of this function prior to the estimator query.
293  if (!ra_exe_unit_.sort_info.order_entries.empty()) {
294  // TODO(adb): allow some sorts to pass through this block by centralizing sort
295  // algorithm decision making
296  if (has_count_distinct(ra_exe_unit_) &&
297  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
298  // always use baseline hash for column range too big for perfect hash with count
299  // distinct descriptors. We will need 8GB of CPU memory minimum for the perfect
300  // hash group by in this case.
301  return {QueryDescriptionType::GroupByBaselineHash,
302  col_range_info.min,
303  col_range_info.max,
304  0,
305  col_range_info.has_nulls};
306  } else {
307  // use original col range for sort
308  return col_range_info;
309  }
310  }
311  // if filters are present and the filtered range is less than the cardinality of
312  // the column, consider baseline hash
313  if (group_cardinality_estimation_ &&
314  cardinality_estimate_less_than_column_range(*group_cardinality_estimation_,
315  col_range_info)) {
316  return {QueryDescriptionType::GroupByBaselineHash,
317  col_range_info.min,
318  col_range_info.max,
319  0,
320  col_range_info.has_nulls};
321  }
322  }
323  } else if ((!expr_is_rowid(ra_exe_unit_.groupby_exprs.front().get(),
324  *executor_->catalog_)) &&
325  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count) &&
326  !col_range_info.bucket) {
327  return {QueryDescriptionType::GroupByBaselineHash,
328  col_range_info.min,
329  col_range_info.max,
330  0,
331  col_range_info.has_nulls};
332  }
333  return col_range_info;
334 }
335 
336 int64_t GroupByAndAggregate::getBucketedCardinality(const ColRangeInfo& col_range_info) {
337  checked_int64_t crt_col_cardinality =
338  checked_int64_t(col_range_info.max) - checked_int64_t(col_range_info.min);
339  if (col_range_info.bucket) {
340  crt_col_cardinality /= col_range_info.bucket;
341  }
342  return static_cast<int64_t>(crt_col_cardinality +
343  (1 + (col_range_info.has_nulls ? 1 : 0)));
344 }
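// Editor's illustration (not part of the original file): assuming a hypothetical
// ColRangeInfo with min = 10, max = 99, bucket = 10 and has_nulls = true, the
// computation above yields (99 - 10) / 10 = 8, then 8 + 1 + 1 = 10 entries: one
// slot per bucket step, one so the upper bound is inclusive, and one for the null key.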
345 
346 namespace {
347 // Like getBucketedCardinality() without counting nulls.
348 int64_t get_bucketed_cardinality_without_nulls(const ColRangeInfo& col_range_info) {
349  if (col_range_info.min <= col_range_info.max) {
350  size_t size = col_range_info.max - col_range_info.min;
351  if (col_range_info.bucket) {
352  size /= col_range_info.bucket;
353  }
354  if (size >= static_cast<size_t>(std::numeric_limits<int64_t>::max())) {
355  // try to use unordered_set instead of crashing due to CHECK failure
356  // i.e., CHECK_LT(size, std::numeric_limits<int64_t>::max());
357  return 0;
358  }
359  return static_cast<int64_t>(size + 1);
360  } else {
361  return 0;
362  }
363 }
364 } // namespace
365 
366 #define LL_CONTEXT executor_->cgen_state_->context_
367 #define LL_BUILDER executor_->cgen_state_->ir_builder_
368 #define LL_BOOL(v) executor_->cgen_state_->llBool(v)
369 #define LL_INT(v) executor_->cgen_state_->llInt(v)
370 #define LL_FP(v) executor_->cgen_state_->llFp(v)
371 #define ROW_FUNC executor_->cgen_state_->row_func_
372 #define CUR_FUNC executor_->cgen_state_->current_func_
373 
374 GroupByAndAggregate::GroupByAndAggregate(
375  Executor* executor,
376  const ExecutorDeviceType device_type,
377  const RelAlgExecutionUnit& ra_exe_unit,
378  const std::vector<InputTableInfo>& query_infos,
379  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
380  const std::optional<int64_t>& group_cardinality_estimation)
381  : executor_(executor)
382  , ra_exe_unit_(ra_exe_unit)
383  , query_infos_(query_infos)
384  , row_set_mem_owner_(row_set_mem_owner)
385  , device_type_(device_type)
386  , group_cardinality_estimation_(group_cardinality_estimation) {
387  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
388  if (!groupby_expr) {
389  continue;
390  }
391  const auto& groupby_ti = groupby_expr->get_type_info();
392  if (groupby_ti.is_bytes()) {
393  throw std::runtime_error(
394  "Cannot group by string columns which are not dictionary encoded.");
395  }
396  if (groupby_ti.is_buffer()) {
397  throw std::runtime_error("Group by buffer not supported");
398  }
399  if (groupby_ti.is_geometry()) {
400  throw std::runtime_error("Group by geometry not supported");
401  }
402  }
403 }
404 
405 int64_t GroupByAndAggregate::getShardedTopBucket(const ColRangeInfo& col_range_info,
406  const size_t shard_count) const {
407  size_t device_count{0};
408  if (device_type_ == ExecutorDeviceType::GPU) {
409  device_count = executor_->cudaMgr()->getDeviceCount();
410  CHECK_GT(device_count, 0u);
411  }
412 
413  int64_t bucket{col_range_info.bucket};
414 
415  if (shard_count) {
416  CHECK(!col_range_info.bucket);
417  /*
418  when a node has fewer devices than shard count,
419  a) In a distributed setup, the minimum distance between two keys would be
420  device_count because shards are stored consecutively across the physical tables,
421  i.e if a shard column has values 0 to 9, and 3 shards on each leaf, then node 1
422  would have values: 0,1,2,6,7,8 and node 2 would have values: 3,4,5,9. If each leaf
423  node has only 1 device, in this case, all the keys from each node are loaded on
424  the device each.
425 
426  b) In a single node setup, the distance would be minimum of device_count or
427  difference of device_count - shard_count. For example: If a single node server
428  running on 3 devices a shard column has values 0 to 9 in a table with 4 shards,
429  device to fragment keys mapping would be: device 1 - 4,8,3,7 device 2 - 1,5,9
430  device 3 - 2, 6 The bucket value would be 4(shards) - 3(devices) = 1 i.e. minimum
431  of device_count or difference.
432 
433  When a node has device count equal to or more than shard count then the
434  minimum distance is always at least shard_count * no of leaf nodes.
435  */
436  if (device_count < shard_count) {
437  bucket = g_leaf_count ? std::max(device_count, static_cast<size_t>(1))
438  : std::min(device_count, shard_count - device_count);
439  } else {
440  bucket = shard_count * std::max(g_leaf_count, static_cast<size_t>(1));
441  }
442  }
443 
444  return bucket;
445 }
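// Editor's illustration (not part of the original file): plugging the numbers from
// the comment above into the code, a single-node server (g_leaf_count == 0) with
// 3 devices and 4 shards takes the device_count < shard_count branch and computes
// bucket = std::min(3, 4 - 3) = 1, while a node with at least as many devices as
// shards on a 2-leaf cluster computes bucket = 4 * std::max(g_leaf_count, 1) = 8.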
446 
447 namespace {
448 
459  const std::vector<InputTableInfo>& query_infos,
460  const bool is_group_by,
461  Executor* executor) {
462  bool keyless{true}, found{false};
463  int32_t num_agg_expr{0};
464  int32_t index{0};
465  for (const auto target_expr : ra_exe_unit.target_exprs) {
466  const auto agg_info = get_target_info(target_expr, g_bigint_count);
467  const auto chosen_type = get_compact_type(agg_info);
468  if (agg_info.is_agg) {
469  num_agg_expr++;
470  }
471  if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
472  auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
473  CHECK(agg_expr);
474  const auto arg_expr = agg_arg(target_expr);
475  const bool float_argument_input = takes_float_argument(agg_info);
476  switch (agg_info.agg_kind) {
477  case kAVG:
478  ++index;
479  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
480  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
481  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
482  expr_range_info.hasNulls()) {
483  break;
484  }
485  }
486  found = true;
487  break;
488  case kCOUNT:
489  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
490  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
491  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
492  expr_range_info.hasNulls()) {
493  break;
494  }
495  }
496  found = true;
497  break;
498  case kSUM: {
499  auto arg_ti = arg_expr->get_type_info();
500  if (constrained_not_null(arg_expr, ra_exe_unit.quals)) {
501  arg_ti.set_notnull(true);
502  }
503  if (!arg_ti.get_notnull()) {
504  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
505  if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
506  !expr_range_info.hasNulls()) {
507  found = true;
508  }
509  } else {
510  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
511  switch (expr_range_info.getType()) {
514  if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
515  found = true;
516  }
517  break;
519  if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
520  found = true;
521  }
522  break;
523  default:
524  break;
525  }
526  }
527  break;
528  }
529  case kMIN: {
530  CHECK(agg_expr && agg_expr->get_arg());
531  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
532  if (arg_ti.is_string() || arg_ti.is_buffer()) {
533  break;
534  }
535  auto expr_range_info =
536  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
537  auto init_max = get_agg_initial_val(agg_info.agg_kind,
538  chosen_type,
539  is_group_by || float_argument_input,
540  float_argument_input ? sizeof(float) : 8);
541  switch (expr_range_info.getType()) {
544  auto double_max =
545  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
546  if (expr_range_info.getFpMax() < double_max) {
547  found = true;
548  }
549  break;
550  }
552  if (expr_range_info.getIntMax() < init_max) {
553  found = true;
554  }
555  break;
556  default:
557  break;
558  }
559  break;
560  }
561  case kMAX: {
562  CHECK(agg_expr && agg_expr->get_arg());
563  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
564  if (arg_ti.is_string() || arg_ti.is_buffer()) {
565  break;
566  }
567  auto expr_range_info =
568  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
569  // NULL sentinel and init value for kMAX are identical, which results in
570  // ambiguity in detecting empty keys in presence of nulls.
571  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
572  expr_range_info.hasNulls()) {
573  break;
574  }
575  auto init_min = get_agg_initial_val(agg_info.agg_kind,
576  chosen_type,
577  is_group_by || float_argument_input,
578  float_argument_input ? sizeof(float) : 8);
579  switch (expr_range_info.getType()) {
582  auto double_min =
583  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
584  if (expr_range_info.getFpMin() > double_min) {
585  found = true;
586  }
587  break;
588  }
590  if (expr_range_info.getIntMin() > init_min) {
591  found = true;
592  }
593  break;
594  default:
595  break;
596  }
597  break;
598  }
599  default:
600  keyless = false;
601  break;
602  }
603  }
604  if (!keyless) {
605  break;
606  }
607  if (!found) {
608  ++index;
609  }
610  }
611 
612  // shouldn't use keyless for projection only
613  return {
614  keyless && found,
615  index,
616  };
617 }
618 
619 CountDistinctDescriptors init_count_distinct_descriptors(
620  const RelAlgExecutionUnit& ra_exe_unit,
621  const std::vector<InputTableInfo>& query_infos,
622  const ColRangeInfo& group_by_range_info,
623  const ExecutorDeviceType device_type,
624  Executor* executor) {
625  CountDistinctDescriptors count_distinct_descriptors;
626  auto compute_bytes_per_group =
627  [](size_t bitmap_sz, size_t sub_bitmap_count, ExecutorDeviceType device_type) {
628  size_t effective_size_bytes = (bitmap_sz + 7) / 8;
629  const auto padded_size =
630  (device_type == ExecutorDeviceType::GPU || sub_bitmap_count > 1)
631  ? align_to_int64(effective_size_bytes)
632  : effective_size_bytes;
633  return padded_size * sub_bitmap_count;
634  };
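// Editor's illustration (not part of the original file): for a hypothetical
// bitmap_sz of 1000 bits with sub_bitmap_count = 4 on GPU, the lambda above
// computes (1000 + 7) / 8 = 125 bytes, pads it to the next int64 boundary
// (128 bytes), and returns 128 * 4 = 512 bytes per group; on CPU with a single
// sub-bitmap the unpadded 125 bytes would be returned.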
635  for (size_t i = 0; i < ra_exe_unit.target_exprs.size(); i++) {
636  const auto target_expr = ra_exe_unit.target_exprs[i];
637  auto agg_info = get_target_info(target_expr, g_bigint_count);
638  if (is_distinct_target(agg_info)) {
639  CHECK(agg_info.is_agg);
640  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
641  const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
642  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
643  if (arg_ti.is_bytes()) {
644  throw std::runtime_error(
645  "Strings must be dictionary-encoded for COUNT(DISTINCT).");
646  }
647  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_buffer()) {
648  throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
649  }
650  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
651  throw std::runtime_error(
652  "APPROX_COUNT_DISTINCT on geometry columns not supported");
653  }
654  if (agg_info.is_distinct && arg_ti.is_geometry()) {
655  throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
656  }
657  ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
658  auto arg_range_info =
659  arg_ti.is_fp() ? no_range_info
660  : get_expr_range_info(
661  ra_exe_unit, query_infos, agg_expr->get_arg(), executor);
662  const auto it = ra_exe_unit.target_exprs_original_type_infos.find(i);
663  if (it != ra_exe_unit.target_exprs_original_type_infos.end()) {
664  const auto& original_target_expr_ti = it->second;
665  if (arg_ti.is_integer() && original_target_expr_ti.get_type() == kDATE &&
666  original_target_expr_ti.get_compression() == kENCODING_DATE_IN_DAYS) {
667  // manually encode the col range of date col if necessary
668  // (see conditionally_change_arg_to_int_type function in RelAlgExecutor.cpp)
669  auto is_date_value_not_encoded = [&original_target_expr_ti](int64_t date_val) {
670  if (original_target_expr_ti.get_comp_param() == 16) {
671  return date_val < INT16_MIN || date_val > INT16_MAX;
672  } else {
673  return date_val < INT32_MIN || date_val > INT32_MAX;
674  }
675  };
676  if (is_date_value_not_encoded(arg_range_info.min)) {
677  // chunk metadata of the date column contains decoded value
678  // so we manually encode it again here to represent its column range correctly
679  arg_range_info.min =
681  }
682  if (is_date_value_not_encoded(arg_range_info.max)) {
683  arg_range_info.max =
685  }
686  // now we manually encode the value, so we need to invalidate bucket value
687  // i.e., 86000 -> 0, to correctly calculate the size of bitmap
688  arg_range_info.bucket = 0;
689  }
690  }
691 
692  CountDistinctImplType count_distinct_impl_type{CountDistinctImplType::UnorderedSet};
693  int64_t bitmap_sz_bits{0};
694  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
695  const auto error_rate_expr = agg_expr->get_arg1();
696  if (error_rate_expr) {
697  CHECK(error_rate_expr->get_type_info().get_type() == kINT);
698  auto const error_rate =
699  dynamic_cast<Analyzer::Constant const*>(error_rate_expr.get());
700  CHECK(error_rate);
701  CHECK_GE(error_rate->get_constval().intval, 1);
702  bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
703  } else {
704  bitmap_sz_bits = g_hll_precision_bits;
705  }
706  }
707  if (arg_range_info.isEmpty()) {
708  count_distinct_descriptors.emplace_back(
709  CountDistinctDescriptor{CountDistinctImplType::Bitmap,
710  0,
711  64,
712  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
713  device_type,
714  1});
715  continue;
716  }
717  const auto sub_bitmap_count =
718  get_count_distinct_sub_bitmap_count(bitmap_sz_bits, ra_exe_unit, device_type);
719  size_t worst_case_num_groups{1};
720  if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
721  !(arg_ti.is_buffer() || arg_ti.is_geometry())) { // TODO(alex): allow bitmap
722  // implementation for arrays
723  count_distinct_impl_type = CountDistinctImplType::Bitmap;
724  if (shared::is_any<kCOUNT, kCOUNT_IF>(agg_info.agg_kind)) {
725  bitmap_sz_bits = get_bucketed_cardinality_without_nulls(arg_range_info);
726  if (bitmap_sz_bits <= 0 || g_bitmap_memory_limit <= bitmap_sz_bits) {
727  count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
728  }
729  // check a potential OOM when using bitmap-based approach
730  const auto total_bytes_per_entry =
731  compute_bytes_per_group(bitmap_sz_bits, sub_bitmap_count, device_type);
732  const auto range_bucket = std::max(group_by_range_info.bucket, (int64_t)1);
733  const auto maximum_num_groups =
734  (group_by_range_info.max - group_by_range_info.min + 1) / range_bucket;
735  const auto total_bitmap_bytes_for_groups =
736  total_bytes_per_entry * maximum_num_groups;
737  // we can estimate a potential OOM of bitmap-based count-distinct operator
738  // by using the logic "check_total_bitmap_memory"
739  if (total_bitmap_bytes_for_groups >=
740  static_cast<size_t>(g_bitmap_memory_limit)) {
741  const auto agg_expr_max_entry_count =
742  arg_range_info.max - arg_range_info.min + 1;
743  int64_t max_agg_expr_table_cardinality{1};
744  std::set<const Analyzer::ColumnVar*,
745  bool (*)(const Analyzer::ColumnVar*, const Analyzer::ColumnVar*)>
746  colvar_set(Analyzer::ColumnVar::colvar_comp);
747  agg_expr->collect_column_var(colvar_set, true);
748  for (const auto cv : colvar_set) {
749  auto it =
750  std::find_if(query_infos.begin(),
751  query_infos.end(),
752  [&](const auto& input_table_info) {
753  return input_table_info.table_id == cv->get_table_id();
754  });
755  int64_t cur_table_cardinality =
756  it != query_infos.end()
757  ? static_cast<int64_t>(it->info.getNumTuplesUpperBound())
758  : -1;
759  max_agg_expr_table_cardinality =
760  std::max(max_agg_expr_table_cardinality, cur_table_cardinality);
761  worst_case_num_groups *= cur_table_cardinality;
762  }
763  auto has_valid_stat = [agg_expr_max_entry_count, maximum_num_groups]() {
764  return agg_expr_max_entry_count > 0 && maximum_num_groups > 0;
765  };
766  // if we have valid stats regarding input expr, we can try to relax the OOM
767  if (has_valid_stat()) {
768  // a threshold related to a ratio of a range of agg expr (let's say R)
769  // and table cardinality (C), i.e., use unordered_set if the # bits to build
770  // a bitmap based on R is four times larger than that of C
771  const size_t unordered_set_threshold{2};
772  // When we detect OOM of bitmap-based approach we selectively switch it to
773  // hash set-based processing logic if one of the followings is satisfied:
774  // 1) the column range is too wide compared with the table cardinality, or
775  // 2) the column range is too wide compared with the avg of # unique values
776  // per group by entry
777  const auto bits_for_agg_entry = std::ceil(log(agg_expr_max_entry_count));
778  const auto bits_for_agg_table =
779  std::ceil(log(max_agg_expr_table_cardinality));
780  const auto avg_num_unique_entries_per_group =
781  std::ceil(max_agg_expr_table_cardinality / maximum_num_groups);
782  // case a) given a range of entry count of agg_expr and the maximum
783  // cardinality among source tables of the agg_expr , we try to detect the
784  // misleading case of too sparse column range , i.e., agg_expr has 1M column
785  // range but only has two tuples {1 and 1M} / case b) check whether
786  // using bitmap is really beneficial when considering uniform distribution
787  // of (unique) keys.
788  if ((bits_for_agg_entry - bits_for_agg_table) >= unordered_set_threshold ||
789  agg_expr_max_entry_count >= avg_num_unique_entries_per_group) {
790  count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
791  } else {
792  throw std::runtime_error(
793  "Consider using approx_count_distinct operator instead of "
794  "count_distinct operator to lower the memory "
795  "requirements");
796  }
797  }
798  }
799  }
800  }
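// Editor's illustration (not part of the original file): a worked example of the
// relaxation heuristic above, assuming agg_expr_max_entry_count = 1,000,000,
// max_agg_expr_table_cardinality = 1,000 and maximum_num_groups = 100. With the
// natural log used by std::log, bits_for_agg_entry = ceil(13.8) = 14 and
// bits_for_agg_table = ceil(6.9) = 7, so the difference (7) meets the threshold of
// 2 and the descriptor falls back to CountDistinctImplType::UnorderedSet instead of
// throwing the "consider approx_count_distinct" error.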
801  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
802  count_distinct_impl_type == CountDistinctImplType::UnorderedSet &&
803  !(arg_ti.is_array() || arg_ti.is_geometry())) {
804  count_distinct_impl_type = CountDistinctImplType::Bitmap;
805  }
806  const size_t too_many_entries{100000000};
807  if (g_enable_watchdog && !(arg_range_info.isEmpty()) &&
808  worst_case_num_groups > too_many_entries &&
809  count_distinct_impl_type == CountDistinctImplType::UnorderedSet) {
810  throw WatchdogException(
811  "Detect too many input entries for set-based count distinct operator under "
812  "the watchdog");
813  }
814  count_distinct_descriptors.emplace_back(
815  CountDistinctDescriptor{count_distinct_impl_type,
816  arg_range_info.min,
817  bitmap_sz_bits,
818  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
819  device_type,
820  sub_bitmap_count});
821  } else {
822  count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
823  CountDistinctImplType::Invalid, 0, 0, false, device_type, 0});
824  }
825  }
826  return count_distinct_descriptors;
827 }
828 
829 } // namespace
830 
831 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptor(
832  const bool allow_multifrag,
833  const size_t max_groups_buffer_entry_count,
834  const int8_t crt_min_byte_width,
835  RenderInfo* render_info,
836  const bool output_columnar_hint) {
837  const auto shard_count =
840  : 0;
841  bool sort_on_gpu_hint =
842  device_type_ == ExecutorDeviceType::GPU && allow_multifrag &&
843  !ra_exe_unit_.sort_info.order_entries.empty() &&
844  gpuCanHandleOrderEntries(ra_exe_unit_.sort_info.order_entries);
845  // must_use_baseline_sort is true iff we'd sort on GPU with the old algorithm
846  // but the total output buffer size would be too big or it's a sharded top query.
847  // For the sake of managing risk, use the new result set way very selectively for
848  // this case only (alongside the baseline layout we've enabled for a while now).
849  bool must_use_baseline_sort = shard_count;
850  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
851  while (true) {
852  query_mem_desc = initQueryMemoryDescriptorImpl(allow_multifrag,
853  max_groups_buffer_entry_count,
854  crt_min_byte_width,
855  sort_on_gpu_hint,
856  render_info,
857  must_use_baseline_sort,
858  output_columnar_hint);
859  CHECK(query_mem_desc);
860  if (query_mem_desc->sortOnGpu() &&
861  (query_mem_desc->getBufferSizeBytes(device_type_) +
862  align_to_int64(query_mem_desc->getEntryCount() * sizeof(int32_t))) >
863  2 * 1024 * 1024 * 1024LL) {
864  must_use_baseline_sort = true;
865  sort_on_gpu_hint = false;
866  } else {
867  break;
868  }
869  }
870  return query_mem_desc;
871 }
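// Editor's illustration (not part of the original file): assuming a hypothetical
// descriptor whose GPU buffer is 2,100,000,000 bytes with 50 million entries, the
// retry loop above adds align_to_int64(50,000,000 * sizeof(int32_t)) = 200,000,000
// bytes of sort scratch space, exceeds the 2 * 1024 * 1024 * 1024 byte limit, and
// repeats with must_use_baseline_sort = true and sort_on_gpu_hint = false before
// returning the descriptor.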
872 
873 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptorImpl(
874  const bool allow_multifrag,
875  const size_t max_groups_buffer_entry_count,
876  const int8_t crt_min_byte_width,
877  const bool sort_on_gpu_hint,
878  RenderInfo* render_info,
879  const bool must_use_baseline_sort,
880  const bool output_columnar_hint) {
881  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
882 
883  auto col_range_info_nosharding = getColRangeInfo();
884 
885  const auto shard_count =
888  : 0;
889 
890  const auto col_range_info =
891  ColRangeInfo{col_range_info_nosharding.hash_type_,
892  col_range_info_nosharding.min,
893  col_range_info_nosharding.max,
894  getShardedTopBucket(col_range_info_nosharding, shard_count),
895  col_range_info_nosharding.has_nulls};
896 
897  // Non-grouped aggregates do not support accessing aggregated ranges
898  // Keyless hash is currently only supported with single-column perfect hash
899  const auto keyless_info =
900  !(is_group_by &&
901  col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash)
902  ? KeylessInfo{false, -1}
903  : get_keyless_info(ra_exe_unit_, query_infos_, is_group_by, executor_);
904 
905  if (g_enable_watchdog &&
906  ((col_range_info.hash_type_ == QueryDescriptionType::GroupByBaselineHash &&
907  max_groups_buffer_entry_count > g_watchdog_baseline_max_groups) ||
908  (col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
909  ra_exe_unit_.groupby_exprs.size() == 1 &&
910  (col_range_info.max - col_range_info.min) /
911  std::max(col_range_info.bucket, int64_t(1)) >
912  130000000))) {
913  throw WatchdogException("Query would use too much memory");
914  }
915 
916  const auto count_distinct_descriptors = init_count_distinct_descriptors(
917  ra_exe_unit_, query_infos_, col_range_info, device_type_, executor_);
918  try {
919  return QueryMemoryDescriptor::init(executor_,
920  ra_exe_unit_,
921  query_infos_,
922  col_range_info,
923  keyless_info,
924  allow_multifrag,
925  device_type_,
926  crt_min_byte_width,
927  sort_on_gpu_hint,
928  shard_count,
929  max_groups_buffer_entry_count,
930  render_info,
931  count_distinct_descriptors,
932  must_use_baseline_sort,
933  output_columnar_hint,
934  /*streaming_top_n_hint=*/true);
935  } catch (const StreamingTopNOOM& e) {
936  LOG(WARNING) << e.what() << " Disabling Streaming Top N.";
937  return QueryMemoryDescriptor::init(executor_,
938  ra_exe_unit_,
939  query_infos_,
940  col_range_info,
941  keyless_info,
942  allow_multifrag,
943  device_type_,
944  crt_min_byte_width,
945  sort_on_gpu_hint,
946  shard_count,
947  max_groups_buffer_entry_count,
948  render_info,
949  count_distinct_descriptors,
950  must_use_baseline_sort,
951  output_columnar_hint,
952  /*streaming_top_n_hint=*/false);
953  }
954 }
955 
956 bool GroupByAndAggregate::gpuCanHandleOrderEntries(
957  const std::list<Analyzer::OrderEntry>& order_entries) {
958  if (order_entries.size() > 1) { // TODO(alex): lift this restriction
959  return false;
960  }
961  for (const auto& order_entry : order_entries) {
962  CHECK_GE(order_entry.tle_no, 1);
963  CHECK_LE(static_cast<size_t>(order_entry.tle_no), ra_exe_unit_.target_exprs.size());
964  const auto target_expr = ra_exe_unit_.target_exprs[order_entry.tle_no - 1];
965  if (!dynamic_cast<Analyzer::AggExpr*>(target_expr)) {
966  return false;
967  }
968  // TODO(alex): relax the restrictions
969  auto agg_expr = static_cast<Analyzer::AggExpr*>(target_expr);
970  if (agg_expr->get_is_distinct() || agg_expr->get_aggtype() == kAVG ||
971  agg_expr->get_aggtype() == kMIN || agg_expr->get_aggtype() == kMAX ||
972  agg_expr->get_aggtype() == kAPPROX_COUNT_DISTINCT) {
973  return false;
974  }
975  if (agg_expr->get_arg()) {
976  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
977  if (arg_ti.is_fp()) {
978  return false;
979  }
980  auto expr_range_info =
981  get_expr_range_info(ra_exe_unit_, query_infos_, agg_expr->get_arg(), executor_);
982  // TODO(adb): QMD not actually initialized here?
983  if ((!(expr_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
984  /* query_mem_desc.getGroupbyColCount() == 1 */ false) ||
985  expr_range_info.has_nulls) &&
986  order_entry.is_desc == order_entry.nulls_first) {
987  return false;
988  }
989  }
990  const auto& target_ti = target_expr->get_type_info();
991  CHECK(!target_ti.is_buffer());
992  if (!target_ti.is_integer()) {
993  return false;
994  }
995  }
996  return true;
997 }
998 
999 bool GroupByAndAggregate::codegen(llvm::Value* filter_result,
1000  llvm::BasicBlock* sc_false,
1001  QueryMemoryDescriptor& query_mem_desc,
1002  const CompilationOptions& co,
1003  const GpuSharedMemoryContext& gpu_smem_context) {
1004  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1005  CHECK(filter_result);
1006 
1007  bool can_return_error = false;
1008  llvm::BasicBlock* filter_false{nullptr};
1009 
1010  {
1011  const bool is_group_by = !ra_exe_unit_.groupby_exprs.empty();
1012 
1013  if (executor_->isArchMaxwell(co.device_type)) {
1014  prependForceSync();
1015  }
1016  DiamondCodegen filter_cfg(filter_result,
1017  executor_,
1018  !is_group_by || query_mem_desc.usesGetGroupValueFast(),
1019  "filter", // filter_true and filter_false basic blocks
1020  nullptr,
1021  false);
1022  filter_false = filter_cfg.cond_false_;
1023 
1024  if (is_group_by) {
1025  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection &&
1026  !query_mem_desc.useStreamingTopN()) {
1027  const auto crt_matched = get_arg_by_name(ROW_FUNC, "crt_matched");
1028  LL_BUILDER.CreateStore(LL_INT(int32_t(1)), crt_matched);
1029  auto total_matched_ptr = get_arg_by_name(ROW_FUNC, "total_matched");
1030  llvm::Value* old_total_matched_val{nullptr};
1031  if (query_mem_desc.threadsShareMemory()) {
1032  old_total_matched_val =
1033  LL_BUILDER.CreateAtomicRMW(llvm::AtomicRMWInst::Add,
1034  total_matched_ptr,
1035  LL_INT(int32_t(1)),
1036 #if LLVM_VERSION_MAJOR > 12
1037  LLVM_ALIGN(8),
1038 #endif
1039  llvm::AtomicOrdering::Monotonic);
1040  } else {
1041  old_total_matched_val = LL_BUILDER.CreateLoad(
1042  total_matched_ptr->getType()->getPointerElementType(), total_matched_ptr);
1043  LL_BUILDER.CreateStore(
1044  LL_BUILDER.CreateAdd(old_total_matched_val, LL_INT(int32_t(1))),
1045  total_matched_ptr);
1046  }
1047  auto old_total_matched_ptr = get_arg_by_name(ROW_FUNC, "old_total_matched");
1048  LL_BUILDER.CreateStore(old_total_matched_val, old_total_matched_ptr);
1049  }
1050 
1051  auto agg_out_ptr_w_idx = codegenGroupBy(query_mem_desc, co, filter_cfg);
1052  auto varlen_output_buffer = codegenVarlenOutputBuffer(query_mem_desc);
1053  if (query_mem_desc.usesGetGroupValueFast() ||
1054  query_mem_desc.getQueryDescriptionType() ==
1055  QueryDescriptionType::GroupByPerfectHash) {
1056  if (query_mem_desc.getGroupbyColCount() > 1) {
1057  filter_cfg.setChainToNext();
1058  }
1059  // Don't generate null checks if the group slot is guaranteed to be non-null,
1060  // as it's the case for get_group_value_fast* family.
1061  can_return_error = codegenAggCalls(agg_out_ptr_w_idx,
1062  varlen_output_buffer,
1063  {},
1064  query_mem_desc,
1065  co,
1066  gpu_smem_context,
1067  filter_cfg);
1068  } else {
1069  {
1070  llvm::Value* nullcheck_cond{nullptr};
1071  if (query_mem_desc.didOutputColumnar()) {
1072  nullcheck_cond = LL_BUILDER.CreateICmpSGE(std::get<1>(agg_out_ptr_w_idx),
1073  LL_INT(int32_t(0)));
1074  } else {
1075  nullcheck_cond = LL_BUILDER.CreateICmpNE(
1076  std::get<0>(agg_out_ptr_w_idx),
1077  llvm::ConstantPointerNull::get(
1078  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)));
1079  }
1080  DiamondCodegen nullcheck_cfg(
1081  nullcheck_cond, executor_, false, "groupby_nullcheck", &filter_cfg, false);
1082  codegenAggCalls(agg_out_ptr_w_idx,
1083  varlen_output_buffer,
1084  {},
1085  query_mem_desc,
1086  co,
1087  gpu_smem_context,
1088  filter_cfg);
1089  }
1090  can_return_error = true;
1091  if (query_mem_desc.getQueryDescriptionType() ==
1092  QueryDescriptionType::Projection &&
1093  query_mem_desc.useStreamingTopN()) {
1094  // Ignore rejection on pushing current row to top-K heap.
1095  LL_BUILDER.CreateRet(LL_INT(int32_t(0)));
1096  } else {
1097  CodeGenerator code_generator(executor_);
1098  LL_BUILDER.CreateRet(LL_BUILDER.CreateNeg(LL_BUILDER.CreateTrunc(
1099  // TODO(alex): remove the trunc once pos is converted to 32 bits
1100  code_generator.posArg(nullptr),
1101  get_int_type(32, LL_CONTEXT))));
1102  }
1103  }
1104  } else {
1105  if (ra_exe_unit_.estimator) {
1106  std::stack<llvm::BasicBlock*> array_loops;
1107  codegenEstimator(array_loops, filter_cfg, query_mem_desc, co);
1108  } else {
1109  auto arg_it = ROW_FUNC->arg_begin();
1110  std::vector<llvm::Value*> agg_out_vec;
1111  for (int32_t i = 0; i < get_agg_count(ra_exe_unit_.target_exprs); ++i) {
1112  agg_out_vec.push_back(&*arg_it++);
1113  }
1114  can_return_error = codegenAggCalls(std::make_tuple(nullptr, nullptr),
1115  /*varlen_output_buffer=*/nullptr,
1116  agg_out_vec,
1117  query_mem_desc,
1118  co,
1119  gpu_smem_context,
1120  filter_cfg);
1121  }
1122  }
1123  }
1124 
1125  if (ra_exe_unit_.join_quals.empty()) {
1126  executor_->cgen_state_->ir_builder_.CreateRet(LL_INT(int32_t(0)));
1127  } else if (sc_false) {
1128  const auto saved_insert_block = LL_BUILDER.GetInsertBlock();
1129  LL_BUILDER.SetInsertPoint(sc_false);
1130  LL_BUILDER.CreateBr(filter_false);
1131  LL_BUILDER.SetInsertPoint(saved_insert_block);
1132  }
1133 
1134  return can_return_error;
1135 }
1136 
1137 llvm::Value* GroupByAndAggregate::codegenOutputSlot(
1138  llvm::Value* groups_buffer,
1139  const QueryMemoryDescriptor& query_mem_desc,
1140  const CompilationOptions& co,
1141  DiamondCodegen& diamond_codegen) {
1142  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1144  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1145  const auto group_expr = ra_exe_unit_.groupby_exprs.front();
1146  CHECK(!group_expr);
1147  if (!query_mem_desc.didOutputColumnar()) {
1148  CHECK_EQ(size_t(0), query_mem_desc.getRowSize() % sizeof(int64_t));
1149  }
1150  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1151  ? 0
1152  : query_mem_desc.getRowSize() / sizeof(int64_t);
1153  CodeGenerator code_generator(executor_);
1154  if (query_mem_desc.useStreamingTopN()) {
1155  const auto& only_order_entry = ra_exe_unit_.sort_info.order_entries.front();
1156  CHECK_GE(only_order_entry.tle_no, int(1));
1157  const size_t target_idx = only_order_entry.tle_no - 1;
1158  CHECK_LT(target_idx, ra_exe_unit_.target_exprs.size());
1159  const auto order_entry_expr = ra_exe_unit_.target_exprs[target_idx];
1160  const auto chosen_bytes =
1161  static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(target_idx));
1162  auto order_entry_lv = executor_->cgen_state_->castToTypeIn(
1163  code_generator.codegen(order_entry_expr, true, co).front(), chosen_bytes * 8);
1164  const auto n = ra_exe_unit_.sort_info.offset + ra_exe_unit_.sort_info.limit;
1165  std::string fname = "get_bin_from_k_heap";
1166  const auto& oe_ti = order_entry_expr->get_type_info();
1167  llvm::Value* null_key_lv = nullptr;
1168  if (oe_ti.is_integer() || oe_ti.is_decimal() || oe_ti.is_time()) {
1169  const size_t bit_width = order_entry_lv->getType()->getIntegerBitWidth();
1170  switch (bit_width) {
1171  case 32:
1172  null_key_lv = LL_INT(static_cast<int32_t>(inline_int_null_val(oe_ti)));
1173  break;
1174  case 64:
1175  null_key_lv = LL_INT(static_cast<int64_t>(inline_int_null_val(oe_ti)));
1176  break;
1177  default:
1178  CHECK(false);
1179  }
1180  fname += "_int" + std::to_string(bit_width) + "_t";
1181  } else {
1182  CHECK(oe_ti.is_fp());
1183  if (order_entry_lv->getType()->isDoubleTy()) {
1184  null_key_lv = LL_FP(static_cast<double>(inline_fp_null_val(oe_ti)));
1185  } else {
1186  null_key_lv = LL_FP(static_cast<float>(inline_fp_null_val(oe_ti)));
1187  }
1188  fname += order_entry_lv->getType()->isDoubleTy() ? "_double" : "_float";
1189  }
1190  const auto key_slot_idx =
1191  get_heap_key_slot_index(ra_exe_unit_.target_exprs, target_idx);
1192  return emitCall(
1193  fname,
1194  {groups_buffer,
1195  LL_INT(n),
1196  LL_INT(row_size_quad),
1197  LL_INT(static_cast<uint32_t>(query_mem_desc.getColOffInBytes(key_slot_idx))),
1198  LL_BOOL(only_order_entry.is_desc),
1199  LL_BOOL(!order_entry_expr->get_type_info().get_notnull()),
1200  LL_BOOL(only_order_entry.nulls_first),
1201  null_key_lv,
1202  order_entry_lv});
1203  } else {
1204  auto* arg = get_arg_by_name(ROW_FUNC, "max_matched");
1205  const auto output_buffer_entry_count_lv =
1206  LL_BUILDER.CreateLoad(arg->getType()->getPointerElementType(), arg);
1207  arg = get_arg_by_name(ROW_FUNC, "old_total_matched");
1208  const auto group_expr_lv =
1209  LL_BUILDER.CreateLoad(arg->getType()->getPointerElementType(), arg);
1210  std::vector<llvm::Value*> args{groups_buffer,
1211  output_buffer_entry_count_lv,
1212  group_expr_lv,
1213  code_generator.posArg(nullptr)};
1214  if (query_mem_desc.didOutputColumnar()) {
1215  const auto columnar_output_offset =
1216  emitCall("get_columnar_scan_output_offset", args);
1217  return columnar_output_offset;
1218  }
1219  args.push_back(LL_INT(row_size_quad));
1220  return emitCall("get_scan_output_slot", args);
1221  }
1222 }
1223 
1224 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenGroupBy(
1225  const QueryMemoryDescriptor& query_mem_desc,
1226  const CompilationOptions& co,
1227  DiamondCodegen& diamond_codegen) {
1228  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1229  auto arg_it = ROW_FUNC->arg_begin();
1230  auto groups_buffer = arg_it++;
1231 
1232  std::stack<llvm::BasicBlock*> array_loops;
1233 
1234  // TODO(Saman): move this logic outside of this function.
1235  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1236  if (query_mem_desc.didOutputColumnar()) {
1237  return std::make_tuple(
1238  &*groups_buffer,
1239  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen));
1240  } else {
1241  return std::make_tuple(
1242  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen),
1243  nullptr);
1244  }
1245  }
1246 
1247  CHECK(query_mem_desc.getQueryDescriptionType() ==
1248  QueryDescriptionType::GroupByBaselineHash ||
1249  query_mem_desc.getQueryDescriptionType() ==
1250  QueryDescriptionType::GroupByPerfectHash);
1251 
1252  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1253  ? 0
1254  : query_mem_desc.getRowSize() / sizeof(int64_t);
1255 
1256  const auto col_width_size = query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1257  ? sizeof(int64_t)
1258  : query_mem_desc.getEffectiveKeyWidth();
1259  // for multi-column group by
1260  llvm::Value* group_key = nullptr;
1261  llvm::Value* key_size_lv = nullptr;
1262 
1263  if (!query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1264  key_size_lv = LL_INT(static_cast<int32_t>(query_mem_desc.getGroupbyColCount()));
1265  if (query_mem_desc.getQueryDescriptionType() ==
1266  QueryDescriptionType::GroupByPerfectHash) {
1267  group_key =
1268  LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1269  } else if (query_mem_desc.getQueryDescriptionType() ==
1270  QueryDescriptionType::GroupByBaselineHash) {
1271  group_key =
1272  col_width_size == sizeof(int32_t)
1273  ? LL_BUILDER.CreateAlloca(llvm::Type::getInt32Ty(LL_CONTEXT), key_size_lv)
1274  : LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1275  }
1276  CHECK(group_key);
1277  CHECK(key_size_lv);
1278  }
1279 
1280  int32_t subkey_idx = 0;
1281  CHECK(query_mem_desc.getGroupbyColCount() == ra_exe_unit_.groupby_exprs.size());
1282  for (const auto& group_expr : ra_exe_unit_.groupby_exprs) {
1283  const auto col_range_info =
1284  get_expr_range_info(ra_exe_unit_, query_infos_, group_expr.get(), executor_);
1285  const auto translated_null_value = static_cast<int64_t>(
1286  query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1287  ? checked_int64_t(query_mem_desc.getMaxVal()) +
1288  (query_mem_desc.getBucket() ? query_mem_desc.getBucket() : 1)
1289  : checked_int64_t(col_range_info.max) +
1290  (col_range_info.bucket ? col_range_info.bucket : 1));
1291 
1292  const bool col_has_nulls =
1293  query_mem_desc.getQueryDescriptionType() ==
1294  QueryDescriptionType::GroupByPerfectHash
1295  ? (query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1296  ? query_mem_desc.hasNulls()
1297  : col_range_info.has_nulls)
1298  : false;
1299 
1300  const auto group_expr_lvs =
1301  executor_->groupByColumnCodegen(group_expr.get(),
1302  col_width_size,
1303  co,
1304  col_has_nulls,
1305  translated_null_value,
1306  diamond_codegen,
1307  array_loops,
1308  query_mem_desc.threadsShareMemory());
1309  const auto group_expr_lv = group_expr_lvs.translated_value;
1310  if (query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1311  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1312  return codegenSingleColumnPerfectHash(query_mem_desc,
1313  co,
1314  &*groups_buffer,
1315  group_expr_lv,
1316  group_expr_lvs.original_value,
1317  row_size_quad);
1318  } else {
1319  // store the sub-key to the buffer
1320  LL_BUILDER.CreateStore(
1321  group_expr_lv,
1322  LL_BUILDER.CreateGEP(
1323  group_key->getType()->getScalarType()->getPointerElementType(),
1324  group_key,
1325  LL_INT(subkey_idx++)));
1326  }
1327  }
1328  if (query_mem_desc.getQueryDescriptionType() ==
1329  QueryDescriptionType::GroupByPerfectHash) {
1330  CHECK(ra_exe_unit_.groupby_exprs.size() != 1);
1331  return codegenMultiColumnPerfectHash(
1332  &*groups_buffer, group_key, key_size_lv, query_mem_desc, row_size_quad);
1333  } else if (query_mem_desc.getQueryDescriptionType() ==
1334  QueryDescriptionType::GroupByBaselineHash) {
1335  return codegenMultiColumnBaselineHash(co,
1336  &*groups_buffer,
1337  group_key,
1338  key_size_lv,
1339  query_mem_desc,
1340  col_width_size,
1341  row_size_quad);
1342  }
1343  CHECK(false);
1344  return std::make_tuple(nullptr, nullptr);
1345 }
1346 
1347 llvm::Value* GroupByAndAggregate::codegenVarlenOutputBuffer(
1348  const QueryMemoryDescriptor& query_mem_desc) {
1349  if (!query_mem_desc.hasVarlenOutput()) {
1350  return nullptr;
1351  }
1352 
1353  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1354  auto arg_it = ROW_FUNC->arg_begin();
1355  arg_it++; /* groups_buffer */
1356  auto varlen_output_buffer = arg_it++;
1357  CHECK(varlen_output_buffer->getType() == llvm::Type::getInt64PtrTy(LL_CONTEXT));
1358  return varlen_output_buffer;
1359 }
1360 
1361 std::tuple<llvm::Value*, llvm::Value*>
1362 GroupByAndAggregate::codegenSingleColumnPerfectHash(
1363  const QueryMemoryDescriptor& query_mem_desc,
1364  const CompilationOptions& co,
1365  llvm::Value* groups_buffer,
1366  llvm::Value* group_expr_lv_translated,
1367  llvm::Value* group_expr_lv_original,
1368  const int32_t row_size_quad) {
1369  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1370  CHECK(query_mem_desc.usesGetGroupValueFast());
1371  std::string get_group_fn_name{query_mem_desc.didOutputColumnar()
1372  ? "get_columnar_group_bin_offset"
1373  : "get_group_value_fast"};
1374  if (!query_mem_desc.didOutputColumnar() && query_mem_desc.hasKeylessHash()) {
1375  get_group_fn_name += "_keyless";
1376  }
1377  if (query_mem_desc.interleavedBins(co.device_type)) {
1378  CHECK(!query_mem_desc.didOutputColumnar());
1379  CHECK(query_mem_desc.hasKeylessHash());
1380  get_group_fn_name += "_semiprivate";
1381  }
1382  std::vector<llvm::Value*> get_group_fn_args{&*groups_buffer,
1383  &*group_expr_lv_translated};
1384  if (group_expr_lv_original && get_group_fn_name == "get_group_value_fast" &&
1385  query_mem_desc.mustUseBaselineSort()) {
1386  get_group_fn_name += "_with_original_key";
1387  get_group_fn_args.push_back(group_expr_lv_original);
1388  }
1389  get_group_fn_args.push_back(LL_INT(query_mem_desc.getMinVal()));
1390  get_group_fn_args.push_back(LL_INT(query_mem_desc.getBucket()));
1391  if (!query_mem_desc.hasKeylessHash()) {
1392  if (!query_mem_desc.didOutputColumnar()) {
1393  get_group_fn_args.push_back(LL_INT(row_size_quad));
1394  }
1395  } else {
1396  if (!query_mem_desc.didOutputColumnar()) {
1397  get_group_fn_args.push_back(LL_INT(row_size_quad));
1398  }
1399  if (query_mem_desc.interleavedBins(co.device_type)) {
1400  auto warp_idx = emitCall("thread_warp_idx", {LL_INT(executor_->warpSize())});
1401  get_group_fn_args.push_back(warp_idx);
1402  get_group_fn_args.push_back(LL_INT(executor_->warpSize()));
1403  }
1404  }
1405  if (get_group_fn_name == "get_columnar_group_bin_offset") {
1406  return std::make_tuple(&*groups_buffer,
1407  emitCall(get_group_fn_name, get_group_fn_args));
1408  }
1409  return std::make_tuple(emitCall(get_group_fn_name, get_group_fn_args), nullptr);
1410 }
1411 
1412 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenMultiColumnPerfectHash(
1413  llvm::Value* groups_buffer,
1414  llvm::Value* group_key,
1415  llvm::Value* key_size_lv,
1416  const QueryMemoryDescriptor& query_mem_desc,
1417  const int32_t row_size_quad) {
1418  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1419  CHECK(query_mem_desc.getQueryDescriptionType() ==
1420  QueryDescriptionType::GroupByPerfectHash);
1421  // compute the index (perfect hash)
1422  auto perfect_hash_func = codegenPerfectHashFunction();
1423  auto hash_lv =
1424  LL_BUILDER.CreateCall(perfect_hash_func, std::vector<llvm::Value*>{group_key});
1425 
1426  if (query_mem_desc.didOutputColumnar()) {
1427  if (!query_mem_desc.hasKeylessHash()) {
1428  const std::string set_matching_func_name{
1429  "set_matching_group_value_perfect_hash_columnar"};
1430  const std::vector<llvm::Value*> set_matching_func_arg{
1431  groups_buffer,
1432  hash_lv,
1433  group_key,
1434  key_size_lv,
1435  llvm::ConstantInt::get(get_int_type(32, LL_CONTEXT),
1436  query_mem_desc.getEntryCount())};
1437  emitCall(set_matching_func_name, set_matching_func_arg);
1438  }
1439  return std::make_tuple(groups_buffer, hash_lv);
1440  } else {
1441  if (query_mem_desc.hasKeylessHash()) {
1442  return std::make_tuple(emitCall("get_matching_group_value_perfect_hash_keyless",
1443  {groups_buffer, hash_lv, LL_INT(row_size_quad)}),
1444  nullptr);
1445  } else {
1446  return std::make_tuple(
1447  emitCall(
1448  "get_matching_group_value_perfect_hash",
1449  {groups_buffer, hash_lv, group_key, key_size_lv, LL_INT(row_size_quad)}),
1450  nullptr);
1451  }
1452  }
1453 }
1454 
1455 std::tuple<llvm::Value*, llvm::Value*>
1456 GroupByAndAggregate::codegenMultiColumnBaselineHash(
1457  const CompilationOptions& co,
1458  llvm::Value* groups_buffer,
1459  llvm::Value* group_key,
1460  llvm::Value* key_size_lv,
1461  const QueryMemoryDescriptor& query_mem_desc,
1462  const size_t key_width,
1463  const int32_t row_size_quad) {
1464  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1465  if (group_key->getType() != llvm::Type::getInt64PtrTy(LL_CONTEXT)) {
1466  CHECK(key_width == sizeof(int32_t));
1467  group_key =
1468  LL_BUILDER.CreatePointerCast(group_key, llvm::Type::getInt64PtrTy(LL_CONTEXT));
1469  }
1470  std::vector<llvm::Value*> func_args{
1471  groups_buffer,
1472  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1473  &*group_key,
1474  &*key_size_lv,
1475  LL_INT(static_cast<int32_t>(key_width))};
1476  std::string func_name{"get_group_value"};
1477  if (query_mem_desc.didOutputColumnar()) {
1478  func_name += "_columnar_slot";
1479  } else {
1480  func_args.push_back(LL_INT(row_size_quad));
1481  }
1482  if (co.with_dynamic_watchdog) {
1483  func_name += "_with_watchdog";
1484  }
1485  if (query_mem_desc.didOutputColumnar()) {
1486  return std::make_tuple(groups_buffer, emitCall(func_name, func_args));
1487  } else {
1488  return std::make_tuple(emitCall(func_name, func_args), nullptr);
1489  }
1490 }
1491 
1492 llvm::Function* GroupByAndAggregate::codegenPerfectHashFunction() {
1493  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1494  CHECK_GT(ra_exe_unit_.groupby_exprs.size(), size_t(1));
1495  auto ft = llvm::FunctionType::get(
1496  get_int_type(32, LL_CONTEXT),
1497  std::vector<llvm::Type*>{llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)},
1498  false);
1499  auto key_hash_func = llvm::Function::Create(ft,
1500  llvm::Function::ExternalLinkage,
1501  "perfect_key_hash",
1502  executor_->cgen_state_->module_);
1503  executor_->cgen_state_->helper_functions_.push_back(key_hash_func);
1504  mark_function_always_inline(key_hash_func);
1505  auto& key_buff_arg = *key_hash_func->args().begin();
1506  llvm::Value* key_buff_lv = &key_buff_arg;
1507  auto bb = llvm::BasicBlock::Create(LL_CONTEXT, "entry", key_hash_func);
1508  llvm::IRBuilder<> key_hash_func_builder(bb);
1509  llvm::Value* hash_lv{llvm::ConstantInt::get(get_int_type(64, LL_CONTEXT), 0)};
1510  std::vector<int64_t> cardinalities;
1511  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1512  auto col_range_info =
1513  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1514  CHECK(col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash);
1515  cardinalities.push_back(getBucketedCardinality(col_range_info));
1516  }
1517  size_t dim_idx = 0;
1518  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1519  auto* gep = key_hash_func_builder.CreateGEP(
1520  key_buff_lv->getType()->getScalarType()->getPointerElementType(),
1521  key_buff_lv,
1522  LL_INT(dim_idx));
1523  auto key_comp_lv =
1524  key_hash_func_builder.CreateLoad(gep->getType()->getPointerElementType(), gep);
1525  auto col_range_info =
1526  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1527  auto crt_term_lv =
1528  key_hash_func_builder.CreateSub(key_comp_lv, LL_INT(col_range_info.min));
1529  if (col_range_info.bucket) {
1530  crt_term_lv =
1531  key_hash_func_builder.CreateSDiv(crt_term_lv, LL_INT(col_range_info.bucket));
1532  }
1533  for (size_t prev_dim_idx = 0; prev_dim_idx < dim_idx; ++prev_dim_idx) {
1534  crt_term_lv = key_hash_func_builder.CreateMul(crt_term_lv,
1535  LL_INT(cardinalities[prev_dim_idx]));
1536  }
1537  hash_lv = key_hash_func_builder.CreateAdd(hash_lv, crt_term_lv);
1538  ++dim_idx;
1539  }
1540  key_hash_func_builder.CreateRet(
1541  key_hash_func_builder.CreateTrunc(hash_lv, get_int_type(32, LL_CONTEXT)));
1542  return key_hash_func;
1543 }
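// Editor's illustration (not part of the original file): the generated function
// computes a mixed-radix index,
//   hash = sum_i ((key_i - min_i) / bucket_i) * prod_{j<i} cardinality_j.
// For two hypothetical group-by columns, the first with range [0, 9] (cardinality
// 10, bucket 1) and the second with range [100, 104], the key (3, 102) hashes to
// (3 - 0) + (102 - 100) * 10 = 23.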
1544 
1545 llvm::Value* GroupByAndAggregate::convertNullIfAny(const SQLTypeInfo& arg_type,
1546  const TargetInfo& agg_info,
1547  llvm::Value* target) {
1548  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1549  const auto& agg_type = agg_info.sql_type;
1550  const size_t chosen_bytes = agg_type.get_size();
1551 
1552  bool need_conversion{false};
1553  llvm::Value* arg_null{nullptr};
1554  llvm::Value* agg_null{nullptr};
1555  llvm::Value* target_to_cast{target};
1556  if (arg_type.is_fp()) {
1557  arg_null = executor_->cgen_state_->inlineFpNull(arg_type);
1558  if (agg_type.is_fp()) {
1559  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1560  if (!static_cast<llvm::ConstantFP*>(arg_null)->isExactlyValue(
1561  static_cast<llvm::ConstantFP*>(agg_null)->getValueAPF())) {
1562  need_conversion = true;
1563  }
1564  } else {
1565  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
1566  return target;
1567  }
1568  } else {
1569  arg_null = executor_->cgen_state_->inlineIntNull(arg_type);
1570  if (agg_type.is_fp()) {
1571  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1572  need_conversion = true;
1573  target_to_cast = executor_->castToFP(target, arg_type, agg_type);
1574  } else {
1575  agg_null = executor_->cgen_state_->inlineIntNull(agg_type);
1576  if ((static_cast<llvm::ConstantInt*>(arg_null)->getBitWidth() !=
1577  static_cast<llvm::ConstantInt*>(agg_null)->getBitWidth()) ||
1578  (static_cast<llvm::ConstantInt*>(arg_null)->getValue() !=
1579  static_cast<llvm::ConstantInt*>(agg_null)->getValue())) {
1580  need_conversion = true;
1581  }
1582  }
1583  }
1584  if (need_conversion) {
1585  auto cmp = arg_type.is_fp() ? LL_BUILDER.CreateFCmpOEQ(target, arg_null)
1586  : LL_BUILDER.CreateICmpEQ(target, arg_null);
1587  return LL_BUILDER.CreateSelect(
1588  cmp,
1589  agg_null,
1590  executor_->cgen_state_->castToTypeIn(target_to_cast, chosen_bytes << 3));
1591  } else {
1592  return target;
1593  }
1594 }
1595 
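// --- Editor's sketch (illustrative, not part of GroupByAndAggregate.cpp) -----
// Scalar analogue of convertNullIfAny() above: when the argument type's null
// sentinel differs from the aggregate type's (different width or encoding), a
// null input is remapped to the aggregate-side sentinel and every other value
// is cast to the aggregate width; when the sentinels already agree, the value
// passes through untouched.
template <typename ArgT, typename AggT>
AggT convert_null_if_any_sketch(const ArgT target,
                                const ArgT arg_null,
                                const AggT agg_null) {
  return target == arg_null ? agg_null : static_cast<AggT>(target);
}
// ------------------------------------------------------------------------------
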
1596 llvm::Value* GroupByAndAggregate::codegenWindowRowPointer(
1597  const Analyzer::WindowFunction* window_func,
1598  const QueryMemoryDescriptor& query_mem_desc,
1599  const CompilationOptions& co,
1600  DiamondCodegen& diamond_codegen) {
1601  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1602  const auto window_func_context =
1603  WindowProjectNodeContext::getActiveWindowFunctionContext(executor_);
1604  if (window_func_context && window_function_is_aggregate(window_func->getKind())) {
1605  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1606  ? 0
1607  : query_mem_desc.getRowSize() / sizeof(int64_t);
1608  auto arg_it = ROW_FUNC->arg_begin();
1609  auto groups_buffer = arg_it++;
1610  CodeGenerator code_generator(executor_);
1611  auto window_pos_lv = code_generator.codegenWindowPosition(
1612  window_func_context, code_generator.posArg(nullptr));
1613  const auto pos_in_window =
1614  LL_BUILDER.CreateTrunc(window_pos_lv, get_int_type(32, LL_CONTEXT));
1615  llvm::Value* entry_count_lv =
1616  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount()));
1617  std::vector<llvm::Value*> args{
1618  &*groups_buffer, entry_count_lv, pos_in_window, code_generator.posArg(nullptr)};
1619  if (query_mem_desc.didOutputColumnar()) {
1620  const auto columnar_output_offset =
1621  emitCall("get_columnar_scan_output_offset", args);
1622  return LL_BUILDER.CreateSExt(columnar_output_offset, get_int_type(64, LL_CONTEXT));
1623  }
1624  args.push_back(LL_INT(row_size_quad));
1625  return emitCall("get_scan_output_slot", args);
1626  }
1627  auto arg_it = ROW_FUNC->arg_begin();
1628  auto groups_buffer = arg_it++;
1629  return codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen);
1630 }
1631 
1632 bool GroupByAndAggregate::codegenAggCalls(
1633  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx_in,
1634  llvm::Value* varlen_output_buffer,
1635  const std::vector<llvm::Value*>& agg_out_vec,
1636  QueryMemoryDescriptor& query_mem_desc,
1637  const CompilationOptions& co,
1638  const GpuSharedMemoryContext& gpu_smem_context,
1639  DiamondCodegen& diamond_codegen) {
1640  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1641  auto agg_out_ptr_w_idx = agg_out_ptr_w_idx_in;
1642  // TODO(alex): unify the two cases, the output for non-group by queries
1643  // should be a contiguous buffer
1644  const bool is_group_by = std::get<0>(agg_out_ptr_w_idx);
1645  bool can_return_error = false;
1646  if (is_group_by) {
1647  CHECK(agg_out_vec.empty());
1648  } else {
1649  CHECK(!agg_out_vec.empty());
1650  }
1651 
1652  // the output buffer is cast into a byte stream to be able to handle data elements
1653  // of different sizes (only used when actual column width sizes are used)
1654  llvm::Value* output_buffer_byte_stream{nullptr};
1655  llvm::Value* out_row_idx{nullptr};
1656  if (query_mem_desc.didOutputColumnar() && !g_cluster &&
1657  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1658  output_buffer_byte_stream = LL_BUILDER.CreateBitCast(
1659  std::get<0>(agg_out_ptr_w_idx),
1660  llvm::PointerType::get(llvm::Type::getInt8Ty(LL_CONTEXT), 0));
1661  output_buffer_byte_stream->setName("out_buff_b_stream");
1662  CHECK(std::get<1>(agg_out_ptr_w_idx));
1663  out_row_idx = LL_BUILDER.CreateZExt(std::get<1>(agg_out_ptr_w_idx),
1664  llvm::Type::getInt64Ty(LL_CONTEXT));
1665  out_row_idx->setName("out_row_idx");
1666  }
1667 
1668  TargetExprCodegenBuilder target_builder(ra_exe_unit_, is_group_by);
1669  for (size_t target_idx = 0; target_idx < ra_exe_unit_.target_exprs.size();
1670  ++target_idx) {
1671  auto target_expr = ra_exe_unit_.target_exprs[target_idx];
1672  CHECK(target_expr);
1673 
1674  target_builder(target_expr, executor_, query_mem_desc, co);
1675  }
1676 
1677  target_builder.codegen(this,
1678  executor_,
1679  query_mem_desc,
1680  co,
1681  gpu_smem_context,
1682  agg_out_ptr_w_idx,
1683  agg_out_vec,
1684  output_buffer_byte_stream,
1685  out_row_idx,
1686  varlen_output_buffer,
1687  diamond_codegen);
1688 
1689  for (auto target_expr : ra_exe_unit_.target_exprs) {
1690  CHECK(target_expr);
1691  executor_->plan_state_->isLazyFetchColumn(target_expr);
1692  }
1693 
1694  return can_return_error;
1695 }
1696 
1697 /**
1698  * @brief: returns the pointer to where the aggregation should be stored.
1699  */
1700 llvm::Value* GroupByAndAggregate::codegenAggColumnPtr(
1701  llvm::Value* output_buffer_byte_stream,
1702  llvm::Value* out_row_idx,
1703  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx,
1704  const QueryMemoryDescriptor& query_mem_desc,
1705  const size_t chosen_bytes,
1706  const size_t agg_out_off,
1707  const size_t target_idx) {
1708  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1709  llvm::Value* agg_col_ptr{nullptr};
1710  if (query_mem_desc.didOutputColumnar()) {
1711  // TODO(Saman): remove the second columnar branch, and support all query description
1712  // types through the first branch. Then, input arguments should also be cleaned up
1713  if (!g_cluster &&
1714  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1715  CHECK(chosen_bytes == 1 || chosen_bytes == 2 || chosen_bytes == 4 ||
1716  chosen_bytes == 8);
1717  CHECK(output_buffer_byte_stream);
1718  CHECK(out_row_idx);
1719  size_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1720  // multiplying by chosen_bytes, i.e., << log2(chosen_bytes)
1721  auto out_per_col_byte_idx =
1722 #ifdef _WIN32
1723  LL_BUILDER.CreateShl(out_row_idx, __lzcnt(chosen_bytes) - 1);
1724 #else
1725  LL_BUILDER.CreateShl(out_row_idx, __builtin_ffs(chosen_bytes) - 1);
1726 #endif
1727  auto byte_offset = LL_BUILDER.CreateAdd(out_per_col_byte_idx,
1728  LL_INT(static_cast<int64_t>(col_off)));
1729  byte_offset->setName("out_byte_off_target_" + std::to_string(target_idx));
1730  auto output_ptr = LL_BUILDER.CreateGEP(
1731  output_buffer_byte_stream->getType()->getScalarType()->getPointerElementType(),
1732  output_buffer_byte_stream,
1733  byte_offset);
1734  agg_col_ptr = LL_BUILDER.CreateBitCast(
1735  output_ptr,
1736  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1737  agg_col_ptr->setName("out_ptr_target_" + std::to_string(target_idx));
1738  } else {
1739  auto const col_off_in_bytes = query_mem_desc.getColOffInBytes(agg_out_off);
1740  auto const col_off = col_off_in_bytes / chosen_bytes;
1741  auto const col_rem = col_off_in_bytes % chosen_bytes;
1742  CHECK_EQ(col_rem, 0u) << col_off_in_bytes << " % " << chosen_bytes;
1743  CHECK(std::get<1>(agg_out_ptr_w_idx));
1744  auto* agg_out_idx = LL_BUILDER.CreateZExt(
1745  std::get<1>(agg_out_ptr_w_idx),
1746  get_int_type(8 * sizeof(col_off), executor_->cgen_state_->context_));
1747  auto* offset = LL_BUILDER.CreateAdd(agg_out_idx, LL_INT(col_off));
1748  auto* bit_cast = LL_BUILDER.CreateBitCast(
1749  std::get<0>(agg_out_ptr_w_idx),
1750  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1751  agg_col_ptr = LL_BUILDER.CreateGEP(
1752  bit_cast->getType()->getScalarType()->getPointerElementType(),
1753  bit_cast,
1754  offset);
1755  }
1756  } else {
1757  auto const col_off_in_bytes = query_mem_desc.getColOnlyOffInBytes(agg_out_off);
1758  auto const col_off = col_off_in_bytes / chosen_bytes;
1759  auto const col_rem = col_off_in_bytes % chosen_bytes;
1760  CHECK_EQ(col_rem, 0u) << col_off_in_bytes << " % " << chosen_bytes;
1761  auto* bit_cast = LL_BUILDER.CreateBitCast(
1762  std::get<0>(agg_out_ptr_w_idx),
1763  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1764  agg_col_ptr = LL_BUILDER.CreateGEP(
1765  bit_cast->getType()->getScalarType()->getPointerElementType(),
1766  bit_cast,
1767  LL_INT(col_off));
1768  }
1769  CHECK(agg_col_ptr);
1770  return agg_col_ptr;
1771 }
1772 
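// --- Editor's sketch (illustrative, not part of GroupByAndAggregate.cpp) -----
// Host-side equivalent of the address arithmetic in the columnar projection
// branch of codegenAggColumnPtr() above: col_off is the column's byte offset
// into the output buffer and chosen_bytes is the power-of-two slot width, so
// the CreateShl by log2(chosen_bytes) is simply out_row_idx * chosen_bytes.
#include <cstddef>
#include <cstdint>

int8_t* agg_col_ptr_columnar_sketch(int8_t* output_buffer_byte_stream,
                                    const size_t out_row_idx,
                                    const size_t col_off,
                                    const size_t chosen_bytes) {
  const size_t byte_offset = out_row_idx * chosen_bytes + col_off;
  // The generated code then bitcasts this address to an iN* matching chosen_bytes.
  return output_buffer_byte_stream + byte_offset;
}
// ------------------------------------------------------------------------------
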
1773 void GroupByAndAggregate::codegenEstimator(std::stack<llvm::BasicBlock*>& array_loops,
1774  DiamondCodegen& diamond_codegen,
1775  const QueryMemoryDescriptor& query_mem_desc,
1776  const CompilationOptions& co) {
1777  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1778  const auto& estimator_arg = ra_exe_unit_.estimator->getArgument();
1779  auto estimator_comp_count_lv = LL_INT(static_cast<int32_t>(estimator_arg.size()));
1780  auto estimator_key_lv = LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT),
1781  estimator_comp_count_lv);
1782  int32_t subkey_idx = 0;
1783  for (const auto& estimator_arg_comp : estimator_arg) {
1784  const auto estimator_arg_comp_lvs =
1785  executor_->groupByColumnCodegen(estimator_arg_comp.get(),
1786  query_mem_desc.getEffectiveKeyWidth(),
1787  co,
1788  false,
1789  0,
1790  diamond_codegen,
1791  array_loops,
1792  true);
1793  CHECK(!estimator_arg_comp_lvs.original_value);
1794  const auto estimator_arg_comp_lv = estimator_arg_comp_lvs.translated_value;
1795  // store the sub-key to the buffer
1796  LL_BUILDER.CreateStore(
1797  estimator_arg_comp_lv,
1798  LL_BUILDER.CreateGEP(
1799  estimator_key_lv->getType()->getScalarType()->getPointerElementType(),
1800  estimator_key_lv,
1801  LL_INT(subkey_idx++)));
1802  }
1803  const auto int8_ptr_ty = llvm::PointerType::get(get_int_type(8, LL_CONTEXT), 0);
1804  const auto bitmap = LL_BUILDER.CreateBitCast(&*ROW_FUNC->arg_begin(), int8_ptr_ty);
1805  const auto key_bytes = LL_BUILDER.CreateBitCast(estimator_key_lv, int8_ptr_ty);
1806  const auto estimator_comp_bytes_lv =
1807  LL_INT(static_cast<int32_t>(estimator_arg.size() * sizeof(int64_t)));
1808  const auto bitmap_size_lv =
1809  LL_INT(static_cast<uint32_t>(ra_exe_unit_.estimator->getBufferSize()));
1810  emitCall(ra_exe_unit_.estimator->getRuntimeFunctionName(),
1811  {bitmap, &*bitmap_size_lv, key_bytes, &*estimator_comp_bytes_lv});
1812 }
1813 
1814 extern "C" RUNTIME_EXPORT void agg_count_distinct(int64_t* agg, const int64_t val) {
1815  reinterpret_cast<CountDistinctSet*>(*agg)->insert(val);
1816 }
1817 
1818 extern "C" RUNTIME_EXPORT void agg_count_distinct_skip_val(int64_t* agg,
1819  const int64_t val,
1820  const int64_t skip_val) {
1821  if (val != skip_val) {
1822  agg_count_distinct(agg, val);
1823  }
1824 }
1825 
1826 extern "C" RUNTIME_EXPORT void agg_approx_quantile(int64_t* agg, const double val) {
1827  auto* t_digest = reinterpret_cast<quantile::TDigest*>(*agg);
1828  t_digest->allocate();
1829  t_digest->add(val);
1830 }
1831 
1832 extern "C" RUNTIME_EXPORT void agg_mode_func(int64_t* agg, const int64_t val) {
1833  auto* mode_map = reinterpret_cast<AggMode*>(*agg);
1834  mode_map->add(val);
1835 }
1836 
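// --- Editor's sketch (hypothetical host-side usage, not part of this file) ----
// Each runtime helper above treats the 64-bit aggregate slot as a pointer to a
// heap-allocated accumulator (CountDistinctSet, quantile::TDigest, AggMode), so
// the slot must be seeded with the accumulator's address before the helper runs.
void count_distinct_slot_usage_sketch(CountDistinctSet& set) {
  int64_t slot = reinterpret_cast<int64_t>(&set);           // slot holds the accumulator address
  agg_count_distinct(&slot, 42);                            // inserts 42 into the set
  agg_count_distinct_skip_val(&slot, -1, /*skip_val=*/-1);  // no-op: value equals skip_val
}
// ------------------------------------------------------------------------------
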
1837 void GroupByAndAggregate::codegenCountDistinct(
1838  const size_t target_idx,
1839  const Analyzer::Expr* target_expr,
1840  std::vector<llvm::Value*>& agg_args,
1841  const QueryMemoryDescriptor& query_mem_desc,
1842  const ExecutorDeviceType device_type) {
1843  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1844  const auto agg_info = get_target_info(target_expr, g_bigint_count);
1845  const auto& arg_ti =
1846  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1847  if (arg_ti.is_fp()) {
1848  agg_args.back() = executor_->cgen_state_->ir_builder_.CreateBitCast(
1849  agg_args.back(), get_int_type(64, executor_->cgen_state_->context_));
1850  }
1851  const auto& count_distinct_descriptor =
1852  query_mem_desc.getCountDistinctDescriptor(target_idx);
1853  CHECK(count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid);
1854  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
1855  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1856  agg_args.push_back(LL_INT(int32_t(count_distinct_descriptor.bitmap_sz_bits)));
1857  if (device_type == ExecutorDeviceType::GPU) {
1858  const auto base_dev_addr = getAdditionalLiteral(-1);
1859  const auto base_host_addr = getAdditionalLiteral(-2);
1860  agg_args.push_back(base_dev_addr);
1861  agg_args.push_back(base_host_addr);
1862  emitCall("agg_approximate_count_distinct_gpu", agg_args);
1863  } else {
1864  emitCall("agg_approximate_count_distinct", agg_args);
1865  }
1866  return;
1867  }
1868  std::string agg_fname{"agg_count_distinct"};
1869  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1870  agg_fname += "_bitmap";
1871  agg_args.push_back(LL_INT(static_cast<int64_t>(count_distinct_descriptor.min_val)));
1872  }
1873  if (agg_info.skip_null_val) {
1874  auto null_lv = executor_->cgen_state_->castToTypeIn(
1875  (arg_ti.is_fp()
1876  ? static_cast<llvm::Value*>(executor_->cgen_state_->inlineFpNull(arg_ti))
1877  : static_cast<llvm::Value*>(executor_->cgen_state_->inlineIntNull(arg_ti))),
1878  64);
1879  null_lv = executor_->cgen_state_->ir_builder_.CreateBitCast(
1880  null_lv, get_int_type(64, executor_->cgen_state_->context_));
1881  agg_fname += "_skip_val";
1882  agg_args.push_back(null_lv);
1883  }
1884  if (device_type == ExecutorDeviceType::GPU) {
1885  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1886  agg_fname += "_gpu";
1887  const auto base_dev_addr = getAdditionalLiteral(-1);
1888  const auto base_host_addr = getAdditionalLiteral(-2);
1889  agg_args.push_back(base_dev_addr);
1890  agg_args.push_back(base_host_addr);
1891  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.sub_bitmap_count)));
1892  CHECK_EQ(size_t(0),
1893  count_distinct_descriptor.bitmapPaddedSizeBytes() %
1894  count_distinct_descriptor.sub_bitmap_count);
1895  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.bitmapPaddedSizeBytes() /
1896  count_distinct_descriptor.sub_bitmap_count)));
1897  }
1898  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1899  emitCall(agg_fname, agg_args);
1900  } else {
1901  executor_->cgen_state_->emitExternalCall(
1902  agg_fname, llvm::Type::getVoidTy(LL_CONTEXT), agg_args);
1903  }
1904 }
1905 
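// --- Editor's sketch (illustrative, not part of GroupByAndAggregate.cpp) -----
// How codegenCountDistinct() above assembles the runtime function name for a
// non-approximate COUNT(DISTINCT ...) target; each suffix mirrors the extra
// arguments pushed onto agg_args in the same order.
#include <string>

std::string count_distinct_runtime_name_sketch(const CountDistinctImplType impl_type,
                                               const bool skip_null_val,
                                               const ExecutorDeviceType device_type) {
  std::string fname{"agg_count_distinct"};
  if (impl_type == CountDistinctImplType::Bitmap) {
    fname += "_bitmap";    // bitmap-backed variant, takes the descriptor's min_val
  }
  if (skip_null_val) {
    fname += "_skip_val";  // takes the 64-bit null sentinel as a trailing argument
  }
  if (device_type == ExecutorDeviceType::GPU) {
    fname += "_gpu";       // GPU variant (bitmap only), takes device/host base addresses
  }
  return fname;
}
// ------------------------------------------------------------------------------
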
1906 void GroupByAndAggregate::codegenApproxQuantile(
1907  const size_t target_idx,
1908  const Analyzer::Expr* target_expr,
1909  std::vector<llvm::Value*>& agg_args,
1910  const QueryMemoryDescriptor& query_mem_desc,
1911  const ExecutorDeviceType device_type) {
1912  if (device_type == ExecutorDeviceType::GPU) {
1913  throw QueryMustRunOnCpu();
1914  }
1915  llvm::BasicBlock *calc, *skip{nullptr};
1916  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1917  auto const arg_ti =
1918  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1919  bool const nullable = !arg_ti.get_notnull();
1920 
1921  auto* cs = executor_->cgen_state_.get();
1922  auto& irb = cs->ir_builder_;
1923  if (nullable) {
1924  auto* const null_value = cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
1925  auto* const skip_cond = arg_ti.is_fp()
1926  ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
1927  : irb.CreateICmpEQ(agg_args.back(), null_value);
1928  calc = llvm::BasicBlock::Create(cs->context_, "calc_approx_quantile");
1929  skip = llvm::BasicBlock::Create(cs->context_, "skip_approx_quantile");
1930  irb.CreateCondBr(skip_cond, skip, calc);
1931  cs->current_func_->getBasicBlockList().push_back(calc);
1932  irb.SetInsertPoint(calc);
1933  }
1934  if (!arg_ti.is_fp()) {
1935  auto const agg_info = get_target_info(target_expr, g_bigint_count);
1936  agg_args.back() = executor_->castToFP(agg_args.back(), arg_ti, agg_info.sql_type);
1937  }
1938  cs->emitExternalCall(
1939  "agg_approx_quantile", llvm::Type::getVoidTy(cs->context_), agg_args);
1940  if (nullable) {
1941  irb.CreateBr(skip);
1942  cs->current_func_->getBasicBlockList().push_back(skip);
1943  irb.SetInsertPoint(skip);
1944  }
1945 }
1946 
1947 void GroupByAndAggregate::codegenMode(const size_t target_idx,
1948  const Analyzer::Expr* target_expr,
1949  std::vector<llvm::Value*>& agg_args,
1950  const QueryMemoryDescriptor& query_mem_desc,
1951  const ExecutorDeviceType device_type) {
1952  if (device_type == ExecutorDeviceType::GPU) {
1953  throw QueryMustRunOnCpu();
1954  }
1955  llvm::BasicBlock *calc, *skip{nullptr};
1956  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1957  auto const arg_ti =
1958  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1959  bool const nullable = !arg_ti.get_notnull();
1960  bool const is_fp = arg_ti.is_fp();
1961  auto* cs = executor_->cgen_state_.get();
1962  auto& irb = cs->ir_builder_;
1963  if (nullable) {
1964  auto* const null_value =
1965  is_fp ? cs->inlineNull(arg_ti) : cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
1966  auto* const skip_cond = is_fp ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
1967  : irb.CreateICmpEQ(agg_args.back(), null_value);
1968  calc = llvm::BasicBlock::Create(cs->context_, "calc_mode");
1969  skip = llvm::BasicBlock::Create(cs->context_, "skip_mode");
1970  irb.CreateCondBr(skip_cond, skip, calc);
1971  cs->current_func_->getBasicBlockList().push_back(calc);
1972  irb.SetInsertPoint(calc);
1973  }
1974  if (is_fp) {
1975  auto* const int_type = get_int_type(8 * arg_ti.get_size(), cs->context_);
1976  agg_args.back() = irb.CreateBitCast(agg_args.back(), int_type);
1977  }
1978  // "agg_mode" collides with existing names, so non-standard suffix "_func" is added.
1979  cs->emitExternalCall("agg_mode_func", llvm::Type::getVoidTy(cs->context_), agg_args);
1980  if (nullable) {
1981  irb.CreateBr(skip);
1982  cs->current_func_->getBasicBlockList().push_back(skip);
1983  irb.SetInsertPoint(skip);
1984  }
1985 }
1986 
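// --- Editor's sketch (illustrative, not part of GroupByAndAggregate.cpp) -----
// Scalar equivalent of the null-skip diamond generated by codegenApproxQuantile()
// and codegenMode() above: a nullable argument is compared against its inlined
// null sentinel, the accumulator update lives in the "calc_*" block, and the
// "skip_*" block bypasses it entirely.
void mode_update_skipping_nulls_sketch(int64_t* agg,
                                       const int64_t value,
                                       const int64_t null_sentinel) {
  if (value == null_sentinel) {
    return;                   // "skip_mode": leave the accumulator untouched
  }
  agg_mode_func(agg, value);  // "calc_mode": feed the value to the AggMode accumulator
}
// ------------------------------------------------------------------------------
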
1987 llvm::Value* GroupByAndAggregate::getAdditionalLiteral(const int32_t off) {
1988  CHECK_LT(off, 0);
1989  const auto lit_buff_lv = get_arg_by_name(ROW_FUNC, "literals");
1990  auto* bit_cast = LL_BUILDER.CreateBitCast(
1991  lit_buff_lv, llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0));
1992  auto* gep =
1993  LL_BUILDER.CreateGEP(bit_cast->getType()->getScalarType()->getPointerElementType(),
1994  bit_cast,
1995  LL_INT(off));
1996  return LL_BUILDER.CreateLoad(gep->getType()->getPointerElementType(), gep);
1997 }
1998 
1999 std::vector<llvm::Value*> GroupByAndAggregate::codegenAggArg(
2000  const Analyzer::Expr* target_expr,
2001  const CompilationOptions& co) {
2002  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2003  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
2004  const auto func_expr = dynamic_cast<const Analyzer::FunctionOper*>(target_expr);
2005  const auto arr_expr = dynamic_cast<const Analyzer::ArrayExpr*>(target_expr);
2006 
2007  // TODO(alex): handle arrays uniformly?
2008  CodeGenerator code_generator(executor_);
2009  if (target_expr) {
2010  const auto& target_ti = target_expr->get_type_info();
2011  if (target_ti.is_buffer() &&
2012  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
2013  const auto target_lvs =
2014  agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
2015  : code_generator.codegen(
2016  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
2017  if (!func_expr && !arr_expr) {
2018  // Anything handled via the chunk transport here was generated from a source
2019  // other than an ARRAY[] expression
2020  if (target_ti.is_bytes()) {
2021  CHECK_EQ(size_t(3), target_lvs.size());
2022  return {target_lvs[1], target_lvs[2]};
2023  }
2024  CHECK(target_ti.is_array());
2025  CHECK_EQ(size_t(1), target_lvs.size());
2026  CHECK(!agg_expr || agg_expr->get_aggtype() == kSAMPLE);
2027  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
2028  const auto i8p_ty =
2029  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
2030  const auto& elem_ti = target_ti.get_elem_type();
2031  return {
2032  executor_->cgen_state_->emitExternalCall(
2033  "array_buff",
2034  i8p_ty,
2035  {target_lvs.front(), code_generator.posArg(target_expr)}),
2036  executor_->cgen_state_->emitExternalCall(
2037  "array_size",
2038  i32_ty,
2039  {target_lvs.front(),
2040  code_generator.posArg(target_expr),
2041  executor_->cgen_state_->llInt(log2_bytes(elem_ti.get_logical_size()))})};
2042  } else {
2043  if (agg_expr) {
2044  throw std::runtime_error(
2045  "Using array[] operator as argument to an aggregate operator is not "
2046  "supported");
2047  }
2048  CHECK(func_expr || arr_expr);
2049  if (dynamic_cast<const Analyzer::FunctionOper*>(target_expr)) {
2050  CHECK_EQ(size_t(1), target_lvs.size());
2051  const auto prefix = target_ti.get_buffer_name();
2052  CHECK(target_ti.is_array() || target_ti.is_bytes());
2053  const auto target_lv = LL_BUILDER.CreateLoad(
2054  target_lvs[0]->getType()->getPointerElementType(), target_lvs[0]);
2055  // const auto target_lv_type = target_lvs[0]->getType();
2056  // CHECK(target_lv_type->isStructTy());
2057  // CHECK_EQ(target_lv_type->getNumContainedTypes(), 3u);
2058  const auto i8p_ty = llvm::PointerType::get(
2059  get_int_type(8, executor_->cgen_state_->context_), 0);
2060  const auto ptr = LL_BUILDER.CreatePointerCast(
2061  LL_BUILDER.CreateExtractValue(target_lv, 0), i8p_ty);
2062  const auto size = LL_BUILDER.CreateExtractValue(target_lv, 1);
2063  const auto null_flag = LL_BUILDER.CreateExtractValue(target_lv, 2);
2064  const auto nullcheck_ok_bb =
2065  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_nullcheck_ok_bb", CUR_FUNC);
2066  const auto nullcheck_fail_bb = llvm::BasicBlock::Create(
2067  LL_CONTEXT, prefix + "_nullcheck_fail_bb", CUR_FUNC);
2068 
2069  // TODO(adb): probably better to zext the bool
2070  const auto nullcheck = LL_BUILDER.CreateICmpEQ(
2071  null_flag, executor_->cgen_state_->llInt(static_cast<int8_t>(1)));
2072  LL_BUILDER.CreateCondBr(nullcheck, nullcheck_fail_bb, nullcheck_ok_bb);
2073 
2074  const auto ret_bb =
2075  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_return", CUR_FUNC);
2076  LL_BUILDER.SetInsertPoint(ret_bb);
2077  auto result_phi = LL_BUILDER.CreatePHI(i8p_ty, 2, prefix + "_ptr_return");
2078  result_phi->addIncoming(ptr, nullcheck_ok_bb);
2079  const auto null_arr_sentinel = LL_BUILDER.CreateIntToPtr(
2080  executor_->cgen_state_->llInt(static_cast<int8_t>(0)), i8p_ty);
2081  result_phi->addIncoming(null_arr_sentinel, nullcheck_fail_bb);
2082  LL_BUILDER.SetInsertPoint(nullcheck_ok_bb);
2083  executor_->cgen_state_->emitExternalCall(
2084  "register_buffer_with_executor_rsm",
2085  llvm::Type::getVoidTy(executor_->cgen_state_->context_),
2086  {executor_->cgen_state_->llInt(reinterpret_cast<int64_t>(executor_)), ptr});
2087  LL_BUILDER.CreateBr(ret_bb);
2088  LL_BUILDER.SetInsertPoint(nullcheck_fail_bb);
2089  LL_BUILDER.CreateBr(ret_bb);
2090 
2091  LL_BUILDER.SetInsertPoint(ret_bb);
2092  return {result_phi, size};
2093  }
2094  CHECK_EQ(size_t(2), target_lvs.size());
2095  return {target_lvs[0], target_lvs[1]};
2096  }
2097  }
2098  if (target_ti.is_geometry() &&
2099  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
2100  auto generate_coord_lvs =
2101  [&](auto* selected_target_expr,
2102  bool const fetch_columns) -> std::vector<llvm::Value*> {
2103  const auto target_lvs =
2104  code_generator.codegen(selected_target_expr, fetch_columns, co);
2105  if (dynamic_cast<const Analyzer::GeoOperator*>(target_expr) &&
2106  target_expr->get_type_info().is_geometry()) {
2107  // return a pointer to the temporary alloca
2108  return target_lvs;
2109  }
2110  const auto geo_uoper = dynamic_cast<const Analyzer::GeoUOper*>(target_expr);
2111  const auto geo_binoper = dynamic_cast<const Analyzer::GeoBinOper*>(target_expr);
2112  if (geo_uoper || geo_binoper) {
2113  CHECK(target_expr->get_type_info().is_geometry());
2114  CHECK_EQ(2 * static_cast<size_t>(target_ti.get_physical_coord_cols()),
2115  target_lvs.size());
2116  return target_lvs;
2117  }
2118  CHECK_EQ(static_cast<size_t>(target_ti.get_physical_coord_cols()),
2119  target_lvs.size());
2120 
2121  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
2122  const auto i8p_ty =
2123  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
2124  std::vector<llvm::Value*> coords;
2125  size_t ctr = 0;
2126  for (const auto& target_lv : target_lvs) {
2127  // TODO(adb): consider adding a utility to sqltypes so we can get the types of
2128  // the physical coords cols based on the sqltype (e.g. TINYINT for col 0, INT
2129  // for col 1 for pols / mpolys, etc). Hardcoding for now. first array is the
2130  // coords array (TINYINT). Subsequent arrays are regular INT.
2131 
2132  const size_t elem_sz = ctr == 0 ? 1 : 4;
2133  ctr++;
2134  int32_t fixlen = -1;
2135  if (target_ti.get_type() == kPOINT) {
2136  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
2137  if (col_var) {
2138  const auto coords_cd = executor_->getPhysicalColumnDescriptor(col_var, 1);
2139  if (coords_cd && coords_cd->columnType.get_type() == kARRAY) {
2140  fixlen = coords_cd->columnType.get_size();
2141  }
2142  }
2143  }
2144  if (fixlen > 0) {
2145  coords.push_back(executor_->cgen_state_->emitExternalCall(
2146  "fast_fixlen_array_buff",
2147  i8p_ty,
2148  {target_lv, code_generator.posArg(selected_target_expr)}));
2149  coords.push_back(executor_->cgen_state_->llInt(int64_t(fixlen)));
2150  continue;
2151  }
2152  coords.push_back(executor_->cgen_state_->emitExternalCall(
2153  "array_buff",
2154  i8p_ty,
2155  {target_lv, code_generator.posArg(selected_target_expr)}));
2156  coords.push_back(executor_->cgen_state_->emitExternalCall(
2157  "array_size",
2158  i32_ty,
2159  {target_lv,
2160  code_generator.posArg(selected_target_expr),
2161  executor_->cgen_state_->llInt(log2_bytes(elem_sz))}));
2162  }
2163  return coords;
2164  };
2165 
2166  if (agg_expr) {
2167  return generate_coord_lvs(agg_expr->get_arg(), true);
2168  } else {
2169  return generate_coord_lvs(target_expr,
2170  !executor_->plan_state_->allow_lazy_fetch_);
2171  }
2172  }
2173  }
2174  return agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
2175  : code_generator.codegen(
2176  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
2177 }
2178 
2179 llvm::Value* GroupByAndAggregate::emitCall(const std::string& fname,
2180  const std::vector<llvm::Value*>& args) {
2181  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2182  return executor_->cgen_state_->emitCall(fname, args);
2183 }
2184 
2185 void GroupByAndAggregate::checkErrorCode(llvm::Value* retCode) {
2186  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2187  auto zero_const = llvm::ConstantInt::get(retCode->getType(), 0, true);
2188  auto rc_check_condition = executor_->cgen_state_->ir_builder_.CreateICmp(
2189  llvm::ICmpInst::ICMP_EQ, retCode, zero_const);
2190 
2191  executor_->cgen_state_->emitErrorCheck(rc_check_condition, retCode, "rc");
2192 }
2193 
2194 #undef CUR_FUNC
2195 #undef ROW_FUNC
2196 #undef LL_FP
2197 #undef LL_INT
2198 #undef LL_BOOL
2199 #undef LL_BUILDER
2200 #undef LL_CONTEXT
2201 
2202 size_t GroupByAndAggregate::shard_count_for_top_groups(
2203  const RelAlgExecutionUnit& ra_exe_unit,
2204  const Catalog_Namespace::Catalog& catalog) {
2205  if (ra_exe_unit.sort_info.order_entries.size() != 1 || !ra_exe_unit.sort_info.limit) {
2206  return 0;
2207  }
2208  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
2209  const auto grouped_col_expr =
2210  dynamic_cast<const Analyzer::ColumnVar*>(group_expr.get());
2211  if (!grouped_col_expr) {
2212  continue;
2213  }
2214  if (grouped_col_expr->get_table_id() <= 0) {
2215  return 0;
2216  }
2217  const auto td = catalog.getMetadataForTable(grouped_col_expr->get_table_id());
2218  if (td->shardedColumnId == grouped_col_expr->get_column_id()) {
2219  return td->nShards;
2220  }
2221  }
2222  return 0;
2223 }
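
// --- Editor's note (hypothetical example, not part of GroupByAndAggregate.cpp) ---
// shard_count_for_top_groups() only returns a non-zero count for queries with a
// single ORDER BY entry, a LIMIT, and a group-by column that is the table's shard
// key, e.g. (assuming a table 'events' sharded on user_id):
//
//   SELECT user_id, COUNT(*) FROM events GROUP BY user_id ORDER BY 2 DESC LIMIT 10;
//
// In that case each of the table's nShards leaves can keep its own candidate top
// groups before the final merge; otherwise the function returns 0.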