OmniSciDB  c1a53651b2
GroupByAndAggregate.cpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "GroupByAndAggregate.h"
18 #include "AggregateUtils.h"
19 
20 #include "CardinalityEstimator.h"
21 #include "CodeGenerator.h"
23 #include "ExpressionRange.h"
24 #include "ExpressionRewrite.h"
25 #include "GpuInitGroups.h"
26 #include "InPlaceSort.h"
28 #include "MaxwellCodegenPatch.h"
30 #include "TargetExprBuilder.h"
31 
32 #include "../CudaMgr/CudaMgr.h"
33 #include "../Shared/checked_alloc.h"
34 #include "../Shared/funcannotations.h"
35 #include "../Utils/ChunkIter.h"
37 #include "Execute.h"
38 #include "QueryTemplateGenerator.h"
39 #include "RuntimeFunctions.h"
40 #include "Shared/misc.h"
41 #include "StreamingTopN.h"
42 #include "TopKSort.h"
43 #include "WindowContext.h"
44 
45 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
46 
47 #include <cstring> // strcat()
48 #include <limits>
49 #include <numeric>
50 #include <string_view>
51 #include <thread>
52 
53 bool g_cluster{false};
54 bool g_bigint_count{false};
57 extern int64_t g_bitmap_memory_limit;
58 extern size_t g_leaf_count;
59 
60 bool ColRangeInfo::isEmpty() const {
61  return min == 0 && max == -1;
62 }
63 
64 std::ostream& operator<<(std::ostream& out, const ColRangeInfo& info) {
65  out << "Hash Type = " << info.hash_type_ << " min = " << info.min
66  << " max = " << info.max << " bucket = " << info.bucket
67  << " has_nulls = " << info.has_nulls << "\n";
68  return out;
69 }
70 
71 std::ostream& operator<<(std::ostream& out, const CountDistinctImplType& type) {
72  switch (type) {
73  case CountDistinctImplType::Invalid:
74  out << "Invalid";
75  break;
76  case CountDistinctImplType::Bitmap:
77  out << "Bitmap";
78  break;
79  case CountDistinctImplType::UnorderedSet:
80  out << "UnorderedSet";
81  break;
82  default:
83  out << "<Unknown Type>";
84  break;
85  }
86  return out;
87 }
88 
89 std::ostream& operator<<(std::ostream& out, const CountDistinctDescriptor& desc) {
90  out << "Type = " << desc.impl_type_ << " min val = " << desc.min_val
91  << " bitmap_sz_bits = " << desc.bitmap_sz_bits
92  << " bool approximate = " << desc.approximate
93  << " device_type = " << desc.device_type
94  << " sub_bitmap_count = " << desc.sub_bitmap_count;
95  return out;
96 }
97 
98 namespace {
99 
100 int32_t get_agg_count(const std::vector<Analyzer::Expr*>& target_exprs) {
101  int32_t agg_count{0};
102  for (auto target_expr : target_exprs) {
103  CHECK(target_expr);
104  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
105  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
106  const auto& ti = target_expr->get_type_info();
107  if (ti.is_buffer()) {
108  agg_count += 2;
109  } else if (ti.is_geometry()) {
110  agg_count += ti.get_physical_coord_cols() * 2;
111  } else {
112  ++agg_count;
113  }
114  continue;
115  }
116  if (agg_expr && agg_expr->get_aggtype() == kAVG) {
117  agg_count += 2;
118  } else {
119  ++agg_count;
120  }
121  }
122  return agg_count;
123 }
124 
125 bool expr_is_rowid(const Analyzer::Expr* expr) {
126  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
127  if (!col) {
128  return false;
129  }
130  const auto cd = get_column_descriptor_maybe(col->getColumnKey());
131  if (!cd || !cd->isVirtualCol) {
132  return false;
133  }
134  CHECK_EQ("rowid", cd->columnName);
135  return true;
136 }
137 
138 bool has_count_distinct(const RelAlgExecutionUnit& ra_exe_unit) {
139  for (const auto& target_expr : ra_exe_unit.target_exprs) {
140  const auto agg_info = get_target_info(target_expr, g_bigint_count);
141  if (agg_info.is_agg && is_distinct_target(agg_info)) {
142  return true;
143  }
144  }
145  return false;
146 }
147 
148 bool is_column_range_too_big_for_perfect_hash(const ColRangeInfo& col_range_info,
149  const int64_t max_entry_count) {
150  try {
151  return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
152  checked_int64_t(col_range_info.min)) >= max_entry_count;
153  } catch (...) {
154  return true;
155  }
156 }
157 
158 bool cardinality_estimate_less_than_column_range(const int64_t cardinality_estimate,
159  const ColRangeInfo& col_range_info) {
160  try {
161  // the cardinality estimate is the size of the baseline hash table. further penalize
162  // the baseline hash table by a factor of 2x due to overhead in computing baseline
163  // hash. This has the overall effect of penalizing baseline hash over perfect hash by
164  // 4x; i.e. if the cardinality of the filtered data is less than 25% of the entry
165  // count of the column, we use baseline hash on the filtered set
166  return checked_int64_t(cardinality_estimate) * 2 <
167  static_cast<int64_t>(checked_int64_t(col_range_info.max) -
168  checked_int64_t(col_range_info.min));
169  } catch (...) {
170  return false;
171  }
172 }
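// Illustrative sketch (simplified, for exposition only): the 2x factor above means
// baseline hash is preferred only when the estimated cardinality is less than half of
// the column range. For col_range_info = {min = 0, max = 1'000'000}, an estimate of
// 400'000 gives 800'000 < 1'000'000 (use baseline hash), while an estimate of 600'000
// gives 1'200'000 >= 1'000'000 (stay with perfect hash). A minimal standalone version
// of the same check, with the overflow handling stripped out, would be:
//
//   bool prefer_baseline_hash(int64_t cardinality_estimate, int64_t min, int64_t max) {
//     return cardinality_estimate * 2 < (max - min);
//   }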
173 
174 ColRangeInfo get_expr_range_info(const RelAlgExecutionUnit& ra_exe_unit,
175  const std::vector<InputTableInfo>& query_infos,
176  const Analyzer::Expr* expr,
177  Executor* executor) {
178  if (!expr) {
179  return {QueryDescriptionType::Projection, 0, 0, 0, false};
180  }
181 
182  const auto expr_range = getExpressionRange(
183  expr, query_infos, executor, boost::make_optional(ra_exe_unit.simple_quals));
184  switch (expr_range.getType()) {
185  case ExpressionRangeType::Integer: {
186  if (expr_range.getIntMin() > expr_range.getIntMax()) {
187  return {
188  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
189  }
190  return {QueryDescriptionType::GroupByPerfectHash,
191  expr_range.getIntMin(),
192  expr_range.getIntMax(),
193  expr_range.getBucket(),
194  expr_range.hasNulls()};
195  }
196  case ExpressionRangeType::Float:
197  case ExpressionRangeType::Double: {
198  if (expr_range.getFpMin() > expr_range.getFpMax()) {
199  return {
200  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
201  }
202  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
203  }
204  case ExpressionRangeType::Invalid:
205  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
206  default:
207  CHECK(false);
208  }
209  CHECK(false);
210  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
211 }
212 
213 } // namespace
214 
215 ColRangeInfo GroupByAndAggregate::getColRangeInfo() {
216  // Use baseline layout more eagerly on the GPU if the query uses count distinct,
217  // because our HyperLogLog implementation is 4x less memory efficient on GPU.
218  // Technically, this only applies to APPROX_COUNT_DISTINCT, but in practice we
219  // can expect this to be true anyway for grouped queries since the precise version
220  // uses significantly more memory.
221  const int64_t baseline_threshold =
223  if (ra_exe_unit_.groupby_exprs.size() != 1) {
224  try {
225  checked_int64_t cardinality{1};
226  bool has_nulls{false};
227  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
228  auto col_range_info = get_expr_range_info(
229  ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
230  if (col_range_info.hash_type_ != QueryDescriptionType::GroupByPerfectHash) {
231  // going through baseline hash if a non-integer type is encountered
232  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
233  }
234  auto crt_col_cardinality = getBucketedCardinality(col_range_info);
235  CHECK_GE(crt_col_cardinality, 0);
236  cardinality *= crt_col_cardinality;
237  if (col_range_info.has_nulls) {
238  has_nulls = true;
239  }
240  }
241  // For zero or high cardinalities, use baseline layout.
242  if (!cardinality || cardinality > baseline_threshold) {
243  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
244  }
245  return {QueryDescriptionType::GroupByPerfectHash,
246  0,
247  int64_t(cardinality),
248  0,
249  has_nulls};
250  } catch (...) { // overflow when computing cardinality
251  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
252  }
253  }
254  // For single-column group by on high-precision timestamps, force baseline hash due to the wide ranges
255  // we are likely to encounter when applying quals to the expression range
256  // TODO: consider allowing TIMESTAMP(9) (nanoseconds) with quals to use perfect hash if
257  // the range is small enough
258  if (ra_exe_unit_.groupby_exprs.front() &&
259  ra_exe_unit_.groupby_exprs.front()->get_type_info().is_high_precision_timestamp() &&
260  ra_exe_unit_.simple_quals.size() > 0) {
261  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
262  }
263  const auto col_range_info = get_expr_range_info(
264  ra_exe_unit_, query_infos_, ra_exe_unit_.groupby_exprs.front().get(), executor_);
265  if (!ra_exe_unit_.groupby_exprs.front()) {
266  return col_range_info;
267  }
268  static const int64_t MAX_BUFFER_SIZE = 1 << 30;
269  const int64_t col_count =
271  int64_t max_entry_count = MAX_BUFFER_SIZE / (col_count * sizeof(int64_t));
273  max_entry_count = std::min(max_entry_count, baseline_threshold);
274  }
275  const auto& groupby_expr_ti = ra_exe_unit_.groupby_exprs.front()->get_type_info();
276  if (groupby_expr_ti.is_string() && !col_range_info.bucket) {
277  CHECK(groupby_expr_ti.get_compression() == kENCODING_DICT);
278 
279  const bool has_filters =
280  !ra_exe_unit_.quals.empty() || !ra_exe_unit_.simple_quals.empty();
281  if (has_filters &&
282  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
283  // if filters are present, we can use the filter to narrow the cardinality of the
284  // group by in the case of ranges too big for perfect hash. Otherwise, we are better
285  // off attempting perfect hash (since we know the range will be made of
286  // monotonically increasing numbers from min to max for dictionary encoded strings)
287  // and failing later due to excessive memory use.
288  // Check the conditions where baseline hash can provide a performance increase and
289  // return baseline hash (potentially forcing an estimator query) as the range type.
290  // Otherwise, return col_range_info which will likely be perfect hash, though could
291  // be baseline from a previous call of this function prior to the estimator query.
292  if (!ra_exe_unit_.sort_info.order_entries.empty()) {
293  // TODO(adb): allow some sorts to pass through this block by centralizing sort
294  // algorithm decision making
295  if (has_count_distinct(ra_exe_unit_) &&
296  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
297  // always use baseline hash for column range too big for perfect hash with count
298  // distinct descriptors. We will need 8GB of CPU memory minimum for the perfect
299  // hash group by in this case.
300  return {QueryDescriptionType::GroupByBaselineHash,
301  col_range_info.min,
302  col_range_info.max,
303  0,
304  col_range_info.has_nulls};
305  } else {
306  // use original col range for sort
307  return col_range_info;
308  }
309  }
310  // if filters are present and the filtered range is less than the cardinality of
311  // the column, consider baseline hash
312  if (group_cardinality_estimation_ &&
313  cardinality_estimate_less_than_column_range(*group_cardinality_estimation_,
314  col_range_info)) {
315  return {QueryDescriptionType::GroupByBaselineHash,
316  col_range_info.min,
317  col_range_info.max,
318  0,
319  col_range_info.has_nulls};
320  }
321  }
322  } else if ((!expr_is_rowid(ra_exe_unit_.groupby_exprs.front().get())) &&
323  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count) &&
324  !col_range_info.bucket) {
325  return {QueryDescriptionType::GroupByBaselineHash,
326  col_range_info.min,
327  col_range_info.max,
328  0,
329  col_range_info.has_nulls};
330  }
331  return col_range_info;
332 }
333 
334 int64_t GroupByAndAggregate::getBucketedCardinality(const ColRangeInfo& col_range_info) {
335  checked_int64_t crt_col_cardinality =
336  checked_int64_t(col_range_info.max) - checked_int64_t(col_range_info.min);
337  if (col_range_info.bucket) {
338  crt_col_cardinality /= col_range_info.bucket;
339  }
340  return static_cast<int64_t>(crt_col_cardinality +
341  (1 + (col_range_info.has_nulls ? 1 : 0)));
342 }
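// Worked example of getBucketedCardinality(): for col_range_info = {min = 10,
// max = 109, bucket = 10, has_nulls = true} the range is 109 - 10 = 99, bucketing
// gives 99 / 10 = 9, and the result is 9 + 1 + 1 = 11 (one extra slot for the null
// key). The checked_int64_t arithmetic throws on overflow, which callers treat as a
// signal to fall back to baseline hash.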
343 
344 namespace {
345 // Like getBucketedCardinality() without counting nulls.
346 int64_t get_bucketed_cardinality_without_nulls(const ColRangeInfo& col_range_info) {
347  if (col_range_info.min <= col_range_info.max) {
348  size_t size = col_range_info.max - col_range_info.min;
349  if (col_range_info.bucket) {
350  size /= col_range_info.bucket;
351  }
352  if (size >= static_cast<size_t>(std::numeric_limits<int64_t>::max())) {
353  // try to use unordered_set instead of crashing due to CHECK failure
354  // i.e., CHECK_LT(size, std::numeric_limits<int64_t>::max());
355  return 0;
356  }
357  return static_cast<int64_t>(size + 1);
358  } else {
359  return 0;
360  }
361 }
362 } // namespace
363 
364 #define LL_CONTEXT executor_->cgen_state_->context_
365 #define LL_BUILDER executor_->cgen_state_->ir_builder_
366 #define LL_BOOL(v) executor_->cgen_state_->llBool(v)
367 #define LL_INT(v) executor_->cgen_state_->llInt(v)
368 #define LL_FP(v) executor_->cgen_state_->llFp(v)
369 #define ROW_FUNC executor_->cgen_state_->row_func_
370 #define CUR_FUNC executor_->cgen_state_->current_func_
371 
372 GroupByAndAggregate::GroupByAndAggregate(
373  Executor* executor,
374  const ExecutorDeviceType device_type,
375  const RelAlgExecutionUnit& ra_exe_unit,
376  const std::vector<InputTableInfo>& query_infos,
377  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
378  const std::optional<int64_t>& group_cardinality_estimation)
379  : executor_(executor)
380  , ra_exe_unit_(ra_exe_unit)
381  , query_infos_(query_infos)
382  , row_set_mem_owner_(row_set_mem_owner)
383  , device_type_(device_type)
384  , group_cardinality_estimation_(group_cardinality_estimation) {
385  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
386  if (!groupby_expr) {
387  continue;
388  }
389  const auto& groupby_ti = groupby_expr->get_type_info();
390  if (groupby_ti.is_bytes()) {
391  throw std::runtime_error(
392  "Cannot group by string columns which are not dictionary encoded.");
393  }
394  if (groupby_ti.is_buffer()) {
395  throw std::runtime_error("Group by buffer not supported");
396  }
397  if (groupby_ti.is_geometry()) {
398  throw std::runtime_error("Group by geometry not supported");
399  }
400  }
401 }
402 
403 int64_t GroupByAndAggregate::getShardedTopBucket(const ColRangeInfo& col_range_info,
404  const size_t shard_count) const {
405  size_t device_count{0};
406  if (device_type_ == ExecutorDeviceType::GPU) {
407  device_count = executor_->cudaMgr()->getDeviceCount();
408  CHECK_GT(device_count, 0u);
409  }
410 
411  int64_t bucket{col_range_info.bucket};
412 
413  if (shard_count) {
414  CHECK(!col_range_info.bucket);
415  /*
416  when a node has fewer devices than shard count,
417  a) In a distributed setup, the minimum distance between two keys would be
418  device_count because shards are stored consecutively across the physical tables,
419  i.e if a shard column has values 0 to 9, and 3 shards on each leaf, then node 1
420  would have values: 0,1,2,6,7,8 and node 2 would have values: 3,4,5,9. If each leaf
421  node has only 1 device, in this case, all the keys from each node are loaded on
422  the device each.
423 
424  b) In a single node setup, the distance would be minimum of device_count or
425  difference of device_count - shard_count. For example: If a single node server
426  running on 3 devices a shard column has values 0 to 9 in a table with 4 shards,
427  device to fragment keys mapping would be: device 1 - 4,8,3,7 device 2 - 1,5,9
428  device 3 - 2, 6 The bucket value would be 4(shards) - 3(devices) = 1 i.e. minimum
429  of device_count or difference.
430 
431  When a node has device count equal to or more than shard count then the
432  minimum distance is always at least shard_count * no of leaf nodes.
433  */
434  if (device_count < shard_count) {
435  bucket = g_leaf_count ? std::max(device_count, static_cast<size_t>(1))
436  : std::min(device_count, shard_count - device_count);
437  } else {
438  bucket = shard_count * std::max(g_leaf_count, static_cast<size_t>(1));
439  }
440  }
441 
442  return bucket;
443 }
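// Worked examples for the sharded top-k bucket above (assuming col_range_info.bucket
// is 0, as required by the CHECK):
//   single node (g_leaf_count == 0), 4 shards, 3 devices:
//     device_count < shard_count  ->  bucket = min(3, 4 - 3) = 1
//   distributed (g_leaf_count == 2), 4 shards, 3 devices per leaf:
//     device_count < shard_count  ->  bucket = max(3, 1) = 3
//   distributed (g_leaf_count == 2), 4 shards, 8 devices per leaf:
//     device_count >= shard_count ->  bucket = 4 * max(2, 1) = 8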
444 
445 namespace {
446 
456 KeylessInfo get_keyless_info(const RelAlgExecutionUnit& ra_exe_unit,
457  const std::vector<InputTableInfo>& query_infos,
458  const bool is_group_by,
459  Executor* executor) {
460  bool keyless{true}, found{false};
461  int32_t num_agg_expr{0};
462  int32_t index{0};
463  for (const auto target_expr : ra_exe_unit.target_exprs) {
464  const auto agg_info = get_target_info(target_expr, g_bigint_count);
465  const auto chosen_type = get_compact_type(agg_info);
466  if (agg_info.is_agg) {
467  num_agg_expr++;
468  }
469  if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
470  auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
471  CHECK(agg_expr);
472  const auto arg_expr = agg_arg(target_expr);
473  const bool float_argument_input = takes_float_argument(agg_info);
474  switch (agg_info.agg_kind) {
475  case kAVG:
476  ++index;
477  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
478  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
479  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
480  expr_range_info.hasNulls()) {
481  break;
482  }
483  }
484  found = true;
485  break;
486  case kCOUNT:
487  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
488  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
489  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
490  expr_range_info.hasNulls()) {
491  break;
492  }
493  }
494  found = true;
495  break;
496  case kSUM: {
497  auto arg_ti = arg_expr->get_type_info();
498  if (constrained_not_null(arg_expr, ra_exe_unit.quals)) {
499  arg_ti.set_notnull(true);
500  }
501  if (!arg_ti.get_notnull()) {
502  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
503  if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
504  !expr_range_info.hasNulls()) {
505  found = true;
506  }
507  } else {
508  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
509  switch (expr_range_info.getType()) {
510  case ExpressionRangeType::Float:
511  case ExpressionRangeType::Double:
512  if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
513  found = true;
514  }
515  break;
516  case ExpressionRangeType::Integer:
517  if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
518  found = true;
519  }
520  break;
521  default:
522  break;
523  }
524  }
525  break;
526  }
527  case kMIN: {
528  CHECK(agg_expr && agg_expr->get_arg());
529  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
530  if (arg_ti.is_string() || arg_ti.is_buffer()) {
531  break;
532  }
533  auto expr_range_info =
534  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
535  auto init_max = get_agg_initial_val(agg_info.agg_kind,
536  chosen_type,
537  is_group_by || float_argument_input,
538  float_argument_input ? sizeof(float) : 8);
539  switch (expr_range_info.getType()) {
540  case ExpressionRangeType::Float:
541  case ExpressionRangeType::Double: {
542  auto double_max =
543  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
544  if (expr_range_info.getFpMax() < double_max) {
545  found = true;
546  }
547  break;
548  }
549  case ExpressionRangeType::Integer:
550  if (expr_range_info.getIntMax() < init_max) {
551  found = true;
552  }
553  break;
554  default:
555  break;
556  }
557  break;
558  }
559  case kMAX: {
560  CHECK(agg_expr && agg_expr->get_arg());
561  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
562  if (arg_ti.is_string() || arg_ti.is_buffer()) {
563  break;
564  }
565  auto expr_range_info =
566  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
567  // NULL sentinel and init value for kMAX are identical, which results in
568  // ambiguity in detecting empty keys in the presence of nulls.
569  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
570  expr_range_info.hasNulls()) {
571  break;
572  }
573  auto init_min = get_agg_initial_val(agg_info.agg_kind,
574  chosen_type,
575  is_group_by || float_argument_input,
576  float_argument_input ? sizeof(float) : 8);
577  switch (expr_range_info.getType()) {
578  case ExpressionRangeType::Float:
579  case ExpressionRangeType::Double: {
580  auto double_min =
581  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
582  if (expr_range_info.getFpMin() > double_min) {
583  found = true;
584  }
585  break;
586  }
587  case ExpressionRangeType::Integer:
588  if (expr_range_info.getIntMin() > init_min) {
589  found = true;
590  }
591  break;
592  default:
593  break;
594  }
595  break;
596  }
597  default:
598  keyless = false;
599  break;
600  }
601  }
602  if (!keyless) {
603  break;
604  }
605  if (!found) {
606  ++index;
607  }
608  }
609 
610  // shouldn't use keyless for projection only
611  return {
612  keyless && found,
613  index,
614  };
615 }
616 
617 CountDistinctDescriptors init_count_distinct_descriptors(
618  const RelAlgExecutionUnit& ra_exe_unit,
619  const std::vector<InputTableInfo>& query_infos,
620  const ColRangeInfo& group_by_range_info,
621  const ExecutorDeviceType device_type,
622  Executor* executor) {
623  CountDistinctDescriptors count_distinct_descriptors;
624  auto compute_bytes_per_group =
625  [](size_t bitmap_sz, size_t sub_bitmap_count, ExecutorDeviceType device_type) {
626  size_t effective_size_bytes = (bitmap_sz + 7) / 8;
627  const auto padded_size =
628  (device_type == ExecutorDeviceType::GPU || sub_bitmap_count > 1)
629  ? align_to_int64(effective_size_bytes)
630  : effective_size_bytes;
631  return padded_size * sub_bitmap_count;
632  };
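  // Worked example of compute_bytes_per_group: a 1000-bit bitmap needs
  // (1000 + 7) / 8 = 125 bytes; on GPU (or with sub_bitmap_count > 1) the size is
  // padded by align_to_int64 to 128 bytes, so sub_bitmap_count = 4 yields
  // 128 * 4 = 512 bytes per group. This per-group footprint is later multiplied by the
  // maximum number of groups and compared against g_bitmap_memory_limit.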
633  for (size_t i = 0; i < ra_exe_unit.target_exprs.size(); i++) {
634  const auto target_expr = ra_exe_unit.target_exprs[i];
635  auto agg_info = get_target_info(target_expr, g_bigint_count);
636  if (is_distinct_target(agg_info)) {
637  CHECK(agg_info.is_agg);
638  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
639  const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
640  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
641  if (arg_ti.is_bytes()) {
642  throw std::runtime_error(
643  "Strings must be dictionary-encoded for COUNT(DISTINCT).");
644  }
645  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_buffer()) {
646  throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
647  }
648  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
649  throw std::runtime_error(
650  "APPROX_COUNT_DISTINCT on geometry columns not supported");
651  }
652  if (agg_info.is_distinct && arg_ti.is_geometry()) {
653  throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
654  }
655  ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
656  auto arg_range_info =
657  arg_ti.is_fp() ? no_range_info
658  : get_expr_range_info(
659  ra_exe_unit, query_infos, agg_expr->get_arg(), executor);
660  const auto it = ra_exe_unit.target_exprs_original_type_infos.find(i);
661  if (it != ra_exe_unit.target_exprs_original_type_infos.end()) {
662  const auto& original_target_expr_ti = it->second;
663  if (arg_ti.is_integer() && original_target_expr_ti.get_type() == kDATE &&
664  original_target_expr_ti.get_compression() == kENCODING_DATE_IN_DAYS) {
665  // manually encode the col range of date col if necessary
666  // (see conditionally_change_arg_to_int_type function in RelAlgExecutor.cpp)
667  auto is_date_value_not_encoded = [&original_target_expr_ti](int64_t date_val) {
668  if (original_target_expr_ti.get_comp_param() == 16) {
669  return date_val < INT16_MIN || date_val > INT16_MAX;
670  } else {
671  return date_val < INT32_MIN || date_val > INT32_MAX;
672  }
673  };
674  if (is_date_value_not_encoded(arg_range_info.min)) {
675  // chunk metadata of the date column contains decoded value
676  // so we manually encode it again here to represent its column range correctly
677  arg_range_info.min =
679  }
680  if (is_date_value_not_encoded(arg_range_info.max)) {
681  arg_range_info.max =
683  }
684  // now we manually encode the value, so we need to invalidate bucket value
685  // i.e., 86400 -> 0, to correctly calculate the size of the bitmap
686  arg_range_info.bucket = 0;
687  }
688  }
689 
690  CountDistinctImplType count_distinct_impl_type{CountDistinctImplType::UnorderedSet};
691  int64_t bitmap_sz_bits{0};
692  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
693  const auto error_rate_expr = agg_expr->get_arg1();
694  if (error_rate_expr) {
695  CHECK(error_rate_expr->get_type_info().get_type() == kINT);
696  auto const error_rate =
697  dynamic_cast<Analyzer::Constant const*>(error_rate_expr.get());
698  CHECK(error_rate);
699  CHECK_GE(error_rate->get_constval().intval, 1);
700  bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
701  } else {
702  bitmap_sz_bits = g_hll_precision_bits;
703  }
704  }
705  if (arg_range_info.isEmpty()) {
706  count_distinct_descriptors.emplace_back(
707  CountDistinctDescriptor{CountDistinctImplType::Bitmap,
708  0,
709  64,
710  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
711  device_type,
712  1});
713  continue;
714  }
715  const auto sub_bitmap_count =
716  get_count_distinct_sub_bitmap_count(bitmap_sz_bits, ra_exe_unit, device_type);
717  size_t worst_case_num_groups{1};
718  if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
719  !(arg_ti.is_buffer() || arg_ti.is_geometry())) { // TODO(alex): allow bitmap
720  // implementation for arrays
721  count_distinct_impl_type = CountDistinctImplType::Bitmap;
722  if (shared::is_any<kCOUNT, kCOUNT_IF>(agg_info.agg_kind)) {
723  bitmap_sz_bits = get_bucketed_cardinality_without_nulls(arg_range_info);
724  if (bitmap_sz_bits <= 0 || g_bitmap_memory_limit <= bitmap_sz_bits) {
725  count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
726  }
727  // check a potential OOM when using bitmap-based approach
728  const auto total_bytes_per_entry =
729  compute_bytes_per_group(bitmap_sz_bits, sub_bitmap_count, device_type);
730  const auto range_bucket = std::max(group_by_range_info.bucket, (int64_t)1);
731  const auto maximum_num_groups =
732  (group_by_range_info.max - group_by_range_info.min + 1) / range_bucket;
733  const auto total_bitmap_bytes_for_groups =
734  total_bytes_per_entry * maximum_num_groups;
735  // we can estimate a potential OOM of bitmap-based count-distinct operator
736  // by using the logic "check_total_bitmap_memory"
737  if (total_bitmap_bytes_for_groups >=
738  static_cast<size_t>(g_bitmap_memory_limit)) {
739  const auto agg_expr_max_entry_count =
740  arg_range_info.max - arg_range_info.min + 1;
741  int64_t max_agg_expr_table_cardinality{1};
742  std::set<const Analyzer::ColumnVar*,
743  bool (*)(const Analyzer::ColumnVar*, const Analyzer::ColumnVar*)>
744  colvar_set(Analyzer::ColumnVar::colvar_comp);
745  agg_expr->collect_column_var(colvar_set, true);
746  for (const auto cv : colvar_set) {
747  auto it =
748  std::find_if(query_infos.begin(),
749  query_infos.end(),
750  [&](const auto& input_table_info) {
751  return input_table_info.table_key == cv->getTableKey();
752  });
753  int64_t cur_table_cardinality =
754  it != query_infos.end()
755  ? static_cast<int64_t>(it->info.getNumTuplesUpperBound())
756  : -1;
757  max_agg_expr_table_cardinality =
758  std::max(max_agg_expr_table_cardinality, cur_table_cardinality);
759  worst_case_num_groups *= cur_table_cardinality;
760  }
761  auto has_valid_stat = [agg_expr_max_entry_count, maximum_num_groups]() {
762  return agg_expr_max_entry_count > 0 && maximum_num_groups > 0;
763  };
764  // if we have valid stats regarding input expr, we can try to relax the OOM
765  if (has_valid_stat()) {
766  // a threshold related to a ratio of a range of agg expr (let's say R)
767  // and table cardinality (C), i.e., use unordered_set if the # bits to build
768  // a bitmap based on R is four times larger than that of C
769  const size_t unordered_set_threshold{2};
770  // When we detect OOM of bitmap-based approach we selectively switch it to
771  // hash set-based processing logic if one of the followings is satisfied:
772  // 1) the column range is too wide compared with the table cardinality, or
773  // 2) the column range is too wide compared with the avg of # unique values
774  // per group by entry
775  const auto bits_for_agg_entry = std::ceil(log(agg_expr_max_entry_count));
776  const auto bits_for_agg_table =
777  std::ceil(log(max_agg_expr_table_cardinality));
778  const auto avg_num_unique_entries_per_group =
779  std::ceil(max_agg_expr_table_cardinality / maximum_num_groups);
780  // case a) given a range of entry count of agg_expr and the maximum
781  // cardinality among source tables of the agg_expr , we try to detect the
782  // misleading case of too sparse column range , i.e., agg_expr has 1M column
783  // range but only has two tuples {1 and 1M} / case b) check whether
784  // using bitmap is really beneficial when considering uniform distribution
785  // of (unique) keys.
786  if ((bits_for_agg_entry - bits_for_agg_table) >= unordered_set_threshold ||
787  agg_expr_max_entry_count >= avg_num_unique_entries_per_group) {
788  count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
789  } else {
790  throw std::runtime_error(
791  "Consider using approx_count_distinct operator instead of "
792  "count_distinct operator to lower the memory "
793  "requirements");
794  }
795  }
796  }
797  }
798  }
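  // In short, with R = agg_expr_max_entry_count and C = max_agg_expr_table_cardinality,
  // the block above switches from a bitmap to an unordered set when
  //   ceil(log(R)) - ceil(log(C)) >= 2   (natural log, i.e. R is roughly e^2 ~ 7.4x
  //                                        larger than the table cardinality), or
  //   R >= ceil(C / maximum_num_groups)  (the range exceeds the average number of
  //                                        unique values per group-by entry);
  // otherwise it rejects the query and suggests approx_count_distinct instead.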
799  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
800  count_distinct_impl_type == CountDistinctImplType::UnorderedSet &&
801  !(arg_ti.is_array() || arg_ti.is_geometry())) {
802  count_distinct_impl_type = CountDistinctImplType::Bitmap;
803  }
804  const size_t too_many_entries{100000000};
805  if (g_enable_watchdog && !(arg_range_info.isEmpty()) &&
806  worst_case_num_groups > too_many_entries &&
807  count_distinct_impl_type == CountDistinctImplType::UnorderedSet) {
808  throw WatchdogException(
809  "Detect too many input entries for set-based count distinct operator under "
810  "the watchdog");
811  }
812  count_distinct_descriptors.emplace_back(
813  CountDistinctDescriptor{count_distinct_impl_type,
814  arg_range_info.min,
815  bitmap_sz_bits,
816  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
817  device_type,
818  sub_bitmap_count});
819  } else {
820  count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
821  CountDistinctImplType::Invalid, 0, 0, false, device_type, 0});
822  }
823  }
824  return count_distinct_descriptors;
825 }
826 
827 } // namespace
828 
829 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptor(
830  const bool allow_multifrag,
831  const size_t max_groups_buffer_entry_count,
832  const int8_t crt_min_byte_width,
833  RenderInfo* render_info,
834  const bool output_columnar_hint) {
835  const auto shard_count = device_type_ == ExecutorDeviceType::GPU
837  : 0;
838  bool sort_on_gpu_hint =
839  device_type_ == ExecutorDeviceType::GPU && allow_multifrag &&
842  // must_use_baseline_sort is true iff we'd sort on GPU with the old algorithm
843  // but the total output buffer size would be too big or it's a sharded top query.
844  // For the sake of managing risk, use the new result set way very selectively for
845  // this case only (alongside the baseline layout we've enabled for a while now).
846  bool must_use_baseline_sort = shard_count;
847  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
848  while (true) {
849  query_mem_desc = initQueryMemoryDescriptorImpl(allow_multifrag,
850  max_groups_buffer_entry_count,
851  crt_min_byte_width,
852  sort_on_gpu_hint,
853  render_info,
854  must_use_baseline_sort,
855  output_columnar_hint);
856  CHECK(query_mem_desc);
857  if (query_mem_desc->sortOnGpu() &&
858  (query_mem_desc->getBufferSizeBytes(device_type_) +
859  align_to_int64(query_mem_desc->getEntryCount() * sizeof(int32_t))) >
860  2 * 1024 * 1024 * 1024LL) {
861  must_use_baseline_sort = true;
862  sort_on_gpu_hint = false;
863  } else {
864  break;
865  }
866  }
867  return query_mem_desc;
868 }
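// Worked example of the retry loop above: a GPU-sorted query with 100M entries and a
// 24-byte row needs roughly 100'000'000 * 24 + align_to_int64(100'000'000 * 4) bytes,
// about 2.8 GB, which exceeds the 2 GB (2 * 1024 * 1024 * 1024) limit, so the
// descriptor is rebuilt with sort_on_gpu_hint = false and must_use_baseline_sort = true.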
869 
870 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptorImpl(
871  const bool allow_multifrag,
872  const size_t max_groups_buffer_entry_count,
873  const int8_t crt_min_byte_width,
874  const bool sort_on_gpu_hint,
875  RenderInfo* render_info,
876  const bool must_use_baseline_sort,
877  const bool output_columnar_hint) {
878  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
879 
880  auto col_range_info_nosharding = getColRangeInfo();
881 
882  const auto shard_count = device_type_ == ExecutorDeviceType::GPU
884  : 0;
885 
886  const auto col_range_info =
887  ColRangeInfo{col_range_info_nosharding.hash_type_,
888  col_range_info_nosharding.min,
889  col_range_info_nosharding.max,
890  getShardedTopBucket(col_range_info_nosharding, shard_count),
891  col_range_info_nosharding.has_nulls};
892 
893  // Non-grouped aggregates do not support accessing aggregated ranges
894  // Keyless hash is currently only supported with single-column perfect hash
895  const auto keyless_info =
896  !(is_group_by &&
897  col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash)
898  ? KeylessInfo{false, -1}
899  : get_keyless_info(ra_exe_unit_, query_infos_, is_group_by, executor_);
900 
901  if (g_enable_watchdog &&
902  ((col_range_info.hash_type_ == QueryDescriptionType::GroupByBaselineHash &&
903  max_groups_buffer_entry_count > g_watchdog_baseline_max_groups) ||
904  (col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
905  ra_exe_unit_.groupby_exprs.size() == 1 &&
906  (col_range_info.max - col_range_info.min) /
907  std::max(col_range_info.bucket, int64_t(1)) >
908  130000000))) {
909  throw WatchdogException("Query would use too much memory");
910  }
911 
912  const auto count_distinct_descriptors = init_count_distinct_descriptors(
913  ra_exe_unit_, query_infos_, col_range_info, device_type_, executor_);
914  try {
915  return QueryMemoryDescriptor::init(executor_,
916  ra_exe_unit_,
917  query_infos_,
918  col_range_info,
919  keyless_info,
920  allow_multifrag,
921  device_type_,
922  crt_min_byte_width,
923  sort_on_gpu_hint,
924  shard_count,
925  max_groups_buffer_entry_count,
926  render_info,
927  count_distinct_descriptors,
928  must_use_baseline_sort,
929  output_columnar_hint,
930  /*streaming_top_n_hint=*/true);
931  } catch (const StreamingTopNOOM& e) {
932  LOG(WARNING) << e.what() << " Disabling Streaming Top N.";
933  return QueryMemoryDescriptor::init(executor_,
934  ra_exe_unit_,
935  query_infos_,
936  col_range_info,
937  keyless_info,
938  allow_multifrag,
939  device_type_,
940  crt_min_byte_width,
941  sort_on_gpu_hint,
942  shard_count,
943  max_groups_buffer_entry_count,
944  render_info,
945  count_distinct_descriptors,
946  must_use_baseline_sort,
947  output_columnar_hint,
948  /*streaming_top_n_hint=*/false);
949  }
950 }
951 
952 bool GroupByAndAggregate::gpuCanHandleOrderEntries(
953  const std::list<Analyzer::OrderEntry>& order_entries) {
954  if (order_entries.size() > 1) { // TODO(alex): lift this restriction
955  return false;
956  }
957  for (const auto& order_entry : order_entries) {
958  CHECK_GE(order_entry.tle_no, 1);
959  CHECK_LE(static_cast<size_t>(order_entry.tle_no), ra_exe_unit_.target_exprs.size());
960  const auto target_expr = ra_exe_unit_.target_exprs[order_entry.tle_no - 1];
961  if (!dynamic_cast<Analyzer::AggExpr*>(target_expr)) {
962  return false;
963  }
964  // TODO(alex): relax the restrictions
965  auto agg_expr = static_cast<Analyzer::AggExpr*>(target_expr);
966  if (agg_expr->get_is_distinct() || agg_expr->get_aggtype() == kAVG ||
967  agg_expr->get_aggtype() == kMIN || agg_expr->get_aggtype() == kMAX ||
968  agg_expr->get_aggtype() == kAPPROX_COUNT_DISTINCT) {
969  return false;
970  }
971  if (agg_expr->get_arg()) {
972  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
973  if (arg_ti.is_fp()) {
974  return false;
975  }
976  auto expr_range_info =
977  get_expr_range_info(ra_exe_unit_, query_infos_, agg_expr->get_arg(), executor_);
978  // TODO(adb): QMD not actually initialized here?
979  if ((!(expr_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
980  /* query_mem_desc.getGroupbyColCount() == 1 */ false) ||
981  expr_range_info.has_nulls) &&
982  order_entry.is_desc == order_entry.nulls_first) {
983  return false;
984  }
985  }
986  const auto& target_ti = target_expr->get_type_info();
987  CHECK(!target_ti.is_buffer());
988  if (!target_ti.is_integer()) {
989  return false;
990  }
991  }
992  return true;
993 }
994 
995 bool GroupByAndAggregate::codegen(llvm::Value* filter_result,
996  llvm::BasicBlock* sc_false,
997  QueryMemoryDescriptor& query_mem_desc,
998  const CompilationOptions& co,
999  const GpuSharedMemoryContext& gpu_smem_context) {
1000  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1001  CHECK(filter_result);
1002 
1003  bool can_return_error = false;
1004  llvm::BasicBlock* filter_false{nullptr};
1005 
1006  {
1007  const bool is_group_by = !ra_exe_unit_.groupby_exprs.empty();
1008 
1009  if (executor_->isArchMaxwell(co.device_type)) {
1010  prependForceSync();
1011  }
1012  DiamondCodegen filter_cfg(filter_result,
1013  executor_,
1014  !is_group_by || query_mem_desc.usesGetGroupValueFast(),
1015  "filter", // filter_true and filter_false basic blocks
1016  nullptr,
1017  false);
1018  filter_false = filter_cfg.cond_false_;
1019 
1020  if (is_group_by) {
1022  !query_mem_desc.useStreamingTopN()) {
1023  const auto crt_matched = get_arg_by_name(ROW_FUNC, "crt_matched");
1024  LL_BUILDER.CreateStore(LL_INT(int32_t(1)), crt_matched);
1025  auto total_matched_ptr = get_arg_by_name(ROW_FUNC, "total_matched");
1026  llvm::Value* old_total_matched_val{nullptr};
1027  if (query_mem_desc.threadsShareMemory()) {
1028  old_total_matched_val =
1029  LL_BUILDER.CreateAtomicRMW(llvm::AtomicRMWInst::Add,
1030  total_matched_ptr,
1031  LL_INT(int32_t(1)),
1032 #if LLVM_VERSION_MAJOR > 12
1033  LLVM_ALIGN(8),
1034 #endif
1035  llvm::AtomicOrdering::Monotonic);
1036  } else {
1037  old_total_matched_val = LL_BUILDER.CreateLoad(
1038  total_matched_ptr->getType()->getPointerElementType(), total_matched_ptr);
1039  LL_BUILDER.CreateStore(
1040  LL_BUILDER.CreateAdd(old_total_matched_val, LL_INT(int32_t(1))),
1041  total_matched_ptr);
1042  }
1043  auto old_total_matched_ptr = get_arg_by_name(ROW_FUNC, "old_total_matched");
1044  LL_BUILDER.CreateStore(old_total_matched_val, old_total_matched_ptr);
1045  }
1046 
1047  auto agg_out_ptr_w_idx = codegenGroupBy(query_mem_desc, co, filter_cfg);
1048  auto varlen_output_buffer = codegenVarlenOutputBuffer(query_mem_desc);
1049  if (query_mem_desc.usesGetGroupValueFast() ||
1050  query_mem_desc.getQueryDescriptionType() ==
1051  QueryDescriptionType::GroupByPerfectHash) {
1052  if (query_mem_desc.getGroupbyColCount() > 1) {
1053  filter_cfg.setChainToNext();
1054  }
1055  // Don't generate null checks if the group slot is guaranteed to be non-null,
1056  // as is the case for the get_group_value_fast* family.
1057  can_return_error = codegenAggCalls(agg_out_ptr_w_idx,
1058  varlen_output_buffer,
1059  {},
1060  query_mem_desc,
1061  co,
1062  gpu_smem_context,
1063  filter_cfg);
1064  } else {
1065  {
1066  llvm::Value* nullcheck_cond{nullptr};
1067  if (query_mem_desc.didOutputColumnar()) {
1068  nullcheck_cond = LL_BUILDER.CreateICmpSGE(std::get<1>(agg_out_ptr_w_idx),
1069  LL_INT(int32_t(0)));
1070  } else {
1071  nullcheck_cond = LL_BUILDER.CreateICmpNE(
1072  std::get<0>(agg_out_ptr_w_idx),
1073  llvm::ConstantPointerNull::get(
1074  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)));
1075  }
1076  DiamondCodegen nullcheck_cfg(
1077  nullcheck_cond, executor_, false, "groupby_nullcheck", &filter_cfg, false);
1078  codegenAggCalls(agg_out_ptr_w_idx,
1079  varlen_output_buffer,
1080  {},
1081  query_mem_desc,
1082  co,
1083  gpu_smem_context,
1084  filter_cfg);
1085  }
1086  can_return_error = true;
1087  if (query_mem_desc.getQueryDescriptionType() ==
1088  QueryDescriptionType::Projection &&
1089  query_mem_desc.useStreamingTopN()) {
1090  // Ignore rejection on pushing current row to top-K heap.
1091  LL_BUILDER.CreateRet(LL_INT(int32_t(0)));
1092  } else {
1093  CodeGenerator code_generator(executor_);
1094  LL_BUILDER.CreateRet(LL_BUILDER.CreateNeg(LL_BUILDER.CreateTrunc(
1095  // TODO(alex): remove the trunc once pos is converted to 32 bits
1096  code_generator.posArg(nullptr),
1097  get_int_type(32, LL_CONTEXT))));
1098  }
1099  }
1100  } else {
1101  if (ra_exe_unit_.estimator) {
1102  std::stack<llvm::BasicBlock*> array_loops;
1103  codegenEstimator(array_loops, filter_cfg, query_mem_desc, co);
1104  } else {
1105  auto arg_it = ROW_FUNC->arg_begin();
1106  std::vector<llvm::Value*> agg_out_vec;
1107  for (int32_t i = 0; i < get_agg_count(ra_exe_unit_.target_exprs); ++i) {
1108  agg_out_vec.push_back(&*arg_it++);
1109  }
1110  can_return_error = codegenAggCalls(std::make_tuple(nullptr, nullptr),
1111  /*varlen_output_buffer=*/nullptr,
1112  agg_out_vec,
1113  query_mem_desc,
1114  co,
1115  gpu_smem_context,
1116  filter_cfg);
1117  }
1118  }
1119  }
1120 
1121  if (ra_exe_unit_.join_quals.empty()) {
1122  executor_->cgen_state_->ir_builder_.CreateRet(LL_INT(int32_t(0)));
1123  } else if (sc_false) {
1124  const auto saved_insert_block = LL_BUILDER.GetInsertBlock();
1125  LL_BUILDER.SetInsertPoint(sc_false);
1126  LL_BUILDER.CreateBr(filter_false);
1127  LL_BUILDER.SetInsertPoint(saved_insert_block);
1128  }
1129 
1130  return can_return_error;
1131 }
1132 
1133 llvm::Value* GroupByAndAggregate::codegenOutputSlot(
1134  llvm::Value* groups_buffer,
1135  const QueryMemoryDescriptor& query_mem_desc,
1136  const CompilationOptions& co,
1137  DiamondCodegen& diamond_codegen) {
1138  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1140  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1141  const auto group_expr = ra_exe_unit_.groupby_exprs.front();
1142  CHECK(!group_expr);
1143  if (!query_mem_desc.didOutputColumnar()) {
1144  CHECK_EQ(size_t(0), query_mem_desc.getRowSize() % sizeof(int64_t));
1145  }
1146  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1147  ? 0
1148  : query_mem_desc.getRowSize() / sizeof(int64_t);
1149  CodeGenerator code_generator(executor_);
1150  if (query_mem_desc.useStreamingTopN()) {
1151  const auto& only_order_entry = ra_exe_unit_.sort_info.order_entries.front();
1152  CHECK_GE(only_order_entry.tle_no, int(1));
1153  const size_t target_idx = only_order_entry.tle_no - 1;
1154  CHECK_LT(target_idx, ra_exe_unit_.target_exprs.size());
1155  const auto order_entry_expr = ra_exe_unit_.target_exprs[target_idx];
1156  const auto chosen_bytes =
1157  static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(target_idx));
1158  auto order_entry_lv = executor_->cgen_state_->castToTypeIn(
1159  code_generator.codegen(order_entry_expr, true, co).front(), chosen_bytes * 8);
1161  std::string fname = "get_bin_from_k_heap";
1162  const auto& oe_ti = order_entry_expr->get_type_info();
1163  llvm::Value* null_key_lv = nullptr;
1164  if (oe_ti.is_integer() || oe_ti.is_decimal() || oe_ti.is_time()) {
1165  const size_t bit_width = order_entry_lv->getType()->getIntegerBitWidth();
1166  switch (bit_width) {
1167  case 32:
1168  null_key_lv = LL_INT(static_cast<int32_t>(inline_int_null_val(oe_ti)));
1169  break;
1170  case 64:
1171  null_key_lv = LL_INT(static_cast<int64_t>(inline_int_null_val(oe_ti)));
1172  break;
1173  default:
1174  CHECK(false);
1175  }
1176  fname += "_int" + std::to_string(bit_width) + "_t";
1177  } else {
1178  CHECK(oe_ti.is_fp());
1179  if (order_entry_lv->getType()->isDoubleTy()) {
1180  null_key_lv = LL_FP(static_cast<double>(inline_fp_null_val(oe_ti)));
1181  } else {
1182  null_key_lv = LL_FP(static_cast<float>(inline_fp_null_val(oe_ti)));
1183  }
1184  fname += order_entry_lv->getType()->isDoubleTy() ? "_double" : "_float";
1185  }
1186  const auto key_slot_idx =
1188  return emitCall(
1189  fname,
1190  {groups_buffer,
1191  LL_INT(n),
1192  LL_INT(row_size_quad),
1193  LL_INT(static_cast<uint32_t>(query_mem_desc.getColOffInBytes(key_slot_idx))),
1194  LL_BOOL(only_order_entry.is_desc),
1195  LL_BOOL(!order_entry_expr->get_type_info().get_notnull()),
1196  LL_BOOL(only_order_entry.nulls_first),
1197  null_key_lv,
1198  order_entry_lv});
1199  } else {
1200  auto* arg = get_arg_by_name(ROW_FUNC, "max_matched");
1201  const auto output_buffer_entry_count_lv =
1202  LL_BUILDER.CreateLoad(arg->getType()->getPointerElementType(), arg);
1203  arg = get_arg_by_name(ROW_FUNC, "old_total_matched");
1204  const auto group_expr_lv =
1205  LL_BUILDER.CreateLoad(arg->getType()->getPointerElementType(), arg);
1206  std::vector<llvm::Value*> args{groups_buffer,
1207  output_buffer_entry_count_lv,
1208  group_expr_lv,
1209  code_generator.posArg(nullptr)};
1210  if (query_mem_desc.didOutputColumnar()) {
1211  const auto columnar_output_offset =
1212  emitCall("get_columnar_scan_output_offset", args);
1213  return columnar_output_offset;
1214  }
1215  args.push_back(LL_INT(row_size_quad));
1216  return emitCall("get_scan_output_slot", args);
1217  }
1218 }
1219 
1220 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenGroupBy(
1221  const QueryMemoryDescriptor& query_mem_desc,
1222  const CompilationOptions& co,
1223  DiamondCodegen& diamond_codegen) {
1224  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1225  auto arg_it = ROW_FUNC->arg_begin();
1226  auto groups_buffer = arg_it++;
1227 
1228  std::stack<llvm::BasicBlock*> array_loops;
1229 
1230  // TODO(Saman): move this logic outside of this function.
1231  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1232  if (query_mem_desc.didOutputColumnar()) {
1233  return std::make_tuple(
1234  &*groups_buffer,
1235  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen));
1236  } else {
1237  return std::make_tuple(
1238  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen),
1239  nullptr);
1240  }
1241  }
1242 
1243  CHECK(query_mem_desc.getQueryDescriptionType() ==
1244  QueryDescriptionType::GroupByBaselineHash ||
1245  query_mem_desc.getQueryDescriptionType() ==
1246  QueryDescriptionType::GroupByPerfectHash);
1247 
1248  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1249  ? 0
1250  : query_mem_desc.getRowSize() / sizeof(int64_t);
1251 
1252  const auto col_width_size = query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1253  ? sizeof(int64_t)
1254  : query_mem_desc.getEffectiveKeyWidth();
1255  // for multi-column group by
1256  llvm::Value* group_key = nullptr;
1257  llvm::Value* key_size_lv = nullptr;
1258 
1259  if (!query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1260  key_size_lv = LL_INT(static_cast<int32_t>(query_mem_desc.getGroupbyColCount()));
1261  if (query_mem_desc.getQueryDescriptionType() ==
1262  QueryDescriptionType::GroupByPerfectHash) {
1263  group_key =
1264  LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1265  } else if (query_mem_desc.getQueryDescriptionType() ==
1266  QueryDescriptionType::GroupByBaselineHash) {
1267  group_key =
1268  col_width_size == sizeof(int32_t)
1269  ? LL_BUILDER.CreateAlloca(llvm::Type::getInt32Ty(LL_CONTEXT), key_size_lv)
1270  : LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1271  }
1272  CHECK(group_key);
1273  CHECK(key_size_lv);
1274  }
1275 
1276  int32_t subkey_idx = 0;
1277  CHECK(query_mem_desc.getGroupbyColCount() == ra_exe_unit_.groupby_exprs.size());
1278  for (const auto& group_expr : ra_exe_unit_.groupby_exprs) {
1279  const auto col_range_info =
1280  get_expr_range_info(ra_exe_unit_, query_infos_, group_expr.get(), executor_);
1281  const auto translated_null_value = static_cast<int64_t>(
1282  query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1283  ? checked_int64_t(query_mem_desc.getMaxVal()) +
1284  (query_mem_desc.getBucket() ? query_mem_desc.getBucket() : 1)
1285  : checked_int64_t(col_range_info.max) +
1286  (col_range_info.bucket ? col_range_info.bucket : 1));
1287 
1288  const bool col_has_nulls =
1289  query_mem_desc.getQueryDescriptionType() ==
1290  QueryDescriptionType::GroupByPerfectHash
1291  ? (query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1292  ? query_mem_desc.hasNulls()
1293  : col_range_info.has_nulls)
1294  : false;
1295 
1296  const auto group_expr_lvs =
1297  executor_->groupByColumnCodegen(group_expr.get(),
1298  col_width_size,
1299  co,
1300  col_has_nulls,
1301  translated_null_value,
1302  diamond_codegen,
1303  array_loops,
1304  query_mem_desc.threadsShareMemory());
1305  const auto group_expr_lv = group_expr_lvs.translated_value;
1306  if (query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1307  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1308  return codegenSingleColumnPerfectHash(query_mem_desc,
1309  co,
1310  &*groups_buffer,
1311  group_expr_lv,
1312  group_expr_lvs.original_value,
1313  row_size_quad);
1314  } else {
1315  // store the sub-key to the buffer
1316  LL_BUILDER.CreateStore(
1317  group_expr_lv,
1318  LL_BUILDER.CreateGEP(
1319  group_key->getType()->getScalarType()->getPointerElementType(),
1320  group_key,
1321  LL_INT(subkey_idx++)));
1322  }
1323  }
1324  if (query_mem_desc.getQueryDescriptionType() ==
1325  QueryDescriptionType::GroupByPerfectHash) {
1326  CHECK(ra_exe_unit_.groupby_exprs.size() != 1);
1327  return codegenMultiColumnPerfectHash(
1328  &*groups_buffer, group_key, key_size_lv, query_mem_desc, row_size_quad);
1329  } else if (query_mem_desc.getQueryDescriptionType() ==
1330  QueryDescriptionType::GroupByBaselineHash) {
1331  return codegenMultiColumnBaselineHash(co,
1332  &*groups_buffer,
1333  group_key,
1334  key_size_lv,
1335  query_mem_desc,
1336  col_width_size,
1337  row_size_quad);
1338  }
1339  CHECK(false);
1340  return std::make_tuple(nullptr, nullptr);
1341 }
1342 
1343 llvm::Value* GroupByAndAggregate::codegenVarlenOutputBuffer(
1344  const QueryMemoryDescriptor& query_mem_desc) {
1345  if (!query_mem_desc.hasVarlenOutput()) {
1346  return nullptr;
1347  }
1348 
1349  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1350  auto arg_it = ROW_FUNC->arg_begin();
1351  arg_it++; /* groups_buffer */
1352  auto varlen_output_buffer = arg_it++;
1353  CHECK(varlen_output_buffer->getType() == llvm::Type::getInt64PtrTy(LL_CONTEXT));
1354  return varlen_output_buffer;
1355 }
1356 
1357 std::tuple<llvm::Value*, llvm::Value*>
1358 GroupByAndAggregate::codegenSingleColumnPerfectHash(
1359  const QueryMemoryDescriptor& query_mem_desc,
1360  const CompilationOptions& co,
1361  llvm::Value* groups_buffer,
1362  llvm::Value* group_expr_lv_translated,
1363  llvm::Value* group_expr_lv_original,
1364  const int32_t row_size_quad) {
1365  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1366  CHECK(query_mem_desc.usesGetGroupValueFast());
1367  std::string get_group_fn_name{query_mem_desc.didOutputColumnar()
1368  ? "get_columnar_group_bin_offset"
1369  : "get_group_value_fast"};
1370  if (!query_mem_desc.didOutputColumnar() && query_mem_desc.hasKeylessHash()) {
1371  get_group_fn_name += "_keyless";
1372  }
1373  if (query_mem_desc.interleavedBins(co.device_type)) {
1374  CHECK(!query_mem_desc.didOutputColumnar());
1375  CHECK(query_mem_desc.hasKeylessHash());
1376  get_group_fn_name += "_semiprivate";
1377  }
1378  std::vector<llvm::Value*> get_group_fn_args{&*groups_buffer,
1379  &*group_expr_lv_translated};
1380  if (group_expr_lv_original && get_group_fn_name == "get_group_value_fast" &&
1381  query_mem_desc.mustUseBaselineSort()) {
1382  get_group_fn_name += "_with_original_key";
1383  get_group_fn_args.push_back(group_expr_lv_original);
1384  }
1385  get_group_fn_args.push_back(LL_INT(query_mem_desc.getMinVal()));
1386  get_group_fn_args.push_back(LL_INT(query_mem_desc.getBucket()));
1387  if (!query_mem_desc.hasKeylessHash()) {
1388  if (!query_mem_desc.didOutputColumnar()) {
1389  get_group_fn_args.push_back(LL_INT(row_size_quad));
1390  }
1391  } else {
1392  if (!query_mem_desc.didOutputColumnar()) {
1393  get_group_fn_args.push_back(LL_INT(row_size_quad));
1394  }
1395  if (query_mem_desc.interleavedBins(co.device_type)) {
1396  auto warp_idx = emitCall("thread_warp_idx", {LL_INT(executor_->warpSize())});
1397  get_group_fn_args.push_back(warp_idx);
1398  get_group_fn_args.push_back(LL_INT(executor_->warpSize()));
1399  }
1400  }
1401  if (get_group_fn_name == "get_columnar_group_bin_offset") {
1402  return std::make_tuple(&*groups_buffer,
1403  emitCall(get_group_fn_name, get_group_fn_args));
1404  }
1405  return std::make_tuple(emitCall(get_group_fn_name, get_group_fn_args), nullptr);
1406 }
1407 
1408 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenMultiColumnPerfectHash(
1409  llvm::Value* groups_buffer,
1410  llvm::Value* group_key,
1411  llvm::Value* key_size_lv,
1412  const QueryMemoryDescriptor& query_mem_desc,
1413  const int32_t row_size_quad) {
1414  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1415  CHECK(query_mem_desc.getQueryDescriptionType() ==
1416  QueryDescriptionType::GroupByPerfectHash);
1417  // compute the index (perfect hash)
1418  auto perfect_hash_func = codegenPerfectHashFunction();
1419  auto hash_lv =
1420  LL_BUILDER.CreateCall(perfect_hash_func, std::vector<llvm::Value*>{group_key});
1421 
1422  if (query_mem_desc.didOutputColumnar()) {
1423  if (!query_mem_desc.hasKeylessHash()) {
1424  const std::string set_matching_func_name{
1425  "set_matching_group_value_perfect_hash_columnar"};
1426  const std::vector<llvm::Value*> set_matching_func_arg{
1427  groups_buffer,
1428  hash_lv,
1429  group_key,
1430  key_size_lv,
1431  llvm::ConstantInt::get(get_int_type(32, LL_CONTEXT),
1432  query_mem_desc.getEntryCount())};
1433  emitCall(set_matching_func_name, set_matching_func_arg);
1434  }
1435  return std::make_tuple(groups_buffer, hash_lv);
1436  } else {
1437  if (query_mem_desc.hasKeylessHash()) {
1438  return std::make_tuple(emitCall("get_matching_group_value_perfect_hash_keyless",
1439  {groups_buffer, hash_lv, LL_INT(row_size_quad)}),
1440  nullptr);
1441  } else {
1442  return std::make_tuple(
1443  emitCall(
1444  "get_matching_group_value_perfect_hash",
1445  {groups_buffer, hash_lv, group_key, key_size_lv, LL_INT(row_size_quad)}),
1446  nullptr);
1447  }
1448  }
1449 }
1450 
1451 std::tuple<llvm::Value*, llvm::Value*>
1452 GroupByAndAggregate::codegenMultiColumnBaselineHash(
1453  const CompilationOptions& co,
1454  llvm::Value* groups_buffer,
1455  llvm::Value* group_key,
1456  llvm::Value* key_size_lv,
1457  const QueryMemoryDescriptor& query_mem_desc,
1458  const size_t key_width,
1459  const int32_t row_size_quad) {
1460  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1461  if (group_key->getType() != llvm::Type::getInt64PtrTy(LL_CONTEXT)) {
1462  CHECK(key_width == sizeof(int32_t));
1463  group_key =
1464  LL_BUILDER.CreatePointerCast(group_key, llvm::Type::getInt64PtrTy(LL_CONTEXT));
1465  }
1466  std::vector<llvm::Value*> func_args{
1467  groups_buffer,
1468  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1469  &*group_key,
1470  &*key_size_lv,
1471  LL_INT(static_cast<int32_t>(key_width))};
1472  std::string func_name{"get_group_value"};
1473  if (query_mem_desc.didOutputColumnar()) {
1474  func_name += "_columnar_slot";
1475  } else {
1476  func_args.push_back(LL_INT(row_size_quad));
1477  }
1478  if (co.with_dynamic_watchdog) {
1479  func_name += "_with_watchdog";
1480  }
1481  if (query_mem_desc.didOutputColumnar()) {
1482  return std::make_tuple(groups_buffer, emitCall(func_name, func_args));
1483  } else {
1484  return std::make_tuple(emitCall(func_name, func_args), nullptr);
1485  }
1486 }
1487 
1488 llvm::Function* GroupByAndAggregate::codegenPerfectHashFunction() {
1489  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1490  CHECK_GT(ra_exe_unit_.groupby_exprs.size(), size_t(1));
1491  auto ft = llvm::FunctionType::get(
1492  get_int_type(32, LL_CONTEXT),
1493  std::vector<llvm::Type*>{llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)},
1494  false);
1495  auto key_hash_func = llvm::Function::Create(ft,
1496  llvm::Function::ExternalLinkage,
1497  "perfect_key_hash",
1498  executor_->cgen_state_->module_);
1499  executor_->cgen_state_->helper_functions_.push_back(key_hash_func);
1500  mark_function_always_inline(key_hash_func);
1501  auto& key_buff_arg = *key_hash_func->args().begin();
1502  llvm::Value* key_buff_lv = &key_buff_arg;
1503  auto bb = llvm::BasicBlock::Create(LL_CONTEXT, "entry", key_hash_func);
1504  llvm::IRBuilder<> key_hash_func_builder(bb);
1505  llvm::Value* hash_lv{llvm::ConstantInt::get(get_int_type(64, LL_CONTEXT), 0)};
1506  std::vector<int64_t> cardinalities;
1507  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1508  auto col_range_info =
1509  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1510  CHECK(col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash);
1511  cardinalities.push_back(getBucketedCardinality(col_range_info));
1512  }
1513  size_t dim_idx = 0;
1514  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1515  auto* gep = key_hash_func_builder.CreateGEP(
1516  key_buff_lv->getType()->getScalarType()->getPointerElementType(),
1517  key_buff_lv,
1518  LL_INT(dim_idx));
1519  auto key_comp_lv =
1520  key_hash_func_builder.CreateLoad(gep->getType()->getPointerElementType(), gep);
1521  auto col_range_info =
1522  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1523  auto crt_term_lv =
1524  key_hash_func_builder.CreateSub(key_comp_lv, LL_INT(col_range_info.min));
1525  if (col_range_info.bucket) {
1526  crt_term_lv =
1527  key_hash_func_builder.CreateSDiv(crt_term_lv, LL_INT(col_range_info.bucket));
1528  }
1529  for (size_t prev_dim_idx = 0; prev_dim_idx < dim_idx; ++prev_dim_idx) {
1530  crt_term_lv = key_hash_func_builder.CreateMul(crt_term_lv,
1531  LL_INT(cardinalities[prev_dim_idx]));
1532  }
1533  hash_lv = key_hash_func_builder.CreateAdd(hash_lv, crt_term_lv);
1534  ++dim_idx;
1535  }
1536  key_hash_func_builder.CreateRet(
1537  key_hash_func_builder.CreateTrunc(hash_lv, get_int_type(32, LL_CONTEXT)));
1538  return key_hash_func;
1539 }
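// Sketch of the hash the generated perfect_key_hash computes: the group-by key is
// treated as a mixed-radix number. For two key columns with bucketed cardinalities
// card0 and card1, the emitted IR evaluates
//
//   hash = (key[0] - min0) / bucket0
//        + ((key[1] - min1) / bucket1) * card0
//
// i.e. each dimension is normalized to [0, card_i) and scaled by the product of the
// cardinalities of all preceding dimensions; the sum is then truncated to int32.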
1540 
1541 llvm::Value* GroupByAndAggregate::convertNullIfAny(const SQLTypeInfo& arg_type,
1542  const TargetInfo& agg_info,
1543  llvm::Value* target) {
1544  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1545  const auto& agg_type = agg_info.sql_type;
1546  const size_t chosen_bytes = agg_type.get_size();
1547 
1548  bool need_conversion{false};
1549  llvm::Value* arg_null{nullptr};
1550  llvm::Value* agg_null{nullptr};
1551  llvm::Value* target_to_cast{target};
1552  if (arg_type.is_fp()) {
1553  arg_null = executor_->cgen_state_->inlineFpNull(arg_type);
1554  if (agg_type.is_fp()) {
1555  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1556  if (!static_cast<llvm::ConstantFP*>(arg_null)->isExactlyValue(
1557  static_cast<llvm::ConstantFP*>(agg_null)->getValueAPF())) {
1558  need_conversion = true;
1559  }
1560  } else {
1561  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
1562  return target;
1563  }
1564  } else {
1565  arg_null = executor_->cgen_state_->inlineIntNull(arg_type);
1566  if (agg_type.is_fp()) {
1567  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1568  need_conversion = true;
1569  target_to_cast = executor_->castToFP(target, arg_type, agg_type);
1570  } else {
1571  agg_null = executor_->cgen_state_->inlineIntNull(agg_type);
1572  if ((static_cast<llvm::ConstantInt*>(arg_null)->getBitWidth() !=
1573  static_cast<llvm::ConstantInt*>(agg_null)->getBitWidth()) ||
1574  (static_cast<llvm::ConstantInt*>(arg_null)->getValue() !=
1575  static_cast<llvm::ConstantInt*>(agg_null)->getValue())) {
1576  need_conversion = true;
1577  }
1578  }
1579  }
1580  if (need_conversion) {
1581  auto cmp = arg_type.is_fp() ? LL_BUILDER.CreateFCmpOEQ(target, arg_null)
1582  : LL_BUILDER.CreateICmpEQ(target, arg_null);
1583  return LL_BUILDER.CreateSelect(
1584  cmp,
1585  agg_null,
1586  executor_->cgen_state_->castToTypeIn(target_to_cast, chosen_bytes << 3));
1587  } else {
1588  return target;
1589  }
1590 }
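// convertNullIfAny (above) rewrites the argument's NULL sentinel so it matches
// the aggregate slot; a select is only emitted when the two sentinels differ,
// e.g. a nullable INT argument (NULL == INT32_MIN) feeding a 64-bit slot
// (NULL == INT64_MIN): NULLs are mapped to the slot's sentinel and every other
// value is cast to the slot width (chosen_bytes * 8 bits).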
1591 
1592 llvm::Value* GroupByAndAggregate::codegenWindowRowPointer(
1593  const Analyzer::WindowFunction* window_func,
1594  const QueryMemoryDescriptor& query_mem_desc,
1595  const CompilationOptions& co,
1596  DiamondCodegen& diamond_codegen) {
1597  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1598  const auto window_func_context =
1599  WindowProjectNodeContext::getActiveWindowFunctionContext(executor_);
1600  if (window_func_context && window_function_is_aggregate(window_func->getKind())) {
1601  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1602  ? 0
1603  : query_mem_desc.getRowSize() / sizeof(int64_t);
1604  auto arg_it = ROW_FUNC->arg_begin();
1605  auto groups_buffer = arg_it++;
1606  CodeGenerator code_generator(executor_);
1607  auto window_pos_lv = code_generator.codegenWindowPosition(
1608  window_func_context, code_generator.posArg(nullptr));
1609  const auto pos_in_window =
1610  LL_BUILDER.CreateTrunc(window_pos_lv, get_int_type(32, LL_CONTEXT));
1611  llvm::Value* entry_count_lv =
1612  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount()));
1613  std::vector<llvm::Value*> args{
1614  &*groups_buffer, entry_count_lv, pos_in_window, code_generator.posArg(nullptr)};
1615  if (query_mem_desc.didOutputColumnar()) {
1616  const auto columnar_output_offset =
1617  emitCall("get_columnar_scan_output_offset", args);
1618  return LL_BUILDER.CreateSExt(columnar_output_offset, get_int_type(64, LL_CONTEXT));
1619  }
1620  args.push_back(LL_INT(row_size_quad));
1621  return emitCall("get_scan_output_slot", args);
1622  }
1623  auto arg_it = ROW_FUNC->arg_begin();
1624  auto groups_buffer = arg_it++;
1625  return codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen);
1626 }
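// For aggregate window functions the output row is addressed by the window
// position rather than by hashing the group-by key: the code above passes the
// truncated window position as the entry index to get_scan_output_slot (or
// get_columnar_scan_output_offset for columnar output); non-aggregate window
// functions, or a missing window context, fall back to codegenOutputSlot().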
1627 
1628 bool GroupByAndAggregate::codegenAggCalls(
1629  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx_in,
1630  llvm::Value* varlen_output_buffer,
1631  const std::vector<llvm::Value*>& agg_out_vec,
1632  QueryMemoryDescriptor& query_mem_desc,
1633  const CompilationOptions& co,
1634  const GpuSharedMemoryContext& gpu_smem_context,
1635  DiamondCodegen& diamond_codegen) {
1636  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1637  auto agg_out_ptr_w_idx = agg_out_ptr_w_idx_in;
1638  // TODO(alex): unify the two cases, the output for non-group by queries
1639  // should be a contiguous buffer
1640  const bool is_group_by = std::get<0>(agg_out_ptr_w_idx);
1641  bool can_return_error = false;
1642  if (is_group_by) {
1643  CHECK(agg_out_vec.empty());
1644  } else {
1645  CHECK(!agg_out_vec.empty());
1646  }
1647 
1648  // the output buffer is cast to a byte stream so it can handle data elements of
1649  // different sizes (only used when actual column widths are used)
1650  llvm::Value* output_buffer_byte_stream{nullptr};
1651  llvm::Value* out_row_idx{nullptr};
1652  if (query_mem_desc.didOutputColumnar() && !g_cluster &&
1653  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1654  output_buffer_byte_stream = LL_BUILDER.CreateBitCast(
1655  std::get<0>(agg_out_ptr_w_idx),
1656  llvm::PointerType::get(llvm::Type::getInt8Ty(LL_CONTEXT), 0));
1657  output_buffer_byte_stream->setName("out_buff_b_stream");
1658  CHECK(std::get<1>(agg_out_ptr_w_idx));
1659  out_row_idx = LL_BUILDER.CreateZExt(std::get<1>(agg_out_ptr_w_idx),
1660  llvm::Type::getInt64Ty(LL_CONTEXT));
1661  out_row_idx->setName("out_row_idx");
1662  }
1663 
1664  TargetExprCodegenBuilder target_builder(ra_exe_unit_, is_group_by);
1665  for (size_t target_idx = 0; target_idx < ra_exe_unit_.target_exprs.size();
1666  ++target_idx) {
1667  auto target_expr = ra_exe_unit_.target_exprs[target_idx];
1668  CHECK(target_expr);
1669 
1670  target_builder(target_expr, executor_, query_mem_desc, co);
1671  }
1672 
1673  target_builder.codegen(this,
1674  executor_,
1675  query_mem_desc,
1676  co,
1677  gpu_smem_context,
1678  agg_out_ptr_w_idx,
1679  agg_out_vec,
1680  output_buffer_byte_stream,
1681  out_row_idx,
1682  varlen_output_buffer,
1683  diamond_codegen);
1684 
1685  for (auto target_expr : ra_exe_unit_.target_exprs) {
1686  CHECK(target_expr);
1687  executor_->plan_state_->isLazyFetchColumn(target_expr);
1688  }
1689 
1690  return can_return_error;
1691 }
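// codegenAggCalls only sets up shared state (the byte stream / row index pair
// used for columnar projection output) and delegates the per-target IR to
// TargetExprCodegenBuilder::codegen(); the trailing loop merely queries
// plan_state_->isLazyFetchColumn() for each target (the result is unused
// here), and the function currently always returns false.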
1692 
1693 /**
1694  * @brief: returns the pointer to where the aggregation should be stored.
1695  */
1696 llvm::Value* GroupByAndAggregate::codegenAggColumnPtr(
1697  llvm::Value* output_buffer_byte_stream,
1698  llvm::Value* out_row_idx,
1699  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx,
1700  const QueryMemoryDescriptor& query_mem_desc,
1701  const size_t chosen_bytes,
1702  const size_t agg_out_off,
1703  const size_t target_idx) {
1704  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1705  llvm::Value* agg_col_ptr{nullptr};
1706  if (query_mem_desc.didOutputColumnar()) {
1707  // TODO(Saman): remove the second columnar branch, and support all query description
1708  // types through the first branch. Then, input arguments should also be cleaned up
1709  if (!g_cluster &&
1710  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1711  CHECK(chosen_bytes == 1 || chosen_bytes == 2 || chosen_bytes == 4 ||
1712  chosen_bytes == 8);
1713  CHECK(output_buffer_byte_stream);
1714  CHECK(out_row_idx);
1715  size_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1716  // multiplying by chosen_bytes, i.e., << log2(chosen_bytes)
1717  auto out_per_col_byte_idx =
1718 #ifdef _WIN32
1719  LL_BUILDER.CreateShl(out_row_idx, __lzcnt(chosen_bytes) - 1);
1720 #else
1721  LL_BUILDER.CreateShl(out_row_idx, __builtin_ffs(chosen_bytes) - 1);
1722 #endif
1723  auto byte_offset = LL_BUILDER.CreateAdd(out_per_col_byte_idx,
1724  LL_INT(static_cast<int64_t>(col_off)));
1725  byte_offset->setName("out_byte_off_target_" + std::to_string(target_idx));
1726  auto output_ptr = LL_BUILDER.CreateGEP(
1727  output_buffer_byte_stream->getType()->getScalarType()->getPointerElementType(),
1728  output_buffer_byte_stream,
1729  byte_offset);
1730  agg_col_ptr = LL_BUILDER.CreateBitCast(
1731  output_ptr,
1732  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1733  agg_col_ptr->setName("out_ptr_target_" + std::to_string(target_idx));
1734  } else {
1735  auto const col_off_in_bytes = query_mem_desc.getColOffInBytes(agg_out_off);
1736  auto const col_off = col_off_in_bytes / chosen_bytes;
1737  auto const col_rem = col_off_in_bytes % chosen_bytes;
1738  CHECK_EQ(col_rem, 0u) << col_off_in_bytes << " % " << chosen_bytes;
1739  CHECK(std::get<1>(agg_out_ptr_w_idx));
1740  auto* agg_out_idx = LL_BUILDER.CreateZExt(
1741  std::get<1>(agg_out_ptr_w_idx),
1742  get_int_type(8 * sizeof(col_off), executor_->cgen_state_->context_));
1743  auto* offset = LL_BUILDER.CreateAdd(agg_out_idx, LL_INT(col_off));
1744  auto* bit_cast = LL_BUILDER.CreateBitCast(
1745  std::get<0>(agg_out_ptr_w_idx),
1746  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1747  agg_col_ptr = LL_BUILDER.CreateGEP(
1748  bit_cast->getType()->getScalarType()->getPointerElementType(),
1749  bit_cast,
1750  offset);
1751  }
1752  } else {
1753  auto const col_off_in_bytes = query_mem_desc.getColOnlyOffInBytes(agg_out_off);
1754  auto const col_off = col_off_in_bytes / chosen_bytes;
1755  auto const col_rem = col_off_in_bytes % chosen_bytes;
1756  CHECK_EQ(col_rem, 0u) << col_off_in_bytes << " % " << chosen_bytes;
1757  auto* bit_cast = LL_BUILDER.CreateBitCast(
1758  std::get<0>(agg_out_ptr_w_idx),
1759  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1760  agg_col_ptr = LL_BUILDER.CreateGEP(
1761  bit_cast->getType()->getScalarType()->getPointerElementType(),
1762  bit_cast,
1763  LL_INT(col_off));
1764  }
1765  CHECK(agg_col_ptr);
1766  return agg_col_ptr;
1767 }
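// A small worked example of the columnar-projection addressing above: the byte
// offset of a target for output row r is col_off_in_bytes + r * chosen_bytes
// (the multiply is emitted as a shift by log2(chosen_bytes)). With
// chosen_bytes = 4, col_off = 1024 and out_row_idx = 7 the slot starts at byte
// 1024 + 7 * 4 = 1052 and is then reinterpreted as an i32 pointer.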
1768 
1769 void GroupByAndAggregate::codegenEstimator(std::stack<llvm::BasicBlock*>& array_loops,
1770  DiamondCodegen& diamond_codegen,
1771  const QueryMemoryDescriptor& query_mem_desc,
1772  const CompilationOptions& co) {
1773  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1774  const auto& estimator_arg = ra_exe_unit_.estimator->getArgument();
1775  auto estimator_comp_count_lv = LL_INT(static_cast<int32_t>(estimator_arg.size()));
1776  auto estimator_key_lv = LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT),
1777  estimator_comp_count_lv);
1778  int32_t subkey_idx = 0;
1779  for (const auto& estimator_arg_comp : estimator_arg) {
1780  const auto estimator_arg_comp_lvs =
1781  executor_->groupByColumnCodegen(estimator_arg_comp.get(),
1782  query_mem_desc.getEffectiveKeyWidth(),
1783  co,
1784  false,
1785  0,
1786  diamond_codegen,
1787  array_loops,
1788  true);
1789  CHECK(!estimator_arg_comp_lvs.original_value);
1790  const auto estimator_arg_comp_lv = estimator_arg_comp_lvs.translated_value;
1791  // store the sub-key to the buffer
1792  LL_BUILDER.CreateStore(
1793  estimator_arg_comp_lv,
1794  LL_BUILDER.CreateGEP(
1795  estimator_key_lv->getType()->getScalarType()->getPointerElementType(),
1796  estimator_key_lv,
1797  LL_INT(subkey_idx++)));
1798  }
1799  const auto int8_ptr_ty = llvm::PointerType::get(get_int_type(8, LL_CONTEXT), 0);
1800  const auto bitmap = LL_BUILDER.CreateBitCast(&*ROW_FUNC->arg_begin(), int8_ptr_ty);
1801  const auto key_bytes = LL_BUILDER.CreateBitCast(estimator_key_lv, int8_ptr_ty);
1802  const auto estimator_comp_bytes_lv =
1803  LL_INT(static_cast<int32_t>(estimator_arg.size() * sizeof(int64_t)));
1804  const auto bitmap_size_lv =
1805  LL_INT(static_cast<uint32_t>(ra_exe_unit_.estimator->getBufferSize()));
1806  emitCall(ra_exe_unit_.estimator->getRuntimeFunctionName(),
1807  {bitmap, &*bitmap_size_lv, key_bytes, &*estimator_comp_bytes_lv});
1808 }
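// codegenEstimator materializes the estimator key as an on-stack array with
// one int64 slot per estimator argument, then passes the estimator bitmap
// (the row function's first argument reinterpreted as i8*), the bitmap size,
// the key bytes and the key size to the runtime function named by
// Estimator::getRuntimeFunctionName().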
1809 
1810 extern "C" RUNTIME_EXPORT void agg_count_distinct(int64_t* agg, const int64_t val) {
1811  reinterpret_cast<CountDistinctSet*>(*agg)->insert(val);
1812 }
1813 
1814 extern "C" RUNTIME_EXPORT void agg_count_distinct_skip_val(int64_t* agg,
1815  const int64_t val,
1816  const int64_t skip_val) {
1817  if (val != skip_val) {
1818  agg_count_distinct(agg, val);
1819  }
1820 }
1821 
1822 extern "C" RUNTIME_EXPORT void agg_approx_quantile(int64_t* agg, const double val) {
1823  auto* t_digest = reinterpret_cast<quantile::TDigest*>(*agg);
1824  t_digest->allocate();
1825  t_digest->add(val);
1826 }
1827 
1828 extern "C" RUNTIME_EXPORT void agg_mode_func(int64_t* agg, const int64_t val) {
1829  auto* mode_map = reinterpret_cast<AggMode*>(*agg);
1830  mode_map->add(val);
1831 }
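// In the three RUNTIME_EXPORT helpers above, *agg is not the aggregate value
// itself but a 64-bit pointer to a host-side object (CountDistinctSet,
// quantile::TDigest and AggMode respectively); each call just forwards one
// input value to that object, which is why the codegen paths below either
// require the bitmap implementation on GPU or throw QueryMustRunOnCpu.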
1832 
1833 void GroupByAndAggregate::codegenCountDistinct(
1834  const size_t target_idx,
1835  const Analyzer::Expr* target_expr,
1836  std::vector<llvm::Value*>& agg_args,
1837  const QueryMemoryDescriptor& query_mem_desc,
1838  const ExecutorDeviceType device_type) {
1839  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1840  const auto agg_info = get_target_info(target_expr, g_bigint_count);
1841  const auto& arg_ti =
1842  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1843  if (arg_ti.is_fp()) {
1844  agg_args.back() = executor_->cgen_state_->ir_builder_.CreateBitCast(
1845  agg_args.back(), get_int_type(64, executor_->cgen_state_->context_));
1846  }
1847  const auto& count_distinct_descriptor =
1848  query_mem_desc.getCountDistinctDescriptor(target_idx);
1849  CHECK(count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid);
1850  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
1851  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1852  agg_args.push_back(LL_INT(int32_t(count_distinct_descriptor.bitmap_sz_bits)));
1853  if (device_type == ExecutorDeviceType::GPU) {
1854  const auto base_dev_addr = getAdditionalLiteral(-1);
1855  const auto base_host_addr = getAdditionalLiteral(-2);
1856  agg_args.push_back(base_dev_addr);
1857  agg_args.push_back(base_host_addr);
1858  emitCall("agg_approximate_count_distinct_gpu", agg_args);
1859  } else {
1860  emitCall("agg_approximate_count_distinct", agg_args);
1861  }
1862  return;
1863  }
1864  std::string agg_fname{"agg_count_distinct"};
1865  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1866  agg_fname += "_bitmap";
1867  agg_args.push_back(LL_INT(static_cast<int64_t>(count_distinct_descriptor.min_val)));
1868  }
1869  if (agg_info.skip_null_val) {
1870  auto null_lv = executor_->cgen_state_->castToTypeIn(
1871  (arg_ti.is_fp()
1872  ? static_cast<llvm::Value*>(executor_->cgen_state_->inlineFpNull(arg_ti))
1873  : static_cast<llvm::Value*>(executor_->cgen_state_->inlineIntNull(arg_ti))),
1874  64);
1875  null_lv = executor_->cgen_state_->ir_builder_.CreateBitCast(
1876  null_lv, get_int_type(64, executor_->cgen_state_->context_));
1877  agg_fname += "_skip_val";
1878  agg_args.push_back(null_lv);
1879  }
1880  if (device_type == ExecutorDeviceType::GPU) {
1881  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1882  agg_fname += "_gpu";
1883  const auto base_dev_addr = getAdditionalLiteral(-1);
1884  const auto base_host_addr = getAdditionalLiteral(-2);
1885  agg_args.push_back(base_dev_addr);
1886  agg_args.push_back(base_host_addr);
1887  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.sub_bitmap_count)));
1888  CHECK_EQ(size_t(0),
1889  count_distinct_descriptor.bitmapPaddedSizeBytes() %
1890  count_distinct_descriptor.sub_bitmap_count);
1891  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.bitmapPaddedSizeBytes() /
1892  count_distinct_descriptor.sub_bitmap_count)));
1893  }
1894  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1895  emitCall(agg_fname, agg_args);
1896  } else {
1897  executor_->cgen_state_->emitExternalCall(
1898  agg_fname, llvm::Type::getVoidTy(LL_CONTEXT), agg_args);
1899  }
1900 }
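// The runtime function name above is assembled from suffixes, so a nullable
// bitmap-based COUNT(DISTINCT) on GPU ends up as
// "agg_count_distinct_bitmap_skip_val_gpu"; the extra arguments pushed are, in
// order, min_val (bitmap only), the 64-bit null sentinel (nullable only) and,
// on GPU, the device/host base addresses plus the sub-bitmap count and the
// per-sub-bitmap size. The UnorderedSet variant keeps the plain name and is
// emitted as an external call.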
1901 
1902 void GroupByAndAggregate::codegenApproxQuantile(
1903  const size_t target_idx,
1904  const Analyzer::Expr* target_expr,
1905  std::vector<llvm::Value*>& agg_args,
1906  const QueryMemoryDescriptor& query_mem_desc,
1907  const ExecutorDeviceType device_type) {
1908  if (device_type == ExecutorDeviceType::GPU) {
1909  throw QueryMustRunOnCpu();
1910  }
1911  llvm::BasicBlock *calc, *skip{nullptr};
1912  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1913  auto const arg_ti =
1914  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1915  bool const nullable = !arg_ti.get_notnull();
1916 
1917  auto* cs = executor_->cgen_state_.get();
1918  auto& irb = cs->ir_builder_;
1919  if (nullable) {
1920  auto* const null_value = cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
1921  auto* const skip_cond = arg_ti.is_fp()
1922  ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
1923  : irb.CreateICmpEQ(agg_args.back(), null_value);
1924  calc = llvm::BasicBlock::Create(cs->context_, "calc_approx_quantile");
1925  skip = llvm::BasicBlock::Create(cs->context_, "skip_approx_quantile");
1926  irb.CreateCondBr(skip_cond, skip, calc);
1927  cs->current_func_->getBasicBlockList().push_back(calc);
1928  irb.SetInsertPoint(calc);
1929  }
1930  if (!arg_ti.is_fp()) {
1931  auto const agg_info = get_target_info(target_expr, g_bigint_count);
1932  agg_args.back() = executor_->castToFP(agg_args.back(), arg_ti, agg_info.sql_type);
1933  }
1934  cs->emitExternalCall(
1935  "agg_approx_quantile", llvm::Type::getVoidTy(cs->context_), agg_args);
1936  if (nullable) {
1937  irb.CreateBr(skip);
1938  cs->current_func_->getBasicBlockList().push_back(skip);
1939  irb.SetInsertPoint(skip);
1940  }
1941 }
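// For a nullable argument the code above emits a two-block diamond: control
// jumps straight to "skip_approx_quantile" when the argument equals its NULL
// sentinel, otherwise "calc_approx_quantile" casts an integral argument to
// floating point and calls agg_approx_quantile; codegenMode below follows the
// same skip/calc pattern.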
1942 
1943 void GroupByAndAggregate::codegenMode(const size_t target_idx,
1944  const Analyzer::Expr* target_expr,
1945  std::vector<llvm::Value*>& agg_args,
1946  const QueryMemoryDescriptor& query_mem_desc,
1947  const ExecutorDeviceType device_type) {
1948  if (device_type == ExecutorDeviceType::GPU) {
1949  throw QueryMustRunOnCpu();
1950  }
1951  llvm::BasicBlock *calc, *skip{nullptr};
1952  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1953  auto const arg_ti =
1954  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1955  bool const nullable = !arg_ti.get_notnull();
1956  bool const is_fp = arg_ti.is_fp();
1957  auto* cs = executor_->cgen_state_.get();
1958  auto& irb = cs->ir_builder_;
1959  if (nullable) {
1960  auto* const null_value =
1961  is_fp ? cs->inlineNull(arg_ti) : cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
1962  auto* const skip_cond = is_fp ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
1963  : irb.CreateICmpEQ(agg_args.back(), null_value);
1964  calc = llvm::BasicBlock::Create(cs->context_, "calc_mode");
1965  skip = llvm::BasicBlock::Create(cs->context_, "skip_mode");
1966  irb.CreateCondBr(skip_cond, skip, calc);
1967  cs->current_func_->getBasicBlockList().push_back(calc);
1968  irb.SetInsertPoint(calc);
1969  }
1970  if (is_fp) {
1971  auto* const int_type = get_int_type(8 * arg_ti.get_size(), cs->context_);
1972  agg_args.back() = irb.CreateBitCast(agg_args.back(), int_type);
1973  }
1974  // "agg_mode" collides with existing names, so non-standard suffix "_func" is added.
1975  cs->emitExternalCall("agg_mode_func", llvm::Type::getVoidTy(cs->context_), agg_args);
1976  if (nullable) {
1977  irb.CreateBr(skip);
1978  cs->current_func_->getBasicBlockList().push_back(skip);
1979  irb.SetInsertPoint(skip);
1980  }
1981 }
1982 
1983 llvm::Value* GroupByAndAggregate::getAdditionalLiteral(const int32_t off) {
1984  CHECK_LT(off, 0);
1985  const auto lit_buff_lv = get_arg_by_name(ROW_FUNC, "literals");
1986  auto* bit_cast = LL_BUILDER.CreateBitCast(
1987  lit_buff_lv, llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0));
1988  auto* gep =
1989  LL_BUILDER.CreateGEP(bit_cast->getType()->getScalarType()->getPointerElementType(),
1990  bit_cast,
1991  LL_INT(off));
1992  return LL_BUILDER.CreateLoad(gep->getType()->getPointerElementType(), gep);
1993 }
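// getAdditionalLiteral reads an int64 slot at a negative offset from the row
// function's "literals" argument; the count-distinct GPU path above uses
// offsets -1 and -2 for the device and host bitmap base addresses.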
1994 
1995 std::vector<llvm::Value*> GroupByAndAggregate::codegenAggArg(
1996  const Analyzer::Expr* target_expr,
1997  const CompilationOptions& co) {
1998  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1999  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
2000  const auto func_expr = dynamic_cast<const Analyzer::FunctionOper*>(target_expr);
2001  const auto arr_expr = dynamic_cast<const Analyzer::ArrayExpr*>(target_expr);
2002 
2003  // TODO(alex): handle arrays uniformly?
2004  CodeGenerator code_generator(executor_);
2005  if (target_expr) {
2006  const auto& target_ti = target_expr->get_type_info();
2007  if (target_ti.is_buffer() &&
2008  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
2009  const auto target_lvs =
2010  agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
2011  : code_generator.codegen(
2012  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
2013  if (!func_expr && !arr_expr) {
2014  // Anything that goes through the chunk transport here was generated from a
2015  // source other than an ARRAY[] expression
2016  if (target_ti.is_bytes()) {
2017  CHECK_EQ(size_t(3), target_lvs.size());
2018  return {target_lvs[1], target_lvs[2]};
2019  }
2020  CHECK(target_ti.is_array());
2021  CHECK_EQ(size_t(1), target_lvs.size());
2022  CHECK(!agg_expr || agg_expr->get_aggtype() == kSAMPLE);
2023  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
2024  const auto i8p_ty =
2025  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
2026  const auto& elem_ti = target_ti.get_elem_type();
2027  return {
2028  executor_->cgen_state_->emitExternalCall(
2029  "array_buff",
2030  i8p_ty,
2031  {target_lvs.front(), code_generator.posArg(target_expr)}),
2032  executor_->cgen_state_->emitExternalCall(
2033  "array_size",
2034  i32_ty,
2035  {target_lvs.front(),
2036  code_generator.posArg(target_expr),
2037  executor_->cgen_state_->llInt(log2_bytes(elem_ti.get_logical_size()))})};
2038  } else {
2039  if (agg_expr) {
2040  throw std::runtime_error(
2041  "Using array[] operator as argument to an aggregate operator is not "
2042  "supported");
2043  }
2044  CHECK(func_expr || arr_expr);
2045  if (dynamic_cast<const Analyzer::FunctionOper*>(target_expr)) {
2046  CHECK_EQ(size_t(1), target_lvs.size());
2047  const auto prefix = target_ti.get_buffer_name();
2048  CHECK(target_ti.is_array() || target_ti.is_bytes());
2049  const auto target_lv = LL_BUILDER.CreateLoad(
2050  target_lvs[0]->getType()->getPointerElementType(), target_lvs[0]);
2051  // const auto target_lv_type = target_lvs[0]->getType();
2052  // CHECK(target_lv_type->isStructTy());
2053  // CHECK_EQ(target_lv_type->getNumContainedTypes(), 3u);
2054  const auto i8p_ty = llvm::PointerType::get(
2055  get_int_type(8, executor_->cgen_state_->context_), 0);
2056  const auto ptr = LL_BUILDER.CreatePointerCast(
2057  LL_BUILDER.CreateExtractValue(target_lv, 0), i8p_ty);
2058  const auto size = LL_BUILDER.CreateExtractValue(target_lv, 1);
2059  const auto null_flag = LL_BUILDER.CreateExtractValue(target_lv, 2);
2060  const auto nullcheck_ok_bb =
2061  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_nullcheck_ok_bb", CUR_FUNC);
2062  const auto nullcheck_fail_bb = llvm::BasicBlock::Create(
2063  LL_CONTEXT, prefix + "_nullcheck_fail_bb", CUR_FUNC);
2064 
2065  // TODO(adb): probably better to zext the bool
2066  const auto nullcheck = LL_BUILDER.CreateICmpEQ(
2067  null_flag, executor_->cgen_state_->llInt(static_cast<int8_t>(1)));
2068  LL_BUILDER.CreateCondBr(nullcheck, nullcheck_fail_bb, nullcheck_ok_bb);
2069 
2070  const auto ret_bb =
2071  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_return", CUR_FUNC);
2072  LL_BUILDER.SetInsertPoint(ret_bb);
2073  auto result_phi = LL_BUILDER.CreatePHI(i8p_ty, 2, prefix + "_ptr_return");
2074  result_phi->addIncoming(ptr, nullcheck_ok_bb);
2075  const auto null_arr_sentinel = LL_BUILDER.CreateIntToPtr(
2076  executor_->cgen_state_->llInt(static_cast<int8_t>(0)), i8p_ty);
2077  result_phi->addIncoming(null_arr_sentinel, nullcheck_fail_bb);
2078  LL_BUILDER.SetInsertPoint(nullcheck_ok_bb);
2079  executor_->cgen_state_->emitExternalCall(
2080  "register_buffer_with_executor_rsm",
2081  llvm::Type::getVoidTy(executor_->cgen_state_->context_),
2082  {executor_->cgen_state_->llInt(reinterpret_cast<int64_t>(executor_)), ptr});
2083  LL_BUILDER.CreateBr(ret_bb);
2084  LL_BUILDER.SetInsertPoint(nullcheck_fail_bb);
2085  LL_BUILDER.CreateBr(ret_bb);
2086 
2087  LL_BUILDER.SetInsertPoint(ret_bb);
2088  return {result_phi, size};
2089  }
2090  CHECK_EQ(size_t(2), target_lvs.size());
2091  return {target_lvs[0], target_lvs[1]};
2092  }
2093  }
2094  if (target_ti.is_geometry() &&
2095  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
2096  auto generate_coord_lvs =
2097  [&](auto* selected_target_expr,
2098  bool const fetch_columns) -> std::vector<llvm::Value*> {
2099  const auto target_lvs =
2100  code_generator.codegen(selected_target_expr, fetch_columns, co);
2101  if (dynamic_cast<const Analyzer::GeoOperator*>(target_expr) &&
2102  target_expr->get_type_info().is_geometry()) {
2103  // return a pointer to the temporary alloca
2104  return target_lvs;
2105  }
2106  const auto geo_uoper = dynamic_cast<const Analyzer::GeoUOper*>(target_expr);
2107  const auto geo_binoper = dynamic_cast<const Analyzer::GeoBinOper*>(target_expr);
2108  if (geo_uoper || geo_binoper) {
2109  CHECK(target_expr->get_type_info().is_geometry());
2110  CHECK_EQ(2 * static_cast<size_t>(target_ti.get_physical_coord_cols()),
2111  target_lvs.size());
2112  return target_lvs;
2113  }
2114  CHECK_EQ(static_cast<size_t>(target_ti.get_physical_coord_cols()),
2115  target_lvs.size());
2116 
2117  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
2118  const auto i8p_ty =
2119  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
2120  std::vector<llvm::Value*> coords;
2121  size_t ctr = 0;
2122  for (const auto& target_lv : target_lvs) {
2123  // TODO(adb): consider adding a utility to sqltypes so we can get the types of
2124  // the physical coords cols based on the sqltype (e.g. TINYINT for col 0, INT
2125  // for col 1 for pols / mpolys, etc). Hardcoding for now. first array is the
2126  // coords array (TINYINT). Subsequent arrays are regular INT.
2127 
2128  const size_t elem_sz = ctr == 0 ? 1 : 4;
2129  ctr++;
2130  int32_t fixlen = -1;
2131  if (target_ti.get_type() == kPOINT) {
2132  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
2133  if (col_var) {
2134  const auto coords_cd = executor_->getPhysicalColumnDescriptor(col_var, 1);
2135  if (coords_cd && coords_cd->columnType.get_type() == kARRAY) {
2136  fixlen = coords_cd->columnType.get_size();
2137  }
2138  }
2139  }
2140  if (fixlen > 0) {
2141  coords.push_back(executor_->cgen_state_->emitExternalCall(
2142  "fast_fixlen_array_buff",
2143  i8p_ty,
2144  {target_lv, code_generator.posArg(selected_target_expr)}));
2145  coords.push_back(executor_->cgen_state_->llInt(int64_t(fixlen)));
2146  continue;
2147  }
2148  coords.push_back(executor_->cgen_state_->emitExternalCall(
2149  "array_buff",
2150  i8p_ty,
2151  {target_lv, code_generator.posArg(selected_target_expr)}));
2152  coords.push_back(executor_->cgen_state_->emitExternalCall(
2153  "array_size",
2154  i32_ty,
2155  {target_lv,
2156  code_generator.posArg(selected_target_expr),
2157  executor_->cgen_state_->llInt(log2_bytes(elem_sz))}));
2158  }
2159  return coords;
2160  };
2161 
2162  if (agg_expr) {
2163  return generate_coord_lvs(agg_expr->get_arg(), true);
2164  } else {
2165  return generate_coord_lvs(target_expr,
2166  !executor_->plan_state_->allow_lazy_fetch_);
2167  }
2168  }
2169  }
2170  return agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
2171  : code_generator.codegen(
2172  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
2173 }
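// codegenAggArg returns one llvm::Value* per physical slot the argument
// occupies: a {buffer pointer, size} pair for varlen/array arguments, one such
// pair per physical coordinate column for geometry, and the plain codegen
// result for scalar arguments; lazily fetched columns skip the buffer and
// geometry special cases and fall through to the generic codegen at the end.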
2174 
2175 llvm::Value* GroupByAndAggregate::emitCall(const std::string& fname,
2176  const std::vector<llvm::Value*>& args) {
2177  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2178  return executor_->cgen_state_->emitCall(fname, args);
2179 }
2180 
2181 void GroupByAndAggregate::checkErrorCode(llvm::Value* retCode) {
2182  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2183  auto zero_const = llvm::ConstantInt::get(retCode->getType(), 0, true);
2184  auto rc_check_condition = executor_->cgen_state_->ir_builder_.CreateICmp(
2185  llvm::ICmpInst::ICMP_EQ, retCode, zero_const);
2186 
2187  executor_->cgen_state_->emitErrorCheck(rc_check_condition, retCode, "rc");
2188 }
2189 
2190 #undef CUR_FUNC
2191 #undef ROW_FUNC
2192 #undef LL_FP
2193 #undef LL_INT
2194 #undef LL_BOOL
2195 #undef LL_BUILDER
2196 #undef LL_CONTEXT
2197 
2198 size_t GroupByAndAggregate::shard_count_for_top_groups(
2199  const RelAlgExecutionUnit& ra_exe_unit) {
2200  if (ra_exe_unit.sort_info.order_entries.size() != 1 || !ra_exe_unit.sort_info.limit) {
2201  return 0;
2202  }
2203  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
2204  const auto grouped_col_expr =
2205  dynamic_cast<const Analyzer::ColumnVar*>(group_expr.get());
2206  if (!grouped_col_expr) {
2207  continue;
2208  }
2209  const auto& column_key = grouped_col_expr->getColumnKey();
2210  if (column_key.table_id <= 0) {
2211  return 0;
2212  }
2213  const auto td = Catalog_Namespace::get_metadata_for_table(
2214  {column_key.db_id, column_key.table_id});
2215  if (td->shardedColumnId == column_key.column_id) {
2216  return td->nShards;
2217  }
2218  }
2219  return 0;
2220 }
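// shard_count_for_top_groups only reports a shard count for the "top-k over a
// sharded group-by column" pattern: exactly one ORDER BY entry plus a LIMIT,
// grouping on a physical column of a table that is sharded on that same
// column; every other shape returns 0.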