OmniSciDB  085a039ca4
GroupByAndAggregate.cpp
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "GroupByAndAggregate.h"
18 #include "AggregateUtils.h"
19 
20 #include "CardinalityEstimator.h"
21 #include "CodeGenerator.h"
23 #include "ExpressionRange.h"
24 #include "ExpressionRewrite.h"
25 #include "GpuInitGroups.h"
26 #include "InPlaceSort.h"
28 #include "MaxwellCodegenPatch.h"
30 #include "TargetExprBuilder.h"
31 
32 #include "../CudaMgr/CudaMgr.h"
33 #include "../Shared/checked_alloc.h"
34 #include "../Shared/funcannotations.h"
35 #include "../Utils/ChunkIter.h"
37 #include "Execute.h"
38 #include "QueryTemplateGenerator.h"
39 #include "RuntimeFunctions.h"
40 #include "StreamingTopN.h"
41 #include "TopKSort.h"
42 #include "WindowContext.h"
43 
44 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
45 
46 #include <cstring> // strcat()
47 #include <limits>
48 #include <numeric>
49 #include <string_view>
50 #include <thread>
51 
52 bool g_cluster{false};
53 bool g_bigint_count{false};
56 extern int64_t g_bitmap_memory_limit;
57 extern size_t g_leaf_count;
58 
59 namespace {
60 
61 int32_t get_agg_count(const std::vector<Analyzer::Expr*>& target_exprs) {
62  int32_t agg_count{0};
63  for (auto target_expr : target_exprs) {
64  CHECK(target_expr);
65  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
66  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
67  const auto& ti = target_expr->get_type_info();
68  if (ti.is_buffer()) {
69  agg_count += 2;
70  } else if (ti.is_geometry()) {
71  agg_count += ti.get_physical_coord_cols() * 2;
72  } else {
73  ++agg_count;
74  }
75  continue;
76  }
77  if (agg_expr && agg_expr->get_aggtype() == kAVG) {
78  agg_count += 2;
79  } else {
80  ++agg_count;
81  }
82  }
83  return agg_count;
84 }
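// Editorial note (not in the original source): get_agg_count() above counts output
// slots rather than SQL aggregates. For a hypothetical query
//   SELECT AVG(x), COUNT(*), arr_col FROM t;
// AVG contributes 2 slots (sum and count), COUNT contributes 1, and the array (buffer)
// projection contributes 2 (pointer and length), for a total of 5.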
85 
86 bool expr_is_rowid(const Analyzer::Expr* expr, const Catalog_Namespace::Catalog& cat) {
87  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
88  if (!col) {
89  return false;
90  }
91  const auto cd =
92  get_column_descriptor_maybe(col->get_column_id(), col->get_table_id(), cat);
93  if (!cd || !cd->isVirtualCol) {
94  return false;
95  }
96  CHECK_EQ("rowid", cd->columnName);
97  return true;
98 }
99 
100 bool has_count_distinct(const RelAlgExecutionUnit& ra_exe_unit) {
101  for (const auto& target_expr : ra_exe_unit.target_exprs) {
102  const auto agg_info = get_target_info(target_expr, g_bigint_count);
103  if (agg_info.is_agg && is_distinct_target(agg_info)) {
104  return true;
105  }
106  }
107  return false;
108 }
109 
110 bool is_column_range_too_big_for_perfect_hash(const ColRangeInfo& col_range_info,
111  const int64_t max_entry_count) {
112  try {
113  return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
114  checked_int64_t(col_range_info.min)) >= max_entry_count;
115  } catch (...) {
116  return true;
117  }
118 }
119 
120 bool cardinality_estimate_less_than_column_range(const int64_t cardinality_estimate,
121  const ColRangeInfo& col_range_info) {
122  try {
123  // the cardinality estimate is the size of the baseline hash table. further penalize
124  // the baseline hash table by a factor of 2x due to overhead in computing baseline
125  // hash. This has the overall effect of penalizing baseline hash over perfect hash by
126  // 4x; i.e. if the cardinality of the filtered data is less than 25% of the entry
127  // count of the column, we use baseline hash on the filtered set
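// Worked example (editorial): with an illustrative column range of [0, 1000) and a
// filtered-cardinality estimate of 200, the check below computes 200 * 2 = 400, which
// is less than 1000, so the baseline hash layout would be chosen for the filtered set.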
128  return checked_int64_t(cardinality_estimate) * 2 <
129  static_cast<int64_t>(checked_int64_t(col_range_info.max) -
130  checked_int64_t(col_range_info.min));
131  } catch (...) {
132  return false;
133  }
134 }
135 
136 ColRangeInfo get_expr_range_info(const RelAlgExecutionUnit& ra_exe_unit,
137  const std::vector<InputTableInfo>& query_infos,
138  const Analyzer::Expr* expr,
139  Executor* executor) {
140  if (!expr) {
141  return {QueryDescriptionType::Projection, 0, 0, 0, false};
142  }
143 
144  const auto expr_range = getExpressionRange(
145  expr, query_infos, executor, boost::make_optional(ra_exe_unit.simple_quals));
146  switch (expr_range.getType()) {
147  case ExpressionRangeType::Integer: {
148  if (expr_range.getIntMin() > expr_range.getIntMax()) {
149  return {
150  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
151  }
152  return {QueryDescriptionType::GroupByPerfectHash,
153  expr_range.getIntMin(),
154  expr_range.getIntMax(),
155  expr_range.getBucket(),
156  expr_range.hasNulls()};
157  }
158  case ExpressionRangeType::Float:
159  case ExpressionRangeType::Double: {
160  if (expr_range.getFpMin() > expr_range.getFpMax()) {
161  return {
162  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
163  }
164  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
165  }
166  case ExpressionRangeType::Invalid:
167  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
168  default:
169  CHECK(false);
170  }
171  CHECK(false);
172  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
173 }
174 
175 } // namespace
176 
177 ColRangeInfo GroupByAndAggregate::getColRangeInfo() {
178  // Use baseline layout more eagerly on the GPU if the query uses count distinct,
179  // because our HyperLogLog implementation is 4x less memory efficient on GPU.
180  // Technically, this only applies to APPROX_COUNT_DISTINCT, but in practice we
181  // can expect this to be true anyway for grouped queries since the precise version
182  // uses significantly more memory.
183  const int64_t baseline_threshold =
188  if (ra_exe_unit_.groupby_exprs.size() != 1) {
189  try {
190  checked_int64_t cardinality{1};
191  bool has_nulls{false};
192  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
193  auto col_range_info = get_expr_range_info(
194  ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
195  if (col_range_info.hash_type_ != QueryDescriptionType::GroupByPerfectHash) {
196  // going through baseline hash if a non-integer type is encountered
197  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
198  }
199  auto crt_col_cardinality = getBucketedCardinality(col_range_info);
200  CHECK_GE(crt_col_cardinality, 0);
201  cardinality *= crt_col_cardinality;
202  if (col_range_info.has_nulls) {
203  has_nulls = true;
204  }
205  }
206  // For zero or high cardinalities, use baseline layout.
207  if (!cardinality || cardinality > baseline_threshold) {
208  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
209  }
210  return {QueryDescriptionType::GroupByPerfectHash,
211  0,
212  int64_t(cardinality),
213  0,
214  has_nulls};
215  } catch (...) { // overflow when computing cardinality
216  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
217  }
218  }
219  // For a single-column group by on high-precision timestamps, force baseline hash due to the wide ranges
220  // we are likely to encounter when applying quals to the expression range
221  // TODO: consider allowing TIMESTAMP(9) (nanoseconds) with quals to use perfect hash if
222  // the range is small enough
223  if (ra_exe_unit_.groupby_exprs.front() &&
224  ra_exe_unit_.groupby_exprs.front()->get_type_info().is_high_precision_timestamp() &&
225  ra_exe_unit_.simple_quals.size() > 0) {
226  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
227  }
228  const auto col_range_info = get_expr_range_info(
229  ra_exe_unit_, query_infos_, ra_exe_unit_.groupby_exprs.front().get(), executor_);
230  if (!ra_exe_unit_.groupby_exprs.front()) {
231  return col_range_info;
232  }
233  static const int64_t MAX_BUFFER_SIZE = 1 << 30;
234  const int64_t col_count =
235  ra_exe_unit_.groupby_exprs.size() + ra_exe_unit_.target_exprs.size();
236  int64_t max_entry_count = MAX_BUFFER_SIZE / (col_count * sizeof(int64_t));
237  if (has_count_distinct(ra_exe_unit_)) {
238  max_entry_count = std::min(max_entry_count, baseline_threshold);
239  }
240  const auto& groupby_expr_ti = ra_exe_unit_.groupby_exprs.front()->get_type_info();
241  if (groupby_expr_ti.is_string() && !col_range_info.bucket) {
242  CHECK(groupby_expr_ti.get_compression() == kENCODING_DICT);
243 
244  const bool has_filters =
245  !ra_exe_unit_.quals.empty() || !ra_exe_unit_.simple_quals.empty();
246  if (has_filters &&
247  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
248  // if filters are present, we can use the filter to narrow the cardinality of the
249  // group by in the case of ranges too big for perfect hash. Otherwise, we are better
250  // off attempting perfect hash (since we know the range will be made of
251  // monotonically increasing numbers from min to max for dictionary encoded strings)
252  // and failing later due to excessive memory use.
253  // Check the conditions where baseline hash can provide a performance increase and
254  // return baseline hash (potentially forcing an estimator query) as the range type.
255  // Otherwise, return col_range_info which will likely be perfect hash, though could
256  // be baseline from a previous call of this function prior to the estimator query.
257  if (!ra_exe_unit_.sort_info.order_entries.empty()) {
258  // TODO(adb): allow some sorts to pass through this block by centralizing sort
259  // algorithm decision making
260  if (has_count_distinct(ra_exe_unit_) &&
261  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
262  // always use baseline hash for column range too big for perfect hash with count
263  // distinct descriptors. We will need 8GB of CPU memory minimum for the perfect
264  // hash group by in this case.
265  return {QueryDescriptionType::GroupByBaselineHash,
266  col_range_info.min,
267  col_range_info.max,
268  0,
269  col_range_info.has_nulls};
270  } else {
271  // use original col range for sort
272  return col_range_info;
273  }
274  }
275  // if filters are present and the filtered range is less than the cardinality of
276  // the column, consider baseline hash
277  if (group_cardinality_estimation_ &&
278  cardinality_estimate_less_than_column_range(*group_cardinality_estimation_,
279  col_range_info)) {
280  return {QueryDescriptionType::GroupByBaselineHash,
281  col_range_info.min,
282  col_range_info.max,
283  0,
284  col_range_info.has_nulls};
285  }
286  }
287  } else if ((!expr_is_rowid(ra_exe_unit_.groupby_exprs.front().get(),
288  *executor_->catalog_)) &&
289  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count) &&
290  !col_range_info.bucket) {
291  return {QueryDescriptionType::GroupByBaselineHash,
292  col_range_info.min,
293  col_range_info.max,
294  0,
295  col_range_info.has_nulls};
296  }
297  return col_range_info;
298 }
299 
300 int64_t GroupByAndAggregate::getBucketedCardinality(const ColRangeInfo& col_range_info) {
301  checked_int64_t crt_col_cardinality =
302  checked_int64_t(col_range_info.max) - checked_int64_t(col_range_info.min);
303  if (col_range_info.bucket) {
304  crt_col_cardinality /= col_range_info.bucket;
305  }
306  return static_cast<int64_t>(crt_col_cardinality +
307  (1 + (col_range_info.has_nulls ? 1 : 0)));
308 }
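// Worked example (editorial): for an illustrative ColRangeInfo with min = 10, max = 100,
// bucket = 5 and has_nulls = true, getBucketedCardinality() above returns
// (100 - 10) / 5 + 1 + 1 = 20: the bucketed span, one extra slot because the range is
// inclusive, and one slot for the null key.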
309 
310 namespace {
311 // Like getBucketedCardinality() without counting nulls.
312 int64_t get_bucketed_cardinality_without_nulls(const ColRangeInfo& col_range_info) {
313  if (col_range_info.min <= col_range_info.max) {
314  size_t size = col_range_info.max - col_range_info.min;
315  if (col_range_info.bucket) {
316  size /= col_range_info.bucket;
317  }
318  CHECK_LT(size, std::numeric_limits<int64_t>::max());
319  return static_cast<int64_t>(size + 1);
320  } else {
321  return 0;
322  }
323 }
324 } // namespace
325 
326 #define LL_CONTEXT executor_->cgen_state_->context_
327 #define LL_BUILDER executor_->cgen_state_->ir_builder_
328 #define LL_BOOL(v) executor_->cgen_state_->llBool(v)
329 #define LL_INT(v) executor_->cgen_state_->llInt(v)
330 #define LL_FP(v) executor_->cgen_state_->llFp(v)
331 #define ROW_FUNC executor_->cgen_state_->row_func_
332 #define CUR_FUNC executor_->cgen_state_->current_func_
333 
334 GroupByAndAggregate::GroupByAndAggregate(
335  Executor* executor,
336  const ExecutorDeviceType device_type,
337  const RelAlgExecutionUnit& ra_exe_unit,
338  const std::vector<InputTableInfo>& query_infos,
339  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
340  const std::optional<int64_t>& group_cardinality_estimation)
341  : executor_(executor)
342  , ra_exe_unit_(ra_exe_unit)
343  , query_infos_(query_infos)
344  , row_set_mem_owner_(row_set_mem_owner)
345  , device_type_(device_type)
346  , group_cardinality_estimation_(group_cardinality_estimation) {
347  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
348  if (!groupby_expr) {
349  continue;
350  }
351  const auto& groupby_ti = groupby_expr->get_type_info();
352  if (groupby_ti.is_bytes()) {
353  throw std::runtime_error(
354  "Cannot group by string columns which are not dictionary encoded.");
355  }
356  if (groupby_ti.is_buffer()) {
357  throw std::runtime_error("Group by buffer not supported");
358  }
359  if (groupby_ti.is_geometry()) {
360  throw std::runtime_error("Group by geometry not supported");
361  }
362  }
363 }
364 
365 int64_t GroupByAndAggregate::getShardedTopBucket(const ColRangeInfo& col_range_info,
366  const size_t shard_count) const {
367  size_t device_count{0};
368  if (device_type_ == ExecutorDeviceType::GPU) {
369  device_count = executor_->cudaMgr()->getDeviceCount();
370  CHECK_GT(device_count, 0u);
371  }
372 
373  int64_t bucket{col_range_info.bucket};
374 
375  if (shard_count) {
376  CHECK(!col_range_info.bucket);
377  /*
378  when a node has fewer devices than shard count,
379  a) In a distributed setup, the minimum distance between two keys would be
380  device_count, because shards are stored consecutively across the physical tables.
381  For example, if a shard column has values 0 to 9 and there are 3 shards on each leaf,
382  node 1 would hold the values 0,1,2,6,7,8 and node 2 would hold 3,4,5,9. If each leaf
383  node has only 1 device, all of that node's keys end up loaded on that single
384  device.
385 
386  b) In a single-node setup, the minimum distance is the smaller of device_count and
387  shard_count - device_count. For example, for a single-node server running on 3
388  devices with a shard column holding values 0 to 9 in a table with 4 shards, the
389  device-to-fragment key mapping would be: device 1 - 4,8,3,7; device 2 - 1,5,9;
390  device 3 - 2,6. The bucket value would be 4 (shards) - 3 (devices) = 1, i.e. the
391  minimum of device_count and that difference.
392 
393  When a node has device count equal to or more than shard count then the
394  minimum distance is always at least shard_count * no of leaf nodes.
395  */
396  if (device_count < shard_count) {
397  bucket = g_leaf_count ? std::max(device_count, static_cast<size_t>(1))
398  : std::min(device_count, shard_count - device_count);
399  } else {
400  bucket = shard_count * std::max(g_leaf_count, static_cast<size_t>(1));
401  }
402  }
403 
404  return bucket;
405 }
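// Worked example (editorial): on a single node (g_leaf_count == 0) with 3 GPUs and a
// table with 4 shards, device_count < shard_count, so the bucket above becomes
// std::min(3, 4 - 3) = 1; with 8 GPUs it would instead be shard_count * 1 = 4.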
406 
407 namespace {
408 
418 KeylessInfo get_keyless_info(const RelAlgExecutionUnit& ra_exe_unit,
419  const std::vector<InputTableInfo>& query_infos,
420  const bool is_group_by,
421  Executor* executor) {
422  bool keyless{true}, found{false};
423  int32_t num_agg_expr{0};
424  int32_t index{0};
425  for (const auto target_expr : ra_exe_unit.target_exprs) {
426  const auto agg_info = get_target_info(target_expr, g_bigint_count);
427  const auto chosen_type = get_compact_type(agg_info);
428  if (agg_info.is_agg) {
429  num_agg_expr++;
430  }
431  if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
432  auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
433  CHECK(agg_expr);
434  const auto arg_expr = agg_arg(target_expr);
435  const bool float_argument_input = takes_float_argument(agg_info);
436  switch (agg_info.agg_kind) {
437  case kAVG:
438  ++index;
439  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
440  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
441  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
442  expr_range_info.hasNulls()) {
443  break;
444  }
445  }
446  found = true;
447  break;
448  case kCOUNT:
449  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
450  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
451  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
452  expr_range_info.hasNulls()) {
453  break;
454  }
455  }
456  found = true;
457  break;
458  case kSUM: {
459  auto arg_ti = arg_expr->get_type_info();
460  if (constrained_not_null(arg_expr, ra_exe_unit.quals)) {
461  arg_ti.set_notnull(true);
462  }
463  if (!arg_ti.get_notnull()) {
464  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
465  if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
466  !expr_range_info.hasNulls()) {
467  found = true;
468  }
469  } else {
470  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
471  switch (expr_range_info.getType()) {
472  case ExpressionRangeType::Float:
473  case ExpressionRangeType::Double:
474  if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
475  found = true;
476  }
477  break;
478  case ExpressionRangeType::Integer:
479  if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
480  found = true;
481  }
482  break;
483  default:
484  break;
485  }
486  }
487  break;
488  }
489  case kMIN: {
490  CHECK(agg_expr && agg_expr->get_arg());
491  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
492  if (arg_ti.is_string() || arg_ti.is_buffer()) {
493  break;
494  }
495  auto expr_range_info =
496  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
497  auto init_max = get_agg_initial_val(agg_info.agg_kind,
498  chosen_type,
499  is_group_by || float_argument_input,
500  float_argument_input ? sizeof(float) : 8);
501  switch (expr_range_info.getType()) {
502  case ExpressionRangeType::Float:
503  case ExpressionRangeType::Double: {
504  auto double_max =
505  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
506  if (expr_range_info.getFpMax() < double_max) {
507  found = true;
508  }
509  break;
510  }
511  case ExpressionRangeType::Integer:
512  if (expr_range_info.getIntMax() < init_max) {
513  found = true;
514  }
515  break;
516  default:
517  break;
518  }
519  break;
520  }
521  case kMAX: {
522  CHECK(agg_expr && agg_expr->get_arg());
523  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
524  if (arg_ti.is_string() || arg_ti.is_buffer()) {
525  break;
526  }
527  auto expr_range_info =
528  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
529  // NULL sentinel and init value for kMAX are identical, which results in
530  // ambiguity in detecting empty keys in presence of nulls.
531  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
532  expr_range_info.hasNulls()) {
533  break;
534  }
535  auto init_min = get_agg_initial_val(agg_info.agg_kind,
536  chosen_type,
537  is_group_by || float_argument_input,
538  float_argument_input ? sizeof(float) : 8);
539  switch (expr_range_info.getType()) {
540  case ExpressionRangeType::Float:
541  case ExpressionRangeType::Double: {
542  auto double_min =
543  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
544  if (expr_range_info.getFpMin() > double_min) {
545  found = true;
546  }
547  break;
548  }
549  case ExpressionRangeType::Integer:
550  if (expr_range_info.getIntMin() > init_min) {
551  found = true;
552  }
553  break;
554  default:
555  break;
556  }
557  break;
558  }
559  default:
560  keyless = false;
561  break;
562  }
563  }
564  if (!keyless) {
565  break;
566  }
567  if (!found) {
568  ++index;
569  }
570  }
571 
572  // shouldn't use keyless for projection only
573  return {
574  keyless && found,
575  index,
576  };
577 }
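// Illustrative example (editorial): for a hypothetical query such as
//   SELECT x, COUNT(*) FROM t GROUP BY x;
// the COUNT(*) target passes the checks above, so a keyless layout is possible and the
// returned index identifies that aggregate as the one used to detect empty entries;
// a query whose aggregates all fail the checks keeps keyless == false.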
578 
579 CountDistinctDescriptors init_count_distinct_descriptors(
580  const RelAlgExecutionUnit& ra_exe_unit,
581  const std::vector<InputTableInfo>& query_infos,
582  const ExecutorDeviceType device_type,
583  Executor* executor) {
584  CountDistinctDescriptors count_distinct_descriptors;
585  for (const auto target_expr : ra_exe_unit.target_exprs) {
586  auto agg_info = get_target_info(target_expr, g_bigint_count);
587  if (is_distinct_target(agg_info)) {
588  CHECK(agg_info.is_agg);
589  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
590  const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
591  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
592  if (arg_ti.is_bytes()) {
593  throw std::runtime_error(
594  "Strings must be dictionary-encoded for COUNT(DISTINCT).");
595  }
596  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_buffer()) {
597  throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
598  }
599  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
600  throw std::runtime_error(
601  "APPROX_COUNT_DISTINCT on geometry columns not supported");
602  }
603  if (agg_info.is_distinct && arg_ti.is_geometry()) {
604  throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
605  }
606  ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
607  auto arg_range_info =
608  arg_ti.is_fp() ? no_range_info
609  : get_expr_range_info(
610  ra_exe_unit, query_infos, agg_expr->get_arg(), executor);
611  CountDistinctImplType count_distinct_impl_type{CountDistinctImplType::UnorderedSet};
612  int64_t bitmap_sz_bits{0};
613  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
614  const auto error_rate = agg_expr->get_arg1();
615  if (error_rate) {
616  CHECK(error_rate->get_type_info().get_type() == kINT);
617  CHECK_GE(error_rate->get_constval().intval, 1);
618  bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
619  } else {
620  bitmap_sz_bits = g_hll_precision_bits;
621  }
622  }
623  if (arg_range_info.isEmpty()) {
624  count_distinct_descriptors.emplace_back(
626  0,
627  64,
628  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
629  device_type,
630  1});
631  continue;
632  }
633  if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
634  !(arg_ti.is_buffer() || arg_ti.is_geometry())) { // TODO(alex): allow bitmap
635  // implementation for arrays
636  count_distinct_impl_type = CountDistinctImplType::Bitmap;
637  if (agg_info.agg_kind == kCOUNT) {
638  bitmap_sz_bits = get_bucketed_cardinality_without_nulls(arg_range_info);
639  if (bitmap_sz_bits <= 0 || g_bitmap_memory_limit <= bitmap_sz_bits) {
640  count_distinct_impl_type = CountDistinctImplType::UnorderedSet;
641  }
642  }
643  }
644  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
645  count_distinct_impl_type == CountDistinctImplType::UnorderedSet &&
646  !(arg_ti.is_array() || arg_ti.is_geometry())) {
647  count_distinct_impl_type = CountDistinctImplType::Bitmap;
648  }
649 
650  if (g_enable_watchdog && !(arg_range_info.isEmpty()) &&
651  count_distinct_impl_type == CountDistinctImplType::UnorderedSet) {
652  throw WatchdogException("Cannot use a fast path for COUNT distinct");
653  }
654  const auto sub_bitmap_count =
655  get_count_distinct_sub_bitmap_count(bitmap_sz_bits, ra_exe_unit, device_type);
656  count_distinct_descriptors.emplace_back(
657  CountDistinctDescriptor{count_distinct_impl_type,
658  arg_range_info.min,
659  bitmap_sz_bits,
660  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
661  device_type,
662  sub_bitmap_count});
663  } else {
664  count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
665  CountDistinctImplType::Invalid, 0, 0, false, device_type, 0});
666  }
667  }
668  return count_distinct_descriptors;
669 }
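// Illustrative note (editorial): for a hypothetical COUNT(DISTINCT int_col) whose
// expression range is known and small enough to stay under g_bitmap_memory_limit, the
// code above picks CountDistinctImplType::Bitmap with roughly one bit per possible
// value; a wide or unknown range (or an array argument) falls back to UnorderedSet,
// which the watchdog rejects when enabled.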
670 
671 } // namespace
672 
673 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptor(
674  const bool allow_multifrag,
675  const size_t max_groups_buffer_entry_count,
676  const int8_t crt_min_byte_width,
677  RenderInfo* render_info,
678  const bool output_columnar_hint) {
679  const auto shard_count =
682  : 0;
683  bool sort_on_gpu_hint =
684  device_type_ == ExecutorDeviceType::GPU && allow_multifrag &&
687  // must_use_baseline_sort is true iff we'd sort on GPU with the old algorithm
688  // but the total output buffer size would be too big or it's a sharded top query.
689  // For the sake of managing risk, use the new result set way very selectively for
690  // this case only (alongside the baseline layout we've enabled for a while now).
691  bool must_use_baseline_sort = shard_count;
692  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
693  while (true) {
694  query_mem_desc = initQueryMemoryDescriptorImpl(allow_multifrag,
695  max_groups_buffer_entry_count,
696  crt_min_byte_width,
697  sort_on_gpu_hint,
698  render_info,
699  must_use_baseline_sort,
700  output_columnar_hint);
701  CHECK(query_mem_desc);
702  if (query_mem_desc->sortOnGpu() &&
703  (query_mem_desc->getBufferSizeBytes(device_type_) +
704  align_to_int64(query_mem_desc->getEntryCount() * sizeof(int32_t))) >
705  2 * 1024 * 1024 * 1024LL) {
706  must_use_baseline_sort = true;
707  sort_on_gpu_hint = false;
708  } else {
709  break;
710  }
711  }
712  return query_mem_desc;
713 }
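// Editorial note: the retry loop above downgrades a GPU sort when the output buffer
// plus the int32 index buffer would exceed 2 GiB; e.g. an illustrative 40-million-entry
// buffer with 64-byte rows (~2.4 GiB) flips must_use_baseline_sort to true and retries
// with sort_on_gpu_hint disabled.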
714 
715 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptorImpl(
716  const bool allow_multifrag,
717  const size_t max_groups_buffer_entry_count,
718  const int8_t crt_min_byte_width,
719  const bool sort_on_gpu_hint,
720  RenderInfo* render_info,
721  const bool must_use_baseline_sort,
722  const bool output_columnar_hint) {
723  const auto count_distinct_descriptors = init_count_distinct_descriptors(
724  ra_exe_unit_, query_infos_, device_type_, executor_);
725 
726  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
727 
728  auto col_range_info_nosharding = getColRangeInfo();
729 
730  const auto shard_count =
733  : 0;
734 
735  const auto col_range_info =
736  ColRangeInfo{col_range_info_nosharding.hash_type_,
737  col_range_info_nosharding.min,
738  col_range_info_nosharding.max,
739  getShardedTopBucket(col_range_info_nosharding, shard_count),
740  col_range_info_nosharding.has_nulls};
741 
742  // Non-grouped aggregates do not support accessing aggregated ranges
743  // Keyless hash is currently only supported with single-column perfect hash
744  const auto keyless_info =
745  !(is_group_by &&
746  col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash)
747  ? KeylessInfo{false, -1}
748  : get_keyless_info(ra_exe_unit_, query_infos_, is_group_by, executor_);
749 
750  if (g_enable_watchdog &&
751  ((col_range_info.hash_type_ == QueryDescriptionType::GroupByBaselineHash &&
752  max_groups_buffer_entry_count > g_watchdog_baseline_max_groups) ||
753  (col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
754  ra_exe_unit_.groupby_exprs.size() == 1 &&
755  (col_range_info.max - col_range_info.min) /
756  std::max(col_range_info.bucket, int64_t(1)) >
757  130000000))) {
758  throw WatchdogException("Query would use too much memory");
759  }
760  try {
761  return QueryMemoryDescriptor::init(executor_,
762  ra_exe_unit_,
763  query_infos_,
764  col_range_info,
765  keyless_info,
766  allow_multifrag,
767  device_type_,
768  crt_min_byte_width,
769  sort_on_gpu_hint,
770  shard_count,
771  max_groups_buffer_entry_count,
772  render_info,
773  count_distinct_descriptors,
774  must_use_baseline_sort,
775  output_columnar_hint,
776  /*streaming_top_n_hint=*/true);
777  } catch (const StreamingTopNOOM& e) {
778  LOG(WARNING) << e.what() << " Disabling Streaming Top N.";
779  return QueryMemoryDescriptor::init(executor_,
780  ra_exe_unit_,
781  query_infos_,
782  col_range_info,
783  keyless_info,
784  allow_multifrag,
785  device_type_,
786  crt_min_byte_width,
787  sort_on_gpu_hint,
788  shard_count,
789  max_groups_buffer_entry_count,
790  render_info,
791  count_distinct_descriptors,
792  must_use_baseline_sort,
793  output_columnar_hint,
794  /*streaming_top_n_hint=*/false);
795  }
796 }
797 
798 bool GroupByAndAggregate::gpuCanHandleOrderEntries(
799  const std::list<Analyzer::OrderEntry>& order_entries) {
800  if (order_entries.size() > 1) { // TODO(alex): lift this restriction
801  return false;
802  }
803  for (const auto& order_entry : order_entries) {
804  CHECK_GE(order_entry.tle_no, 1);
805  CHECK_LE(static_cast<size_t>(order_entry.tle_no), ra_exe_unit_.target_exprs.size());
806  const auto target_expr = ra_exe_unit_.target_exprs[order_entry.tle_no - 1];
807  if (!dynamic_cast<Analyzer::AggExpr*>(target_expr)) {
808  return false;
809  }
810  // TODO(alex): relax the restrictions
811  auto agg_expr = static_cast<Analyzer::AggExpr*>(target_expr);
812  if (agg_expr->get_is_distinct() || agg_expr->get_aggtype() == kAVG ||
813  agg_expr->get_aggtype() == kMIN || agg_expr->get_aggtype() == kMAX ||
814  agg_expr->get_aggtype() == kAPPROX_COUNT_DISTINCT) {
815  return false;
816  }
817  if (agg_expr->get_arg()) {
818  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
819  if (arg_ti.is_fp()) {
820  return false;
821  }
822  auto expr_range_info =
823  get_expr_range_info(ra_exe_unit_, query_infos_, agg_expr->get_arg(), executor_);
824  // TODO(adb): QMD not actually initialized here?
825  if ((!(expr_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
826  /* query_mem_desc.getGroupbyColCount() == 1 */ false) ||
827  expr_range_info.has_nulls) &&
828  order_entry.is_desc == order_entry.nulls_first) {
829  return false;
830  }
831  }
832  const auto& target_ti = target_expr->get_type_info();
833  CHECK(!target_ti.is_buffer());
834  if (!target_ti.is_integer()) {
835  return false;
836  }
837  }
838  return true;
839 }
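// Illustrative example (editorial): an ORDER BY on a single SUM or COUNT of an integer
// column, e.g.
//   SELECT x, COUNT(*) AS c FROM t GROUP BY x ORDER BY c DESC LIMIT 10;
// passes the checks above, while multiple order entries, DISTINCT/AVG/MIN/MAX
// aggregates, or floating-point arguments force the sort off the GPU.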
840 
841 bool GroupByAndAggregate::codegen(llvm::Value* filter_result,
842  llvm::BasicBlock* sc_false,
843  QueryMemoryDescriptor& query_mem_desc,
844  const CompilationOptions& co,
845  const GpuSharedMemoryContext& gpu_smem_context) {
846  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
847  CHECK(filter_result);
848 
849  bool can_return_error = false;
850  llvm::BasicBlock* filter_false{nullptr};
851 
852  {
853  const bool is_group_by = !ra_exe_unit_.groupby_exprs.empty();
854 
855  if (executor_->isArchMaxwell(co.device_type)) {
856  prependForceSync();
857  }
858  DiamondCodegen filter_cfg(filter_result,
859  executor_,
860  !is_group_by || query_mem_desc.usesGetGroupValueFast(),
861  "filter", // filter_true and filter_false basic blocks
862  nullptr,
863  false);
864  filter_false = filter_cfg.cond_false_;
865 
866  if (is_group_by) {
867  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection &&
868  !query_mem_desc.useStreamingTopN()) {
869  const auto crt_matched = get_arg_by_name(ROW_FUNC, "crt_matched");
870  LL_BUILDER.CreateStore(LL_INT(int32_t(1)), crt_matched);
871  auto total_matched_ptr = get_arg_by_name(ROW_FUNC, "total_matched");
872  llvm::Value* old_total_matched_val{nullptr};
873  if (query_mem_desc.threadsShareMemory()) {
874  old_total_matched_val =
875  LL_BUILDER.CreateAtomicRMW(llvm::AtomicRMWInst::Add,
876  total_matched_ptr,
877  LL_INT(int32_t(1)),
878 #if LLVM_VERSION_MAJOR > 12
879  LLVM_ALIGN(8),
880 #endif
881  llvm::AtomicOrdering::Monotonic);
882  } else {
883  old_total_matched_val = LL_BUILDER.CreateLoad(
884  total_matched_ptr->getType()->getPointerElementType(), total_matched_ptr);
885  LL_BUILDER.CreateStore(
886  LL_BUILDER.CreateAdd(old_total_matched_val, LL_INT(int32_t(1))),
887  total_matched_ptr);
888  }
889  auto old_total_matched_ptr = get_arg_by_name(ROW_FUNC, "old_total_matched");
890  LL_BUILDER.CreateStore(old_total_matched_val, old_total_matched_ptr);
891  }
892 
893  auto agg_out_ptr_w_idx = codegenGroupBy(query_mem_desc, co, filter_cfg);
894  auto varlen_output_buffer = codegenVarlenOutputBuffer(query_mem_desc);
895  if (query_mem_desc.usesGetGroupValueFast() ||
896  query_mem_desc.getQueryDescriptionType() ==
897  QueryDescriptionType::GroupByPerfectHash) {
898  if (query_mem_desc.getGroupbyColCount() > 1) {
899  filter_cfg.setChainToNext();
900  }
901  // Don't generate null checks if the group slot is guaranteed to be non-null,
902  // as is the case for the get_group_value_fast* family.
903  can_return_error = codegenAggCalls(agg_out_ptr_w_idx,
904  varlen_output_buffer,
905  {},
906  query_mem_desc,
907  co,
908  gpu_smem_context,
909  filter_cfg);
910  } else {
911  {
912  llvm::Value* nullcheck_cond{nullptr};
913  if (query_mem_desc.didOutputColumnar()) {
914  nullcheck_cond = LL_BUILDER.CreateICmpSGE(std::get<1>(agg_out_ptr_w_idx),
915  LL_INT(int32_t(0)));
916  } else {
917  nullcheck_cond = LL_BUILDER.CreateICmpNE(
918  std::get<0>(agg_out_ptr_w_idx),
919  llvm::ConstantPointerNull::get(
920  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)));
921  }
922  DiamondCodegen nullcheck_cfg(
923  nullcheck_cond, executor_, false, "groupby_nullcheck", &filter_cfg, false);
924  codegenAggCalls(agg_out_ptr_w_idx,
925  varlen_output_buffer,
926  {},
927  query_mem_desc,
928  co,
929  gpu_smem_context,
930  filter_cfg);
931  }
932  can_return_error = true;
933  if (query_mem_desc.getQueryDescriptionType() ==
934  QueryDescriptionType::Projection &&
935  query_mem_desc.useStreamingTopN()) {
936  // Ignore rejection on pushing current row to top-K heap.
937  LL_BUILDER.CreateRet(LL_INT(int32_t(0)));
938  } else {
939  CodeGenerator code_generator(executor_);
940  LL_BUILDER.CreateRet(LL_BUILDER.CreateNeg(LL_BUILDER.CreateTrunc(
941  // TODO(alex): remove the trunc once pos is converted to 32 bits
942  code_generator.posArg(nullptr),
943  get_int_type(32, LL_CONTEXT))));
944  }
945  }
946  } else {
947  if (ra_exe_unit_.estimator) {
948  std::stack<llvm::BasicBlock*> array_loops;
949  codegenEstimator(array_loops, filter_cfg, query_mem_desc, co);
950  } else {
951  auto arg_it = ROW_FUNC->arg_begin();
952  std::vector<llvm::Value*> agg_out_vec;
953  for (int32_t i = 0; i < get_agg_count(ra_exe_unit_.target_exprs); ++i) {
954  agg_out_vec.push_back(&*arg_it++);
955  }
956  can_return_error = codegenAggCalls(std::make_tuple(nullptr, nullptr),
957  /*varlen_output_buffer=*/nullptr,
958  agg_out_vec,
959  query_mem_desc,
960  co,
961  gpu_smem_context,
962  filter_cfg);
963  }
964  }
965  }
966 
967  if (ra_exe_unit_.join_quals.empty()) {
968  executor_->cgen_state_->ir_builder_.CreateRet(LL_INT(int32_t(0)));
969  } else if (sc_false) {
970  const auto saved_insert_block = LL_BUILDER.GetInsertBlock();
971  LL_BUILDER.SetInsertPoint(sc_false);
972  LL_BUILDER.CreateBr(filter_false);
973  LL_BUILDER.SetInsertPoint(saved_insert_block);
974  }
975 
976  return can_return_error;
977 }
978 
979 llvm::Value* GroupByAndAggregate::codegenOutputSlot(
980  llvm::Value* groups_buffer,
981  const QueryMemoryDescriptor& query_mem_desc,
982  const CompilationOptions& co,
983  DiamondCodegen& diamond_codegen) {
984  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
986  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
987  const auto group_expr = ra_exe_unit_.groupby_exprs.front();
988  CHECK(!group_expr);
989  if (!query_mem_desc.didOutputColumnar()) {
990  CHECK_EQ(size_t(0), query_mem_desc.getRowSize() % sizeof(int64_t));
991  }
992  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
993  ? 0
994  : query_mem_desc.getRowSize() / sizeof(int64_t);
995  CodeGenerator code_generator(executor_);
996  if (query_mem_desc.useStreamingTopN()) {
997  const auto& only_order_entry = ra_exe_unit_.sort_info.order_entries.front();
998  CHECK_GE(only_order_entry.tle_no, int(1));
999  const size_t target_idx = only_order_entry.tle_no - 1;
1000  CHECK_LT(target_idx, ra_exe_unit_.target_exprs.size());
1001  const auto order_entry_expr = ra_exe_unit_.target_exprs[target_idx];
1002  const auto chosen_bytes =
1003  static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(target_idx));
1004  auto order_entry_lv = executor_->cgen_state_->castToTypeIn(
1005  code_generator.codegen(order_entry_expr, true, co).front(), chosen_bytes * 8);
1007  std::string fname = "get_bin_from_k_heap";
1008  const auto& oe_ti = order_entry_expr->get_type_info();
1009  llvm::Value* null_key_lv = nullptr;
1010  if (oe_ti.is_integer() || oe_ti.is_decimal() || oe_ti.is_time()) {
1011  const size_t bit_width = order_entry_lv->getType()->getIntegerBitWidth();
1012  switch (bit_width) {
1013  case 32:
1014  null_key_lv = LL_INT(static_cast<int32_t>(inline_int_null_val(oe_ti)));
1015  break;
1016  case 64:
1017  null_key_lv = LL_INT(static_cast<int64_t>(inline_int_null_val(oe_ti)));
1018  break;
1019  default:
1020  CHECK(false);
1021  }
1022  fname += "_int" + std::to_string(bit_width) + "_t";
1023  } else {
1024  CHECK(oe_ti.is_fp());
1025  if (order_entry_lv->getType()->isDoubleTy()) {
1026  null_key_lv = LL_FP(static_cast<double>(inline_fp_null_val(oe_ti)));
1027  } else {
1028  null_key_lv = LL_FP(static_cast<float>(inline_fp_null_val(oe_ti)));
1029  }
1030  fname += order_entry_lv->getType()->isDoubleTy() ? "_double" : "_float";
1031  }
1032  const auto key_slot_idx =
1034  return emitCall(
1035  fname,
1036  {groups_buffer,
1037  LL_INT(n),
1038  LL_INT(row_size_quad),
1039  LL_INT(static_cast<uint32_t>(query_mem_desc.getColOffInBytes(key_slot_idx))),
1040  LL_BOOL(only_order_entry.is_desc),
1041  LL_BOOL(!order_entry_expr->get_type_info().get_notnull()),
1042  LL_BOOL(only_order_entry.nulls_first),
1043  null_key_lv,
1044  order_entry_lv});
1045  } else {
1046  auto* arg = get_arg_by_name(ROW_FUNC, "max_matched");
1047  const auto output_buffer_entry_count_lv =
1048  LL_BUILDER.CreateLoad(arg->getType()->getPointerElementType(), arg);
1049  arg = get_arg_by_name(ROW_FUNC, "old_total_matched");
1050  const auto group_expr_lv =
1051  LL_BUILDER.CreateLoad(arg->getType()->getPointerElementType(), arg);
1052  std::vector<llvm::Value*> args{groups_buffer,
1053  output_buffer_entry_count_lv,
1054  group_expr_lv,
1055  code_generator.posArg(nullptr)};
1056  if (query_mem_desc.didOutputColumnar()) {
1057  const auto columnar_output_offset =
1058  emitCall("get_columnar_scan_output_offset", args);
1059  return columnar_output_offset;
1060  }
1061  args.push_back(LL_INT(row_size_quad));
1062  return emitCall("get_scan_output_slot", args);
1063  }
1064 }
1065 
1066 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenGroupBy(
1068  const CompilationOptions& co,
1069  DiamondCodegen& diamond_codegen) {
1070  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1071  auto arg_it = ROW_FUNC->arg_begin();
1072  auto groups_buffer = arg_it++;
1073 
1074  std::stack<llvm::BasicBlock*> array_loops;
1075 
1076  // TODO(Saman): move this logic outside of this function.
1077  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1078  if (query_mem_desc.didOutputColumnar()) {
1079  return std::make_tuple(
1080  &*groups_buffer,
1081  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen));
1082  } else {
1083  return std::make_tuple(
1084  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen),
1085  nullptr);
1086  }
1087  }
1088 
1089  CHECK(query_mem_desc.getQueryDescriptionType() ==
1090  QueryDescriptionType::GroupByPerfectHash ||
1091  query_mem_desc.getQueryDescriptionType() ==
1092  QueryDescriptionType::GroupByBaselineHash);
1093 
1094  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1095  ? 0
1096  : query_mem_desc.getRowSize() / sizeof(int64_t);
1097 
1098  const auto col_width_size = query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1099  ? sizeof(int64_t)
1100  : query_mem_desc.getEffectiveKeyWidth();
1101  // for multi-column group by
1102  llvm::Value* group_key = nullptr;
1103  llvm::Value* key_size_lv = nullptr;
1104 
1105  if (!query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1106  key_size_lv = LL_INT(static_cast<int32_t>(query_mem_desc.getGroupbyColCount()));
1107  if (query_mem_desc.getQueryDescriptionType() ==
1108  QueryDescriptionType::GroupByPerfectHash) {
1109  group_key =
1110  LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1111  } else if (query_mem_desc.getQueryDescriptionType() ==
1112  QueryDescriptionType::GroupByBaselineHash) {
1113  group_key =
1114  col_width_size == sizeof(int32_t)
1115  ? LL_BUILDER.CreateAlloca(llvm::Type::getInt32Ty(LL_CONTEXT), key_size_lv)
1116  : LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1117  }
1118  CHECK(group_key);
1119  CHECK(key_size_lv);
1120  }
1121 
1122  int32_t subkey_idx = 0;
1123  CHECK(query_mem_desc.getGroupbyColCount() == ra_exe_unit_.groupby_exprs.size());
1124  for (const auto& group_expr : ra_exe_unit_.groupby_exprs) {
1125  const auto col_range_info =
1126  get_expr_range_info(ra_exe_unit_, query_infos_, group_expr.get(), executor_);
1127  const auto translated_null_value = static_cast<int64_t>(
1128  query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1129  ? checked_int64_t(query_mem_desc.getMaxVal()) +
1130  (query_mem_desc.getBucket() ? query_mem_desc.getBucket() : 1)
1131  : checked_int64_t(col_range_info.max) +
1132  (col_range_info.bucket ? col_range_info.bucket : 1));
1133 
1134  const bool col_has_nulls =
1135  query_mem_desc.getQueryDescriptionType() ==
1136  QueryDescriptionType::GroupByPerfectHash
1137  ? (query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1138  ? query_mem_desc.hasNulls()
1139  : col_range_info.has_nulls)
1140  : false;
1141 
1142  const auto group_expr_lvs =
1143  executor_->groupByColumnCodegen(group_expr.get(),
1144  col_width_size,
1145  co,
1146  col_has_nulls,
1147  translated_null_value,
1148  diamond_codegen,
1149  array_loops,
1150  query_mem_desc.threadsShareMemory());
1151  const auto group_expr_lv = group_expr_lvs.translated_value;
1152  if (query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1153  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1154  return codegenSingleColumnPerfectHash(query_mem_desc,
1155  co,
1156  &*groups_buffer,
1157  group_expr_lv,
1158  group_expr_lvs.original_value,
1159  row_size_quad);
1160  } else {
1161  // store the sub-key to the buffer
1162  LL_BUILDER.CreateStore(
1163  group_expr_lv,
1164  LL_BUILDER.CreateGEP(
1165  group_key->getType()->getScalarType()->getPointerElementType(),
1166  group_key,
1167  LL_INT(subkey_idx++)));
1168  }
1169  }
1170  if (query_mem_desc.getQueryDescriptionType() ==
1171  QueryDescriptionType::GroupByPerfectHash) {
1172  CHECK(ra_exe_unit_.groupby_exprs.size() != 1);
1173  return codegenMultiColumnPerfectHash(
1174  &*groups_buffer, group_key, key_size_lv, query_mem_desc, row_size_quad);
1175  } else if (query_mem_desc.getQueryDescriptionType() ==
1176  QueryDescriptionType::GroupByBaselineHash) {
1177  return codegenMultiColumnBaselineHash(co,
1178  &*groups_buffer,
1179  group_key,
1180  key_size_lv,
1181  query_mem_desc,
1182  col_width_size,
1183  row_size_quad);
1184  }
1185  CHECK(false);
1186  return std::make_tuple(nullptr, nullptr);
1187 }
1188 
1189 llvm::Value* GroupByAndAggregate::codegenVarlenOutputBuffer(
1190  const QueryMemoryDescriptor& query_mem_desc) {
1191  if (!query_mem_desc.hasVarlenOutput()) {
1192  return nullptr;
1193  }
1194 
1195  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1196  auto arg_it = ROW_FUNC->arg_begin();
1197  arg_it++; /* groups_buffer */
1198  auto varlen_output_buffer = arg_it++;
1199  CHECK(varlen_output_buffer->getType() == llvm::Type::getInt64PtrTy(LL_CONTEXT));
1200  return varlen_output_buffer;
1201 }
1202 
1203 std::tuple<llvm::Value*, llvm::Value*>
1204 GroupByAndAggregate::codegenSingleColumnPerfectHash(
1205  const QueryMemoryDescriptor& query_mem_desc,
1206  const CompilationOptions& co,
1207  llvm::Value* groups_buffer,
1208  llvm::Value* group_expr_lv_translated,
1209  llvm::Value* group_expr_lv_original,
1210  const int32_t row_size_quad) {
1211  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1212  CHECK(query_mem_desc.usesGetGroupValueFast());
1213  std::string get_group_fn_name{query_mem_desc.didOutputColumnar()
1214  ? "get_columnar_group_bin_offset"
1215  : "get_group_value_fast"};
1216  if (!query_mem_desc.didOutputColumnar() && query_mem_desc.hasKeylessHash()) {
1217  get_group_fn_name += "_keyless";
1218  }
1219  if (query_mem_desc.interleavedBins(co.device_type)) {
1220  CHECK(!query_mem_desc.didOutputColumnar());
1221  CHECK(query_mem_desc.hasKeylessHash());
1222  get_group_fn_name += "_semiprivate";
1223  }
1224  std::vector<llvm::Value*> get_group_fn_args{&*groups_buffer,
1225  &*group_expr_lv_translated};
1226  if (group_expr_lv_original && get_group_fn_name == "get_group_value_fast" &&
1227  query_mem_desc.mustUseBaselineSort()) {
1228  get_group_fn_name += "_with_original_key";
1229  get_group_fn_args.push_back(group_expr_lv_original);
1230  }
1231  get_group_fn_args.push_back(LL_INT(query_mem_desc.getMinVal()));
1232  get_group_fn_args.push_back(LL_INT(query_mem_desc.getBucket()));
1233  if (!query_mem_desc.hasKeylessHash()) {
1234  if (!query_mem_desc.didOutputColumnar()) {
1235  get_group_fn_args.push_back(LL_INT(row_size_quad));
1236  }
1237  } else {
1238  if (!query_mem_desc.didOutputColumnar()) {
1239  get_group_fn_args.push_back(LL_INT(row_size_quad));
1240  }
1241  if (query_mem_desc.interleavedBins(co.device_type)) {
1242  auto warp_idx = emitCall("thread_warp_idx", {LL_INT(executor_->warpSize())});
1243  get_group_fn_args.push_back(warp_idx);
1244  get_group_fn_args.push_back(LL_INT(executor_->warpSize()));
1245  }
1246  }
1247  if (get_group_fn_name == "get_columnar_group_bin_offset") {
1248  return std::make_tuple(&*groups_buffer,
1249  emitCall(get_group_fn_name, get_group_fn_args));
1250  }
1251  return std::make_tuple(emitCall(get_group_fn_name, get_group_fn_args), nullptr);
1252 }
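// Editorial sketch (not from the original source): conceptually, the runtime function
// emitted above resolves the output slot as
//   groups_buffer + ((key - min_val) / bucket) * row_size_quad
// (plus a warp-local offset in the interleaved, "semiprivate" variant), which is why a
// null check on the returned pointer is unnecessary for this family of calls.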
1253 
1254 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenMultiColumnPerfectHash(
1255  llvm::Value* groups_buffer,
1256  llvm::Value* group_key,
1257  llvm::Value* key_size_lv,
1258  const QueryMemoryDescriptor& query_mem_desc,
1259  const int32_t row_size_quad) {
1260  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1261  CHECK(query_mem_desc.getQueryDescriptionType() ==
1262  QueryDescriptionType::GroupByPerfectHash);
1263  // compute the index (perfect hash)
1264  auto perfect_hash_func = codegenPerfectHashFunction();
1265  auto hash_lv =
1266  LL_BUILDER.CreateCall(perfect_hash_func, std::vector<llvm::Value*>{group_key});
1267 
1268  if (query_mem_desc.didOutputColumnar()) {
1269  if (!query_mem_desc.hasKeylessHash()) {
1270  const std::string set_matching_func_name{
1271  "set_matching_group_value_perfect_hash_columnar"};
1272  const std::vector<llvm::Value*> set_matching_func_arg{
1273  groups_buffer,
1274  hash_lv,
1275  group_key,
1276  key_size_lv,
1277  llvm::ConstantInt::get(get_int_type(32, LL_CONTEXT),
1278  query_mem_desc.getEntryCount())};
1279  emitCall(set_matching_func_name, set_matching_func_arg);
1280  }
1281  return std::make_tuple(groups_buffer, hash_lv);
1282  } else {
1283  if (query_mem_desc.hasKeylessHash()) {
1284  return std::make_tuple(emitCall("get_matching_group_value_perfect_hash_keyless",
1285  {groups_buffer, hash_lv, LL_INT(row_size_quad)}),
1286  nullptr);
1287  } else {
1288  return std::make_tuple(
1289  emitCall(
1290  "get_matching_group_value_perfect_hash",
1291  {groups_buffer, hash_lv, group_key, key_size_lv, LL_INT(row_size_quad)}),
1292  nullptr);
1293  }
1294  }
1295 }
1296 
1297 std::tuple<llvm::Value*, llvm::Value*>
1298 GroupByAndAggregate::codegenMultiColumnBaselineHash(
1299  const CompilationOptions& co,
1300  llvm::Value* groups_buffer,
1301  llvm::Value* group_key,
1302  llvm::Value* key_size_lv,
1303  const QueryMemoryDescriptor& query_mem_desc,
1304  const size_t key_width,
1305  const int32_t row_size_quad) {
1306  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1307  if (group_key->getType() != llvm::Type::getInt64PtrTy(LL_CONTEXT)) {
1308  CHECK(key_width == sizeof(int32_t));
1309  group_key =
1310  LL_BUILDER.CreatePointerCast(group_key, llvm::Type::getInt64PtrTy(LL_CONTEXT));
1311  }
1312  std::vector<llvm::Value*> func_args{
1313  groups_buffer,
1314  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1315  &*group_key,
1316  &*key_size_lv,
1317  LL_INT(static_cast<int32_t>(key_width))};
1318  std::string func_name{"get_group_value"};
1319  if (query_mem_desc.didOutputColumnar()) {
1320  func_name += "_columnar_slot";
1321  } else {
1322  func_args.push_back(LL_INT(row_size_quad));
1323  }
1324  if (co.with_dynamic_watchdog) {
1325  func_name += "_with_watchdog";
1326  }
1327  if (query_mem_desc.didOutputColumnar()) {
1328  return std::make_tuple(groups_buffer, emitCall(func_name, func_args));
1329  } else {
1330  return std::make_tuple(emitCall(func_name, func_args), nullptr);
1331  }
1332 }
1333 
1334 llvm::Function* GroupByAndAggregate::codegenPerfectHashFunction() {
1335  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1336  CHECK_GT(ra_exe_unit_.groupby_exprs.size(), size_t(1));
1337  auto ft = llvm::FunctionType::get(
1338  get_int_type(32, LL_CONTEXT),
1339  std::vector<llvm::Type*>{llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)},
1340  false);
1341  auto key_hash_func = llvm::Function::Create(ft,
1342  llvm::Function::ExternalLinkage,
1343  "perfect_key_hash",
1344  executor_->cgen_state_->module_);
1345  executor_->cgen_state_->helper_functions_.push_back(key_hash_func);
1346  mark_function_always_inline(key_hash_func);
1347  auto& key_buff_arg = *key_hash_func->args().begin();
1348  llvm::Value* key_buff_lv = &key_buff_arg;
1349  auto bb = llvm::BasicBlock::Create(LL_CONTEXT, "entry", key_hash_func);
1350  llvm::IRBuilder<> key_hash_func_builder(bb);
1351  llvm::Value* hash_lv{llvm::ConstantInt::get(get_int_type(64, LL_CONTEXT), 0)};
1352  std::vector<int64_t> cardinalities;
1353  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1354  auto col_range_info =
1355  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1356  CHECK(col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash);
1357  cardinalities.push_back(getBucketedCardinality(col_range_info));
1358  }
1359  size_t dim_idx = 0;
1360  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1361  auto* gep = key_hash_func_builder.CreateGEP(
1362  key_buff_lv->getType()->getScalarType()->getPointerElementType(),
1363  key_buff_lv,
1364  LL_INT(dim_idx));
1365  auto key_comp_lv =
1366  key_hash_func_builder.CreateLoad(gep->getType()->getPointerElementType(), gep);
1367  auto col_range_info =
1368  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1369  auto crt_term_lv =
1370  key_hash_func_builder.CreateSub(key_comp_lv, LL_INT(col_range_info.min));
1371  if (col_range_info.bucket) {
1372  crt_term_lv =
1373  key_hash_func_builder.CreateSDiv(crt_term_lv, LL_INT(col_range_info.bucket));
1374  }
1375  for (size_t prev_dim_idx = 0; prev_dim_idx < dim_idx; ++prev_dim_idx) {
1376  crt_term_lv = key_hash_func_builder.CreateMul(crt_term_lv,
1377  LL_INT(cardinalities[prev_dim_idx]));
1378  }
1379  hash_lv = key_hash_func_builder.CreateAdd(hash_lv, crt_term_lv);
1380  ++dim_idx;
1381  }
1382  key_hash_func_builder.CreateRet(
1383  key_hash_func_builder.CreateTrunc(hash_lv, get_int_type(32, LL_CONTEXT)));
1384  return key_hash_func;
1385 }
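// Worked example (editorial): for two group-by columns with illustrative ranges
// a in [0, 10) and b in [0, 5), the generated perfect_key_hash computes
//   hash = (a - 0) + (b - 0) * 10
// i.e. each key component is shifted by its minimum, divided by its bucket if any, and
// scaled by the product of the cardinalities of the preceding components.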
1386 
1387 llvm::Value* GroupByAndAggregate::convertNullIfAny(const SQLTypeInfo& arg_type,
1388  const TargetInfo& agg_info,
1389  llvm::Value* target) {
1390  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1391  const auto& agg_type = agg_info.sql_type;
1392  const size_t chosen_bytes = agg_type.get_size();
1393 
1394  bool need_conversion{false};
1395  llvm::Value* arg_null{nullptr};
1396  llvm::Value* agg_null{nullptr};
1397  llvm::Value* target_to_cast{target};
1398  if (arg_type.is_fp()) {
1399  arg_null = executor_->cgen_state_->inlineFpNull(arg_type);
1400  if (agg_type.is_fp()) {
1401  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1402  if (!static_cast<llvm::ConstantFP*>(arg_null)->isExactlyValue(
1403  static_cast<llvm::ConstantFP*>(agg_null)->getValueAPF())) {
1404  need_conversion = true;
1405  }
1406  } else {
1407  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
1408  return target;
1409  }
1410  } else {
1411  arg_null = executor_->cgen_state_->inlineIntNull(arg_type);
1412  if (agg_type.is_fp()) {
1413  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1414  need_conversion = true;
1415  target_to_cast = executor_->castToFP(target, arg_type, agg_type);
1416  } else {
1417  agg_null = executor_->cgen_state_->inlineIntNull(agg_type);
1418  if ((static_cast<llvm::ConstantInt*>(arg_null)->getBitWidth() !=
1419  static_cast<llvm::ConstantInt*>(agg_null)->getBitWidth()) ||
1420  (static_cast<llvm::ConstantInt*>(arg_null)->getValue() !=
1421  static_cast<llvm::ConstantInt*>(agg_null)->getValue())) {
1422  need_conversion = true;
1423  }
1424  }
1425  }
1426  if (need_conversion) {
1427  auto cmp = arg_type.is_fp() ? LL_BUILDER.CreateFCmpOEQ(target, arg_null)
1428  : LL_BUILDER.CreateICmpEQ(target, arg_null);
1429  return LL_BUILDER.CreateSelect(
1430  cmp,
1431  agg_null,
1432  executor_->cgen_state_->castToTypeIn(target_to_cast, chosen_bytes << 3));
1433  } else {
1434  return target;
1435  }
1436 }
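// Illustrative example (editorial): when a SMALLINT argument feeds a 64-bit SUM slot,
// the 16-bit null sentinel differs from the 64-bit one, so the select emitted above
// maps the incoming sentinel to the aggregate's own null before widening; if the two
// sentinels already match, the original target value is returned unchanged.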
1437 
1438 llvm::Value* GroupByAndAggregate::codegenWindowRowPointer(
1439  const Analyzer::WindowFunction* window_func,
1440  const QueryMemoryDescriptor& query_mem_desc,
1441  const CompilationOptions& co,
1442  DiamondCodegen& diamond_codegen) {
1443  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1444  const auto window_func_context =
1445  WindowProjectNodeContext::getActiveWindowFunctionContext(executor_);
1446  if (window_func_context && window_function_is_aggregate(window_func->getKind())) {
1447  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1448  ? 0
1449  : query_mem_desc.getRowSize() / sizeof(int64_t);
1450  auto arg_it = ROW_FUNC->arg_begin();
1451  auto groups_buffer = arg_it++;
1452  CodeGenerator code_generator(executor_);
1453  auto window_pos_lv = code_generator.codegenWindowPosition(
1454  window_func_context, code_generator.posArg(nullptr));
1455  const auto pos_in_window =
1456  LL_BUILDER.CreateTrunc(window_pos_lv, get_int_type(32, LL_CONTEXT));
1457  llvm::Value* entry_count_lv =
1458  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount()));
1459  std::vector<llvm::Value*> args{
1460  &*groups_buffer, entry_count_lv, pos_in_window, code_generator.posArg(nullptr)};
1461  if (query_mem_desc.didOutputColumnar()) {
1462  const auto columnar_output_offset =
1463  emitCall("get_columnar_scan_output_offset", args);
1464  return LL_BUILDER.CreateSExt(columnar_output_offset, get_int_type(64, LL_CONTEXT));
1465  }
1466  args.push_back(LL_INT(row_size_quad));
1467  return emitCall("get_scan_output_slot", args);
1468  }
1469  auto arg_it = ROW_FUNC->arg_begin();
1470  auto groups_buffer = arg_it++;
1471  return codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen);
1472 }
1473 
1474 bool GroupByAndAggregate::codegenAggCalls(
1475  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx_in,
1476  llvm::Value* varlen_output_buffer,
1477  const std::vector<llvm::Value*>& agg_out_vec,
1478  QueryMemoryDescriptor& query_mem_desc,
1479  const CompilationOptions& co,
1480  const GpuSharedMemoryContext& gpu_smem_context,
1481  DiamondCodegen& diamond_codegen) {
1482  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1483  auto agg_out_ptr_w_idx = agg_out_ptr_w_idx_in;
1484  // TODO(alex): unify the two cases, the output for non-group by queries
1485  // should be a contiguous buffer
1486  const bool is_group_by = std::get<0>(agg_out_ptr_w_idx);
1487  bool can_return_error = false;
1488  if (is_group_by) {
1489  CHECK(agg_out_vec.empty());
1490  } else {
1491  CHECK(!agg_out_vec.empty());
1492  }
1493 
1494  // the output buffer is cast into a byte stream to be able to handle data elements of
1495  // different sizes (only used when actual column width sizes are used)
1496  llvm::Value* output_buffer_byte_stream{nullptr};
1497  llvm::Value* out_row_idx{nullptr};
1498  if (query_mem_desc.didOutputColumnar() && !g_cluster &&
1499  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1500  output_buffer_byte_stream = LL_BUILDER.CreateBitCast(
1501  std::get<0>(agg_out_ptr_w_idx),
1502  llvm::PointerType::get(llvm::Type::getInt8Ty(LL_CONTEXT), 0));
1503  output_buffer_byte_stream->setName("out_buff_b_stream");
1504  CHECK(std::get<1>(agg_out_ptr_w_idx));
1505  out_row_idx = LL_BUILDER.CreateZExt(std::get<1>(agg_out_ptr_w_idx),
1506  llvm::Type::getInt64Ty(LL_CONTEXT));
1507  out_row_idx->setName("out_row_idx");
1508  }
1509 
1510  TargetExprCodegenBuilder target_builder(ra_exe_unit_, is_group_by);
1511  for (size_t target_idx = 0; target_idx < ra_exe_unit_.target_exprs.size();
1512  ++target_idx) {
1513  auto target_expr = ra_exe_unit_.target_exprs[target_idx];
1514  CHECK(target_expr);
1515 
1516  target_builder(target_expr, executor_, query_mem_desc, co);
1517  }
1518 
1519  target_builder.codegen(this,
1520  executor_,
1521  query_mem_desc,
1522  co,
1523  gpu_smem_context,
1524  agg_out_ptr_w_idx,
1525  agg_out_vec,
1526  output_buffer_byte_stream,
1527  out_row_idx,
1528  varlen_output_buffer,
1529  diamond_codegen);
1530 
1531  for (auto target_expr : ra_exe_unit_.target_exprs) {
1532  CHECK(target_expr);
1533  executor_->plan_state_->isLazyFetchColumn(target_expr);
1534  }
1535 
1536  return can_return_error;
1537 }
1538 
1542 llvm::Value* GroupByAndAggregate::codegenAggColumnPtr(
1543  llvm::Value* output_buffer_byte_stream,
1544  llvm::Value* out_row_idx,
1545  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx,
1546  const QueryMemoryDescriptor& query_mem_desc,
1547  const size_t chosen_bytes,
1548  const size_t agg_out_off,
1549  const size_t target_idx) {
1550  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1551  llvm::Value* agg_col_ptr{nullptr};
1552  if (query_mem_desc.didOutputColumnar()) {
1553  // TODO(Saman): remove the second columnar branch, and support all query description
1554  // types through the first branch. Then, input arguments should also be cleaned up
1555  if (!g_cluster &&
1556  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1557  CHECK(chosen_bytes == 1 || chosen_bytes == 2 || chosen_bytes == 4 ||
1558  chosen_bytes == 8);
1559  CHECK(output_buffer_byte_stream);
1560  CHECK(out_row_idx);
1561  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1562  // multiplying by chosen_bytes, i.e., << log2(chosen_bytes)
1563  auto out_per_col_byte_idx =
1564 #ifdef _WIN32
1565  LL_BUILDER.CreateShl(out_row_idx, __lzcnt(chosen_bytes) - 1);
1566 #else
1567  LL_BUILDER.CreateShl(out_row_idx, __builtin_ffs(chosen_bytes) - 1);
1568 #endif
1569  auto byte_offset = LL_BUILDER.CreateAdd(out_per_col_byte_idx,
1570  LL_INT(static_cast<int64_t>(col_off)));
1571  byte_offset->setName("out_byte_off_target_" + std::to_string(target_idx));
1572  auto output_ptr = LL_BUILDER.CreateGEP(
1573  output_buffer_byte_stream->getType()->getScalarType()->getPointerElementType(),
1574  output_buffer_byte_stream,
1575  byte_offset);
1576  agg_col_ptr = LL_BUILDER.CreateBitCast(
1577  output_ptr,
1578  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1579  agg_col_ptr->setName("out_ptr_target_" + std::to_string(target_idx));
1580  } else {
1581  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1582  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1583  col_off /= chosen_bytes;
1584  CHECK(std::get<1>(agg_out_ptr_w_idx));
1585  auto offset = LL_BUILDER.CreateAdd(std::get<1>(agg_out_ptr_w_idx), LL_INT(col_off));
1586  auto* bit_cast = LL_BUILDER.CreateBitCast(
1587  std::get<0>(agg_out_ptr_w_idx),
1588  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1589  agg_col_ptr = LL_BUILDER.CreateGEP(
1590  bit_cast->getType()->getScalarType()->getPointerElementType(),
1591  bit_cast,
1592  offset);
1593  }
1594  } else {
1595  uint32_t col_off = query_mem_desc.getColOnlyOffInBytes(agg_out_off);
1596  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1597  col_off /= chosen_bytes;
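    // Row-wise layout: std::get<0>(agg_out_ptr_w_idx) already points at this row's
    // slots, so reinterpret it as a chosen_bytes-wide integer pointer and index it
    // by the column offset expressed in chosen_bytes-wide units.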
1598  auto* bit_cast = LL_BUILDER.CreateBitCast(
1599  std::get<0>(agg_out_ptr_w_idx),
1600  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1601  agg_col_ptr = LL_BUILDER.CreateGEP(
1602  bit_cast->getType()->getScalarType()->getPointerElementType(),
1603  bit_cast,
1604  LL_INT(col_off));
1605  }
1606  CHECK(agg_col_ptr);
1607  return agg_col_ptr;
1608 }
1609 
1610 void GroupByAndAggregate::codegenEstimator(std::stack<llvm::BasicBlock*>& array_loops,
1611  DiamondCodegen& diamond_codegen,
1612  const QueryMemoryDescriptor& query_mem_desc,
1613  const CompilationOptions& co) {
1614  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1615  const auto& estimator_arg = ra_exe_unit_.estimator->getArgument();
1616  auto estimator_comp_count_lv = LL_INT(static_cast<int32_t>(estimator_arg.size()));
1617  auto estimator_key_lv = LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT),
1618  estimator_comp_count_lv);
1619  int32_t subkey_idx = 0;
1620  for (const auto& estimator_arg_comp : estimator_arg) {
1621  const auto estimator_arg_comp_lvs =
1622  executor_->groupByColumnCodegen(estimator_arg_comp.get(),
1623  query_mem_desc.getEffectiveKeyWidth(),
1624  co,
1625  false,
1626  0,
1627  diamond_codegen,
1628  array_loops,
1629  true);
1630  CHECK(!estimator_arg_comp_lvs.original_value);
1631  const auto estimator_arg_comp_lv = estimator_arg_comp_lvs.translated_value;
1632  // store the sub-key to the buffer
1633  LL_BUILDER.CreateStore(
1634  estimator_arg_comp_lv,
1635  LL_BUILDER.CreateGEP(
1636  estimator_key_lv->getType()->getScalarType()->getPointerElementType(),
1637  estimator_key_lv,
1638  LL_INT(subkey_idx++)));
1639  }
1640  const auto int8_ptr_ty = llvm::PointerType::get(get_int_type(8, LL_CONTEXT), 0);
1641  const auto bitmap = LL_BUILDER.CreateBitCast(&*ROW_FUNC->arg_begin(), int8_ptr_ty);
1642  const auto key_bytes = LL_BUILDER.CreateBitCast(estimator_key_lv, int8_ptr_ty);
1643  const auto estimator_comp_bytes_lv =
1644  LL_INT(static_cast<int32_t>(estimator_arg.size() * sizeof(int64_t)));
1645  const auto bitmap_size_lv =
1646  LL_INT(static_cast<uint32_t>(ra_exe_unit_.estimator->getBufferSize()));
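  // The call below updates the estimator state: the packed int64 sub-keys in
  // key_bytes are hashed and the corresponding position in the estimator's bitmap
  // (the first row-function argument) is set. The exact runtime function comes from
  // getRuntimeFunctionName(), so no particular name is assumed here.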
1647  emitCall(ra_exe_unit_.estimator->getRuntimeFunctionName(),
1648  {bitmap, &*bitmap_size_lv, key_bytes, &*estimator_comp_bytes_lv});
1649 }
1650 
1651 extern "C" RUNTIME_EXPORT void agg_count_distinct(int64_t* agg, const int64_t val) {
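  // *agg holds a pointer to a CountDistinctSet (a hash set of int64 values); this
  // path backs count-distinct targets that use the set-based (non-bitmap)
  // implementation and runs on CPU only.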
1652  reinterpret_cast<CountDistinctSet*>(*agg)->insert(val);
1653 }
1654 
1655 extern "C" RUNTIME_EXPORT void agg_count_distinct_skip_val(int64_t* agg,
1656  const int64_t val,
1657  const int64_t skip_val) {
1658  if (val != skip_val) {
1659  agg_count_distinct(agg, val);
1660  }
1661 }
1662 
1663 extern "C" RUNTIME_EXPORT void agg_approx_quantile(int64_t* agg, const double val) {
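  // *agg holds a pointer to a quantile::TDigest; allocate() makes sure the digest's
  // buffers are set up (lazily) before the value is accumulated.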
1664  auto* t_digest = reinterpret_cast<quantile::TDigest*>(*agg);
1665  t_digest->allocate();
1666  t_digest->add(val);
1667 }
1668 
1669 void GroupByAndAggregate::codegenCountDistinct(
1670  const size_t target_idx,
1671  const Analyzer::Expr* target_expr,
1672  std::vector<llvm::Value*>& agg_args,
1673  const QueryMemoryDescriptor& query_mem_desc,
1674  const ExecutorDeviceType device_type) {
1675  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1676  const auto agg_info = get_target_info(target_expr, g_bigint_count);
1677  const auto& arg_ti =
1678  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1679  if (arg_ti.is_fp()) {
1680  agg_args.back() = executor_->cgen_state_->ir_builder_.CreateBitCast(
1681  agg_args.back(), get_int_type(64, executor_->cgen_state_->context_));
1682  }
1683  const auto& count_distinct_descriptor =
1684  query_mem_desc.getCountDistinctDescriptor(target_idx);
1685  CHECK(count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid);
1686  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
1687  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1688  agg_args.push_back(LL_INT(int32_t(count_distinct_descriptor.bitmap_sz_bits)));
1689  if (device_type == ExecutorDeviceType::GPU) {
1690  const auto base_dev_addr = getAdditionalLiteral(-1);
1691  const auto base_host_addr = getAdditionalLiteral(-2);
1692  agg_args.push_back(base_dev_addr);
1693  agg_args.push_back(base_host_addr);
1694  emitCall("agg_approximate_count_distinct_gpu", agg_args);
1695  } else {
1696  emitCall("agg_approximate_count_distinct", agg_args);
1697  }
1698  return;
1699  }
1700  std::string agg_fname{"agg_count_distinct"};
1701  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1702  agg_fname += "_bitmap";
1703  agg_args.push_back(LL_INT(static_cast<int64_t>(count_distinct_descriptor.min_val)));
1704  }
1705  if (agg_info.skip_null_val) {
1706  auto null_lv = executor_->cgen_state_->castToTypeIn(
1707  (arg_ti.is_fp()
1708  ? static_cast<llvm::Value*>(executor_->cgen_state_->inlineFpNull(arg_ti))
1709  : static_cast<llvm::Value*>(executor_->cgen_state_->inlineIntNull(arg_ti))),
1710  64);
1711  null_lv = executor_->cgen_state_->ir_builder_.CreateBitCast(
1712  null_lv, get_int_type(64, executor_->cgen_state_->context_));
1713  agg_fname += "_skip_val";
1714  agg_args.push_back(null_lv);
1715  }
1716  if (device_type == ExecutorDeviceType::GPU) {
1717  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1718  agg_fname += "_gpu";
1719  const auto base_dev_addr = getAdditionalLiteral(-1);
1720  const auto base_host_addr = getAdditionalLiteral(-2);
1721  agg_args.push_back(base_dev_addr);
1722  agg_args.push_back(base_host_addr);
1723  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.sub_bitmap_count)));
1724  CHECK_EQ(size_t(0),
1725  count_distinct_descriptor.bitmapPaddedSizeBytes() %
1726  count_distinct_descriptor.sub_bitmap_count);
1727  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.bitmapPaddedSizeBytes() /
1728  count_distinct_descriptor.sub_bitmap_count)));
1729  }
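  // The suffixes appended above compose the runtime function name, e.g.
  // agg_count_distinct_bitmap, agg_count_distinct_bitmap_skip_val, or
  // agg_count_distinct_bitmap_skip_val_gpu; the non-bitmap variants defined in this
  // file (agg_count_distinct / agg_count_distinct_skip_val) are reached through the
  // emitExternalCall branch below.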
1730  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1731  emitCall(agg_fname, agg_args);
1732  } else {
1733  executor_->cgen_state_->emitExternalCall(
1734  agg_fname, llvm::Type::getVoidTy(LL_CONTEXT), agg_args);
1735  }
1736 }
1737 
1738 void GroupByAndAggregate::codegenApproxQuantile(
1739  const size_t target_idx,
1740  const Analyzer::Expr* target_expr,
1741  std::vector<llvm::Value*>& agg_args,
1742  const QueryMemoryDescriptor& query_mem_desc,
1743  const ExecutorDeviceType device_type) {
1744  if (device_type == ExecutorDeviceType::GPU) {
1745  throw QueryMustRunOnCpu();
1746  }
1747  llvm::BasicBlock *calc, *skip;
1748  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1749  auto const arg_ti =
1750  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1751  bool const nullable = !arg_ti.get_notnull();
1752 
1753  auto* cs = executor_->cgen_state_.get();
1754  auto& irb = cs->ir_builder_;
1755  if (nullable) {
1756  auto* const null_value = cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
1757  auto* const skip_cond = arg_ti.is_fp()
1758  ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
1759  : irb.CreateICmpEQ(agg_args.back(), null_value);
1760  calc = llvm::BasicBlock::Create(cs->context_, "calc_approx_quantile");
1761  skip = llvm::BasicBlock::Create(cs->context_, "skip_approx_quantile");
1762  irb.CreateCondBr(skip_cond, skip, calc);
1763  cs->current_func_->getBasicBlockList().push_back(calc);
1764  irb.SetInsertPoint(calc);
1765  }
1766  if (!arg_ti.is_fp()) {
1767  auto const agg_info = get_target_info(target_expr, g_bigint_count);
1768  agg_args.back() = executor_->castToFP(agg_args.back(), arg_ti, agg_info.sql_type);
1769  }
1770  cs->emitExternalCall(
1771  "agg_approx_quantile", llvm::Type::getVoidTy(cs->context_), agg_args);
1772  if (nullable) {
1773  irb.CreateBr(skip);
1774  cs->current_func_->getBasicBlockList().push_back(skip);
1775  irb.SetInsertPoint(skip);
1776  }
1777 }
1778 
1779 llvm::Value* GroupByAndAggregate::getAdditionalLiteral(const int32_t off) {
1780  CHECK_LT(off, 0);
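  // A negative offset reads a 64-bit slot placed just before the "literals" buffer;
  // the count-distinct GPU paths above use off = -1 and -2 for the bitmap's device
  // and host base addresses, respectively.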
1781  const auto lit_buff_lv = get_arg_by_name(ROW_FUNC, "literals");
1782  auto* bit_cast = LL_BUILDER.CreateBitCast(
1783  lit_buff_lv, llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0));
1784  auto* gep =
1785  LL_BUILDER.CreateGEP(bit_cast->getType()->getScalarType()->getPointerElementType(),
1786  bit_cast,
1787  LL_INT(off));
1788  return LL_BUILDER.CreateLoad(gep->getType()->getPointerElementType(), gep);
1789 }
1790 
1791 std::vector<llvm::Value*> GroupByAndAggregate::codegenAggArg(
1792  const Analyzer::Expr* target_expr,
1793  const CompilationOptions& co) {
1794  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1795  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
1796  const auto func_expr = dynamic_cast<const Analyzer::FunctionOper*>(target_expr);
1797  const auto arr_expr = dynamic_cast<const Analyzer::ArrayExpr*>(target_expr);
1798 
1799  // TODO(alex): handle arrays uniformly?
1800  CodeGenerator code_generator(executor_);
1801  if (target_expr) {
1802  const auto& target_ti = target_expr->get_type_info();
1803  if (target_ti.is_buffer() &&
1804  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
1805  const auto target_lvs =
1806  agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
1807  : code_generator.codegen(
1808  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
1809  if (!func_expr && !arr_expr) {
1810  // Neither a FunctionOper nor an ARRAY[] expression: the varlen buffer comes
1811  // from the chunk transport (column data), so return its pointer and size.
1812  if (target_ti.is_bytes()) {
1813  CHECK_EQ(size_t(3), target_lvs.size());
1814  return {target_lvs[1], target_lvs[2]};
1815  }
1816  CHECK(target_ti.is_array());
1817  CHECK_EQ(size_t(1), target_lvs.size());
1818  CHECK(!agg_expr || agg_expr->get_aggtype() == kSAMPLE);
1819  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
1820  const auto i8p_ty =
1821  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
1822  const auto& elem_ti = target_ti.get_elem_type();
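        // A varlen (array) target is returned as a {buffer pointer, size} pair via
        // the array_buff / array_size runtime helpers; passing log2 of the element
        // size lets array_size report the length in elements rather than bytes.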
1823  return {
1824  executor_->cgen_state_->emitExternalCall(
1825  "array_buff",
1826  i8p_ty,
1827  {target_lvs.front(), code_generator.posArg(target_expr)}),
1828  executor_->cgen_state_->emitExternalCall(
1829  "array_size",
1830  i32_ty,
1831  {target_lvs.front(),
1832  code_generator.posArg(target_expr),
1833  executor_->cgen_state_->llInt(log2_bytes(elem_ti.get_logical_size()))})};
1834  } else {
1835  if (agg_expr) {
1836  throw std::runtime_error(
1837  "Using array[] operator as argument to an aggregate operator is not "
1838  "supported");
1839  }
1840  CHECK(func_expr || arr_expr);
1841  if (dynamic_cast<const Analyzer::FunctionOper*>(target_expr)) {
1842  CHECK_EQ(size_t(1), target_lvs.size());
1843  const auto prefix = target_ti.get_buffer_name();
1844  CHECK(target_ti.is_array() || target_ti.is_bytes());
1845  const auto target_lv = LL_BUILDER.CreateLoad(
1846  target_lvs[0]->getType()->getPointerElementType(), target_lvs[0]);
1847  // const auto target_lv_type = target_lvs[0]->getType();
1848  // CHECK(target_lv_type->isStructTy());
1849  // CHECK_EQ(target_lv_type->getNumContainedTypes(), 3u);
1850  const auto i8p_ty = llvm::PointerType::get(
1851  get_int_type(8, executor_->cgen_state_->context_), 0);
1852  const auto ptr = LL_BUILDER.CreatePointerCast(
1853  LL_BUILDER.CreateExtractValue(target_lv, 0), i8p_ty);
1854  const auto size = LL_BUILDER.CreateExtractValue(target_lv, 1);
1855  const auto null_flag = LL_BUILDER.CreateExtractValue(target_lv, 2);
1856  const auto nullcheck_ok_bb =
1857  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_nullcheck_ok_bb", CUR_FUNC);
1858  const auto nullcheck_fail_bb = llvm::BasicBlock::Create(
1859  LL_CONTEXT, prefix + "_nullcheck_fail_bb", CUR_FUNC);
1860 
1861  // TODO(adb): probably better to zext the bool
1862  const auto nullcheck = LL_BUILDER.CreateICmpEQ(
1863  null_flag, executor_->cgen_state_->llInt(static_cast<int8_t>(1)));
1864  LL_BUILDER.CreateCondBr(nullcheck, nullcheck_fail_bb, nullcheck_ok_bb);
1865 
1866  const auto ret_bb =
1867  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_return", CUR_FUNC);
1868  LL_BUILDER.SetInsertPoint(ret_bb);
1869  auto result_phi = LL_BUILDER.CreatePHI(i8p_ty, 2, prefix + "_ptr_return");
1870  result_phi->addIncoming(ptr, nullcheck_ok_bb);
1871  const auto null_arr_sentinel = LL_BUILDER.CreateIntToPtr(
1872  executor_->cgen_state_->llInt(static_cast<int8_t>(0)), i8p_ty);
1873  result_phi->addIncoming(null_arr_sentinel, nullcheck_fail_bb);
1874  LL_BUILDER.SetInsertPoint(nullcheck_ok_bb);
1875  executor_->cgen_state_->emitExternalCall(
1876  "register_buffer_with_executor_rsm",
1877  llvm::Type::getVoidTy(executor_->cgen_state_->context_),
1878  {executor_->cgen_state_->llInt(reinterpret_cast<int64_t>(executor_)), ptr});
1879  LL_BUILDER.CreateBr(ret_bb);
1880  LL_BUILDER.SetInsertPoint(nullcheck_fail_bb);
1881  LL_BUILDER.CreateBr(ret_bb);
1882 
1883  LL_BUILDER.SetInsertPoint(ret_bb);
1884  return {result_phi, size};
1885  }
1886  CHECK_EQ(size_t(2), target_lvs.size());
1887  return {target_lvs[0], target_lvs[1]};
1888  }
1889  }
1890  if (target_ti.is_geometry() &&
1891  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
1892  auto generate_coord_lvs =
1893  [&](auto* selected_target_expr,
1894  bool const fetch_columns) -> std::vector<llvm::Value*> {
1895  const auto target_lvs =
1896  code_generator.codegen(selected_target_expr, fetch_columns, co);
1897  if (dynamic_cast<const Analyzer::GeoOperator*>(target_expr) &&
1898  target_expr->get_type_info().is_geometry()) {
1899  // return a pointer to the temporary alloca
1900  return target_lvs;
1901  }
1902  const auto geo_uoper = dynamic_cast<const Analyzer::GeoUOper*>(target_expr);
1903  const auto geo_binoper = dynamic_cast<const Analyzer::GeoBinOper*>(target_expr);
1904  if (geo_uoper || geo_binoper) {
1905  CHECK(target_expr->get_type_info().is_geometry());
1906  CHECK_EQ(2 * static_cast<size_t>(target_ti.get_physical_coord_cols()),
1907  target_lvs.size());
1908  return target_lvs;
1909  }
1910  CHECK_EQ(static_cast<size_t>(target_ti.get_physical_coord_cols()),
1911  target_lvs.size());
1912 
1913  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
1914  const auto i8p_ty =
1915  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
1916  std::vector<llvm::Value*> coords;
1917  size_t ctr = 0;
1918  for (const auto& target_lv : target_lvs) {
1919  // TODO(adb): consider adding a utility to sqltypes so we can get the types of
1920  // the physical coords cols based on the sqltype (e.g. TINYINT for col 0, INT
1921  // for col 1 for pols / mpolys, etc). Hardcoding for now. first array is the
1922  // coords array (TINYINT). Subsequent arrays are regular INT.
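  // For example, with the standard physical layout a POLYGON target expands to two
  // physical columns, coords (TINYINT array) and ring_sizes (INT array), so
  // target_lvs.size() == 2 and each iteration below emits a {buffer, size} pair
  // (or a fast fixed-length pointer plus length for POINT coords).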
1923 
1924  const size_t elem_sz = ctr == 0 ? 1 : 4;
1925  ctr++;
1926  int32_t fixlen = -1;
1927  if (target_ti.get_type() == kPOINT) {
1928  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
1929  if (col_var) {
1930  const auto coords_cd = executor_->getPhysicalColumnDescriptor(col_var, 1);
1931  if (coords_cd && coords_cd->columnType.get_type() == kARRAY) {
1932  fixlen = coords_cd->columnType.get_size();
1933  }
1934  }
1935  }
1936  if (fixlen > 0) {
1937  coords.push_back(executor_->cgen_state_->emitExternalCall(
1938  "fast_fixlen_array_buff",
1939  i8p_ty,
1940  {target_lv, code_generator.posArg(selected_target_expr)}));
1941  coords.push_back(executor_->cgen_state_->llInt(int64_t(fixlen)));
1942  continue;
1943  }
1944  coords.push_back(executor_->cgen_state_->emitExternalCall(
1945  "array_buff",
1946  i8p_ty,
1947  {target_lv, code_generator.posArg(selected_target_expr)}));
1948  coords.push_back(executor_->cgen_state_->emitExternalCall(
1949  "array_size",
1950  i32_ty,
1951  {target_lv,
1952  code_generator.posArg(selected_target_expr),
1953  executor_->cgen_state_->llInt(log2_bytes(elem_sz))}));
1954  }
1955  return coords;
1956  };
1957 
1958  if (agg_expr) {
1959  return generate_coord_lvs(agg_expr->get_arg(), true);
1960  } else {
1961  return generate_coord_lvs(target_expr,
1962  !executor_->plan_state_->allow_lazy_fetch_);
1963  }
1964  }
1965  }
1966  return agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
1967  : code_generator.codegen(
1968  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
1969 }
1970 
1971 llvm::Value* GroupByAndAggregate::emitCall(const std::string& fname,
1972  const std::vector<llvm::Value*>& args) {
1973  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1974  return executor_->cgen_state_->emitCall(fname, args);
1975 }
1976 
1977 void GroupByAndAggregate::checkErrorCode(llvm::Value* retCode) {
1978  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1979  auto zero_const = llvm::ConstantInt::get(retCode->getType(), 0, true);
1980  auto rc_check_condition = executor_->cgen_state_->ir_builder_.CreateICmp(
1981  llvm::ICmpInst::ICMP_EQ, retCode, zero_const);
1982 
1983  executor_->cgen_state_->emitErrorCheck(rc_check_condition, retCode, "rc");
1984 }
1985 
1986 #undef CUR_FUNC
1987 #undef ROW_FUNC
1988 #undef LL_FP
1989 #undef LL_INT
1990 #undef LL_BOOL
1991 #undef LL_BUILDER
1992 #undef LL_CONTEXT
1993 
1994 size_t GroupByAndAggregate::shard_count_for_top_groups(
1995  const RelAlgExecutionUnit& ra_exe_unit,
1996  const Catalog_Namespace::Catalog& catalog) {
1997  if (ra_exe_unit.sort_info.order_entries.size() != 1 || !ra_exe_unit.sort_info.limit) {
1998  return 0;
1999  }
2000  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
2001  const auto grouped_col_expr =
2002  dynamic_cast<const Analyzer::ColumnVar*>(group_expr.get());
2003  if (!grouped_col_expr) {
2004  continue;
2005  }
2006  if (grouped_col_expr->get_table_id() <= 0) {
2007  return 0;
2008  }
2009  const auto td = catalog.getMetadataForTable(grouped_col_expr->get_table_id());
2010  if (td->shardedColumnId == grouped_col_expr->get_column_id()) {
2011  return td->nShards;
2012  }
2013  }
2014  return 0;
2015 }