GroupByAndAggregate.cpp
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "GroupByAndAggregate.h"
18 #include "AggregateUtils.h"
19 
20 #include "CardinalityEstimator.h"
21 #include "CodeGenerator.h"
23 #include "ExpressionRange.h"
24 #include "ExpressionRewrite.h"
25 #include "GpuInitGroups.h"
26 #include "InPlaceSort.h"
28 #include "MaxwellCodegenPatch.h"
30 #include "TargetExprBuilder.h"
31 
32 #include "../CudaMgr/CudaMgr.h"
33 #include "../Shared/checked_alloc.h"
34 #include "../Shared/funcannotations.h"
35 #include "../Utils/ChunkIter.h"
37 #include "Execute.h"
38 #include "QueryTemplateGenerator.h"
39 #include "RuntimeFunctions.h"
40 #include "StreamingTopN.h"
41 #include "TopKSort.h"
42 #include "WindowContext.h"
43 
44 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
45 
46 #include <cstring> // strcat()
47 #include <numeric>
48 #include <string_view>
49 #include <thread>
50 
51 bool g_cluster{false};
52 bool g_bigint_count{false};
55 extern size_t g_leaf_count;
56 
57 namespace {
58 
59 int32_t get_agg_count(const std::vector<Analyzer::Expr*>& target_exprs) {
60  int32_t agg_count{0};
61  for (auto target_expr : target_exprs) {
62  CHECK(target_expr);
63  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
64  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
65  const auto& ti = target_expr->get_type_info();
66  if (ti.is_buffer()) {
67  agg_count += 2;
68  } else if (ti.is_geometry()) {
69  agg_count += ti.get_physical_coord_cols() * 2;
70  } else {
71  ++agg_count;
72  }
73  continue;
74  }
75  if (agg_expr && agg_expr->get_aggtype() == kAVG) {
76  agg_count += 2;
77  } else {
78  ++agg_count;
79  }
80  }
81  return agg_count;
82 }
83 
84 bool expr_is_rowid(const Analyzer::Expr* expr, const Catalog_Namespace::Catalog& cat) {
85  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
86  if (!col) {
87  return false;
88  }
89  const auto cd =
90  get_column_descriptor_maybe(col->get_column_id(), col->get_table_id(), cat);
91  if (!cd || !cd->isVirtualCol) {
92  return false;
93  }
94  CHECK_EQ("rowid", cd->columnName);
95  return true;
96 }
97 
98 bool has_count_distinct(const RelAlgExecutionUnit& ra_exe_unit) {
99  for (const auto& target_expr : ra_exe_unit.target_exprs) {
100  const auto agg_info = get_target_info(target_expr, g_bigint_count);
101  if (agg_info.is_agg && is_distinct_target(agg_info)) {
102  return true;
103  }
104  }
105  return false;
106 }
107 
108 bool is_column_range_too_big_for_perfect_hash(const ColRangeInfo& col_range_info,
109  const int64_t max_entry_count) {
110  try {
111  return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
112  checked_int64_t(col_range_info.min)) >= max_entry_count;
113  } catch (...) {
114  return true;
115  }
116 }
117 
118 bool cardinality_estimate_less_than_column_range(const int64_t cardinality_estimate,
119  const ColRangeInfo& col_range_info) {
120  try {
121  // the cardinality estimate is the size of the baseline hash table. further penalize
122  // the baseline hash table by a factor of 2x due to overhead in computing baseline
123  // hash. This has the overall effect of penalizing baseline hash over perfect hash by
124  // 4x; i.e. if the cardinality of the filtered data is less than 25% of the entry
125  // count of the column, we use baseline hash on the filtered set
126  return checked_int64_t(cardinality_estimate) * 2 <
127  static_cast<int64_t>(checked_int64_t(col_range_info.max) -
128  checked_int64_t(col_range_info.min));
129  } catch (...) {
130  return false;
131  }
132 }
133 
134 ColRangeInfo get_expr_range_info(const RelAlgExecutionUnit& ra_exe_unit,
135  const std::vector<InputTableInfo>& query_infos,
136  const Analyzer::Expr* expr,
137  Executor* executor) {
138  if (!expr) {
139  return {QueryDescriptionType::Projection, 0, 0, 0, false};
140  }
141 
142  const auto expr_range = getExpressionRange(
143  expr, query_infos, executor, boost::make_optional(ra_exe_unit.simple_quals));
144  switch (expr_range.getType()) {
145  case ExpressionRangeType::Integer: {
146  if (expr_range.getIntMin() > expr_range.getIntMax()) {
147  return {
148  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
149  }
150  return {QueryDescriptionType::GroupByPerfectHash,
151  expr_range.getIntMin(),
152  expr_range.getIntMax(),
153  expr_range.getBucket(),
154  expr_range.hasNulls()};
155  }
156  case ExpressionRangeType::Float:
157  case ExpressionRangeType::Double: {
158  if (expr_range.getFpMin() > expr_range.getFpMax()) {
159  return {
160  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
161  }
162  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
163  }
164  case ExpressionRangeType::Invalid:
165  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
166  default:
167  CHECK(false);
168  }
169  CHECK(false);
170  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
171 }
172 
173 } // namespace
174 
175 ColRangeInfo GroupByAndAggregate::getColRangeInfo() {
176  // Use baseline layout more eagerly on the GPU if the query uses count distinct,
177  // because our HyperLogLog implementation is 4x less memory efficient on GPU.
178  // Technically, this only applies to APPROX_COUNT_DISTINCT, but in practice we
179  // can expect this to be true anyway for grouped queries since the precise version
180  // uses significantly more memory.
181  const int64_t baseline_threshold =
186  if (ra_exe_unit_.groupby_exprs.size() != 1) {
187  try {
188  checked_int64_t cardinality{1};
189  bool has_nulls{false};
190  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
191  auto col_range_info = get_expr_range_info(
192  ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
193  if (col_range_info.hash_type_ != QueryDescriptionType::GroupByPerfectHash) {
194  // going through baseline hash if a non-integer type is encountered
195  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
196  }
197  auto crt_col_cardinality = getBucketedCardinality(col_range_info);
198  CHECK_GE(crt_col_cardinality, 0);
199  cardinality *= crt_col_cardinality;
200  if (col_range_info.has_nulls) {
201  has_nulls = true;
202  }
203  }
204  // For zero or high cardinalities, use baseline layout.
205  if (!cardinality || cardinality > baseline_threshold) {
206  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
207  }
208  return {QueryDescriptionType::GroupByPerfectHash,
209  0,
210  int64_t(cardinality),
211  0,
212  has_nulls};
213  } catch (...) { // overflow when computing cardinality
214  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
215  }
216  }
217  // For single column groupby on high timestamps, force baseline hash due to wide ranges
218  // we are likely to encounter when applying quals to the expression range
219  // TODO: consider allowing TIMESTAMP(9) (nanoseconds) with quals to use perfect hash if
220  // the range is small enough
221  if (ra_exe_unit_.groupby_exprs.front() &&
222  ra_exe_unit_.groupby_exprs.front()->get_type_info().is_high_precision_timestamp() &&
223  ra_exe_unit_.simple_quals.size() > 0) {
224  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
225  }
226  const auto col_range_info = get_expr_range_info(
227  ra_exe_unit_, query_infos_, ra_exe_unit_.groupby_exprs.front().get(), executor_);
228  if (!ra_exe_unit_.groupby_exprs.front()) {
229  return col_range_info;
230  }
231  static const int64_t MAX_BUFFER_SIZE = 1 << 30;
232  const int64_t col_count =
233  ra_exe_unit_.groupby_exprs.size() + ra_exe_unit_.target_exprs.size();
234  int64_t max_entry_count = MAX_BUFFER_SIZE / (col_count * sizeof(int64_t));
235  if (has_count_distinct(ra_exe_unit_)) {
236  max_entry_count = std::min(max_entry_count, baseline_threshold);
237  }
238  const auto& groupby_expr_ti = ra_exe_unit_.groupby_exprs.front()->get_type_info();
239  if (groupby_expr_ti.is_string() && !col_range_info.bucket) {
240  CHECK(groupby_expr_ti.get_compression() == kENCODING_DICT);
241 
242  const bool has_filters =
243  !ra_exe_unit_.quals.empty() || !ra_exe_unit_.simple_quals.empty();
244  if (has_filters &&
245  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
246  // if filters are present, we can use the filter to narrow the cardinality of the
247  // group by in the case of ranges too big for perfect hash. Otherwise, we are better
248  // off attempting perfect hash (since we know the range will be made of
249  // monotonically increasing numbers from min to max for dictionary encoded strings)
250  // and failing later due to excessive memory use.
251  // Check the conditions where baseline hash can provide a performance increase and
252  // return baseline hash (potentially forcing an estimator query) as the range type.
253  // Otherwise, return col_range_info which will likely be perfect hash, though could
254  // be baseline from a previous call of this function prior to the estimator query.
255  if (!ra_exe_unit_.sort_info.order_entries.empty()) {
256  // TODO(adb): allow some sorts to pass through this block by centralizing sort
257  // algorithm decision making
258  if (has_count_distinct(ra_exe_unit_) &&
259  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
260  // always use baseline hash for column range too big for perfect hash with count
261  // distinct descriptors. We will need 8GB of CPU memory minimum for the perfect
262  // hash group by in this case.
263  return {QueryDescriptionType::GroupByBaselineHash,
264  col_range_info.min,
265  col_range_info.max,
266  0,
267  col_range_info.has_nulls};
268  } else {
269  // use original col range for sort
270  return col_range_info;
271  }
272  }
273  // if filters are present and the filtered range is less than the cardinality of
274  // the column, consider baseline hash
275  if (group_cardinality_estimation_ &&
276  cardinality_estimate_less_than_column_range(*group_cardinality_estimation_,
277  col_range_info)) {
278  return {QueryDescriptionType::GroupByBaselineHash,
279  col_range_info.min,
280  col_range_info.max,
281  0,
282  col_range_info.has_nulls};
283  }
284  }
285  } else if ((!expr_is_rowid(ra_exe_unit_.groupby_exprs.front().get(),
286  *executor_->catalog_)) &&
287  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count) &&
288  !col_range_info.bucket) {
289  return {QueryDescriptionType::GroupByBaselineHash,
290  col_range_info.min,
291  col_range_info.max,
292  0,
293  col_range_info.has_nulls};
294  }
295  return col_range_info;
296 }
297 
298 int64_t GroupByAndAggregate::getBucketedCardinality(const ColRangeInfo& col_range_info) {
299  checked_int64_t crt_col_cardinality =
300  checked_int64_t(col_range_info.max) - checked_int64_t(col_range_info.min);
301  if (col_range_info.bucket) {
302  crt_col_cardinality /= col_range_info.bucket;
303  }
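  // Entry count is the bucketed (max - min) range plus one for the inclusive upper
  // bound, plus one extra slot when NULL keys are possible.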
304  return static_cast<int64_t>(crt_col_cardinality +
305  (1 + (col_range_info.has_nulls ? 1 : 0)));
306 }
307 
308 #define LL_CONTEXT executor_->cgen_state_->context_
309 #define LL_BUILDER executor_->cgen_state_->ir_builder_
310 #define LL_BOOL(v) executor_->cgen_state_->llBool(v)
311 #define LL_INT(v) executor_->cgen_state_->llInt(v)
312 #define LL_FP(v) executor_->cgen_state_->llFp(v)
313 #define ROW_FUNC executor_->cgen_state_->row_func_
314 #define CUR_FUNC executor_->cgen_state_->current_func_
315 
316 GroupByAndAggregate::GroupByAndAggregate(
317  Executor* executor,
318  const ExecutorDeviceType device_type,
319  const RelAlgExecutionUnit& ra_exe_unit,
320  const std::vector<InputTableInfo>& query_infos,
321  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
322  const std::optional<int64_t>& group_cardinality_estimation)
323  : executor_(executor)
324  , ra_exe_unit_(ra_exe_unit)
325  , query_infos_(query_infos)
326  , row_set_mem_owner_(row_set_mem_owner)
327  , device_type_(device_type)
328  , group_cardinality_estimation_(group_cardinality_estimation) {
329  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
330  if (!groupby_expr) {
331  continue;
332  }
333  const auto& groupby_ti = groupby_expr->get_type_info();
334  if (groupby_ti.is_bytes()) {
335  throw std::runtime_error(
336  "Cannot group by string columns which are not dictionary encoded.");
337  }
338  if (groupby_ti.is_buffer()) {
339  throw std::runtime_error("Group by buffer not supported");
340  }
341  if (groupby_ti.is_geometry()) {
342  throw std::runtime_error("Group by geometry not supported");
343  }
344  }
345 }
346 
347 int64_t GroupByAndAggregate::getShardedTopBucket(const ColRangeInfo& col_range_info,
348  const size_t shard_count) const {
349  size_t device_count{0};
350  if (device_type_ == ExecutorDeviceType::GPU) {
351  device_count = executor_->cudaMgr()->getDeviceCount();
352  CHECK_GT(device_count, 0u);
353  }
354 
355  int64_t bucket{col_range_info.bucket};
356 
357  if (shard_count) {
358  CHECK(!col_range_info.bucket);
359  /*
360  when a node has fewer devices than shard count,
361  a) In a distributed setup, the minimum distance between two keys would be
362  device_count because shards are stored consecutively across the physical tables,
363  i.e. if a shard column has values 0 to 9, and 3 shards on each leaf, then node 1
364  would have values: 0,1,2,6,7,8 and node 2 would have values: 3,4,5,9. If each leaf
365  node has only 1 device, then all of that node's keys are loaded onto its single
366  device.
367 
368  b) In a single node setup, the distance would be minimum of device_count or
369  difference of device_count - shard_count. For example: If a single node server
370  running on 3 devices a shard column has values 0 to 9 in a table with 4 shards,
371  device to fragment keys mapping would be: device 1 - 4,8,3,7 device 2 - 1,5,9
372  device 3 - 2, 6 The bucket value would be 4(shards) - 3(devices) = 1 i.e. minimum
373  of device_count or difference.
374 
375  When a node has device count equal to or more than shard count then the
376  minimum distance is always at least shard_count * no of leaf nodes.
377  */
378  if (device_count < shard_count) {
379  bucket = g_leaf_count ? std::max(device_count, static_cast<size_t>(1))
380  : std::min(device_count, shard_count - device_count);
381  } else {
382  bucket = shard_count * std::max(g_leaf_count, static_cast<size_t>(1));
383  }
384  }
385 
386  return bucket;
387 }
388 
389 namespace {
390 
400 KeylessInfo get_keyless_info(const RelAlgExecutionUnit& ra_exe_unit,
401  const std::vector<InputTableInfo>& query_infos,
402  const bool is_group_by,
403  Executor* executor) {
404  bool keyless{true}, found{false};
405  int32_t num_agg_expr{0};
406  int32_t index{0};
407  for (const auto target_expr : ra_exe_unit.target_exprs) {
408  const auto agg_info = get_target_info(target_expr, g_bigint_count);
409  const auto chosen_type = get_compact_type(agg_info);
410  if (agg_info.is_agg) {
411  num_agg_expr++;
412  }
413  if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
414  auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
415  CHECK(agg_expr);
416  const auto arg_expr = agg_arg(target_expr);
417  const bool float_argument_input = takes_float_argument(agg_info);
418  switch (agg_info.agg_kind) {
419  case kAVG:
420  ++index;
421  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
422  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
423  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
424  expr_range_info.hasNulls()) {
425  break;
426  }
427  }
428  found = true;
429  break;
430  case kCOUNT:
431  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
432  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
433  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
434  expr_range_info.hasNulls()) {
435  break;
436  }
437  }
438  found = true;
439  break;
440  case kSUM: {
441  auto arg_ti = arg_expr->get_type_info();
442  if (constrained_not_null(arg_expr, ra_exe_unit.quals)) {
443  arg_ti.set_notnull(true);
444  }
445  if (!arg_ti.get_notnull()) {
446  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
447  if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
448  !expr_range_info.hasNulls()) {
449  found = true;
450  }
451  } else {
452  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
453  switch (expr_range_info.getType()) {
454  case ExpressionRangeType::Float:
455  case ExpressionRangeType::Double:
456  if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
457  found = true;
458  }
459  break;
460  case ExpressionRangeType::Integer:
461  if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
462  found = true;
463  }
464  break;
465  default:
466  break;
467  }
468  }
469  break;
470  }
471  case kMIN: {
472  CHECK(agg_expr && agg_expr->get_arg());
473  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
474  if (arg_ti.is_string() || arg_ti.is_buffer()) {
475  break;
476  }
477  auto expr_range_info =
478  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
479  auto init_max = get_agg_initial_val(agg_info.agg_kind,
480  chosen_type,
481  is_group_by || float_argument_input,
482  float_argument_input ? sizeof(float) : 8);
483  switch (expr_range_info.getType()) {
484  case ExpressionRangeType::Float:
485  case ExpressionRangeType::Double: {
486  auto double_max =
487  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
488  if (expr_range_info.getFpMax() < double_max) {
489  found = true;
490  }
491  break;
492  }
493  case ExpressionRangeType::Integer:
494  if (expr_range_info.getIntMax() < init_max) {
495  found = true;
496  }
497  break;
498  default:
499  break;
500  }
501  break;
502  }
503  case kMAX: {
504  CHECK(agg_expr && agg_expr->get_arg());
505  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
506  if (arg_ti.is_string() || arg_ti.is_buffer()) {
507  break;
508  }
509  auto expr_range_info =
510  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
511  // NULL sentinel and init value for kMAX are identical, which results in
512  // ambiguity in detecting empty keys in presence of nulls.
513  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
514  expr_range_info.hasNulls()) {
515  break;
516  }
517  auto init_min = get_agg_initial_val(agg_info.agg_kind,
518  chosen_type,
519  is_group_by || float_argument_input,
520  float_argument_input ? sizeof(float) : 8);
521  switch (expr_range_info.getType()) {
522  case ExpressionRangeType::Float:
523  case ExpressionRangeType::Double: {
524  auto double_min =
525  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
526  if (expr_range_info.getFpMin() > double_min) {
527  found = true;
528  }
529  break;
530  }
531  case ExpressionRangeType::Integer:
532  if (expr_range_info.getIntMin() > init_min) {
533  found = true;
534  }
535  break;
536  default:
537  break;
538  }
539  break;
540  }
541  default:
542  keyless = false;
543  break;
544  }
545  }
546  if (!keyless) {
547  break;
548  }
549  if (!found) {
550  ++index;
551  }
552  }
553 
554  // shouldn't use keyless for projection only
555  return {
556  keyless && found,
557  index,
558  };
559 }
560 
561 CountDistinctDescriptors init_count_distinct_descriptors(
562  const RelAlgExecutionUnit& ra_exe_unit,
563  const std::vector<InputTableInfo>& query_infos,
564  const ExecutorDeviceType device_type,
565  Executor* executor) {
566  CountDistinctDescriptors count_distinct_descriptors;
567  for (const auto target_expr : ra_exe_unit.target_exprs) {
568  auto agg_info = get_target_info(target_expr, g_bigint_count);
569  if (is_distinct_target(agg_info)) {
570  CHECK(agg_info.is_agg);
571  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
572  const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
573  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
574  if (arg_ti.is_bytes()) {
575  throw std::runtime_error(
576  "Strings must be dictionary-encoded for COUNT(DISTINCT).");
577  }
578  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_buffer()) {
579  throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
580  }
581  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
582  throw std::runtime_error(
583  "APPROX_COUNT_DISTINCT on geometry columns not supported");
584  }
585  if (agg_info.is_distinct && arg_ti.is_geometry()) {
586  throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
587  }
588  ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
589  auto arg_range_info =
590  arg_ti.is_fp() ? no_range_info
591  : get_expr_range_info(
592  ra_exe_unit, query_infos, agg_expr->get_arg(), executor);
593  CountDistinctImplType count_distinct_impl_type{CountDistinctImplType::StdSet};
594  int64_t bitmap_sz_bits{0};
595  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
596  const auto error_rate = agg_expr->get_arg1();
597  if (error_rate) {
598  CHECK(error_rate->get_type_info().get_type() == kINT);
599  CHECK_GE(error_rate->get_constval().intval, 1);
600  bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
601  } else {
602  bitmap_sz_bits = g_hll_precision_bits;
603  }
604  }
605  if (arg_range_info.isEmpty()) {
606  count_distinct_descriptors.emplace_back(
607  CountDistinctDescriptor{CountDistinctImplType::Bitmap,
608  0,
609  64,
610  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
611  device_type,
612  1});
613  continue;
614  }
615  if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
616  !(arg_ti.is_buffer() || arg_ti.is_geometry())) { // TODO(alex): allow bitmap
617  // implementation for arrays
618  count_distinct_impl_type = CountDistinctImplType::Bitmap;
619  if (agg_info.agg_kind == kCOUNT) {
620  bitmap_sz_bits = arg_range_info.max - arg_range_info.min + 1;
621  const int64_t MAX_BITMAP_BITS{8 * 1000 * 1000 * 1000LL};
622  if (bitmap_sz_bits <= 0 || bitmap_sz_bits > MAX_BITMAP_BITS) {
623  count_distinct_impl_type = CountDistinctImplType::StdSet;
624  }
625  }
626  }
627  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
628  count_distinct_impl_type == CountDistinctImplType::StdSet &&
629  !(arg_ti.is_array() || arg_ti.is_geometry())) {
630  count_distinct_impl_type = CountDistinctImplType::Bitmap;
631  }
632 
633  if (g_enable_watchdog && !(arg_range_info.isEmpty()) &&
634  count_distinct_impl_type == CountDistinctImplType::StdSet) {
635  throw WatchdogException("Cannot use a fast path for COUNT distinct");
636  }
637  const auto sub_bitmap_count =
638  get_count_distinct_sub_bitmap_count(bitmap_sz_bits, ra_exe_unit, device_type);
639  count_distinct_descriptors.emplace_back(
640  CountDistinctDescriptor{count_distinct_impl_type,
641  arg_range_info.min,
642  bitmap_sz_bits,
643  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
644  device_type,
645  sub_bitmap_count});
646  } else {
647  count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
648  CountDistinctImplType::Invalid, 0, 0, false, device_type, 0});
649  }
650  }
651  return count_distinct_descriptors;
652 }
653 
654 } // namespace
655 
656 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptor(
657  const bool allow_multifrag,
658  const size_t max_groups_buffer_entry_count,
659  const int8_t crt_min_byte_width,
660  RenderInfo* render_info,
661  const bool output_columnar_hint) {
662  const auto shard_count =
663  device_type_ == ExecutorDeviceType::GPU
664  ? shard_count_for_top_groups(ra_exe_unit_, *executor_->getCatalog())
665  : 0;
666  bool sort_on_gpu_hint =
667  device_type_ == ExecutorDeviceType::GPU && allow_multifrag &&
668  !ra_exe_unit_.sort_info.order_entries.empty() &&
669  gpuCanHandleOrderEntries(ra_exe_unit_.sort_info.order_entries) && !shard_count;
670  // must_use_baseline_sort is true iff we'd sort on GPU with the old algorithm
671  // but the total output buffer size would be too big or it's a sharded top query.
672  // For the sake of managing risk, use the new result set way very selectively for
673  // this case only (alongside the baseline layout we've enabled for a while now).
674  bool must_use_baseline_sort = shard_count;
675  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
676  while (true) {
677  query_mem_desc = initQueryMemoryDescriptorImpl(allow_multifrag,
678  max_groups_buffer_entry_count,
679  crt_min_byte_width,
680  sort_on_gpu_hint,
681  render_info,
682  must_use_baseline_sort,
683  output_columnar_hint);
684  CHECK(query_mem_desc);
685  if (query_mem_desc->sortOnGpu() &&
686  (query_mem_desc->getBufferSizeBytes(device_type_) +
687  align_to_int64(query_mem_desc->getEntryCount() * sizeof(int32_t))) >
688  2 * 1024 * 1024 * 1024LL) {
689  must_use_baseline_sort = true;
690  sort_on_gpu_hint = false;
691  } else {
692  break;
693  }
694  }
695  return query_mem_desc;
696 }
697 
698 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptorImpl(
699  const bool allow_multifrag,
700  const size_t max_groups_buffer_entry_count,
701  const int8_t crt_min_byte_width,
702  const bool sort_on_gpu_hint,
703  RenderInfo* render_info,
704  const bool must_use_baseline_sort,
705  const bool output_columnar_hint) {
706  const auto count_distinct_descriptors = init_count_distinct_descriptors(
707  ra_exe_unit_, query_infos_, device_type_, executor_);
708 
709  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
710 
711  auto col_range_info_nosharding = getColRangeInfo();
712 
713  const auto shard_count =
714  device_type_ == ExecutorDeviceType::GPU
715  ? shard_count_for_top_groups(ra_exe_unit_, *executor_->getCatalog())
716  : 0;
717 
718  const auto col_range_info =
719  ColRangeInfo{col_range_info_nosharding.hash_type_,
720  col_range_info_nosharding.min,
721  col_range_info_nosharding.max,
722  getShardedTopBucket(col_range_info_nosharding, shard_count),
723  col_range_info_nosharding.has_nulls};
724 
725  // Non-grouped aggregates do not support accessing aggregated ranges
726  // Keyless hash is currently only supported with single-column perfect hash
727  const auto keyless_info =
728  !(is_group_by &&
729  col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash)
730  ? KeylessInfo{false, -1}
731  : get_keyless_info(ra_exe_unit_, query_infos_, is_group_by, executor_);
732 
733  if (g_enable_watchdog &&
734  ((col_range_info.hash_type_ == QueryDescriptionType::GroupByBaselineHash &&
735  max_groups_buffer_entry_count > g_watchdog_baseline_max_groups) ||
736  (col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
737  ra_exe_unit_.groupby_exprs.size() == 1 &&
738  (col_range_info.max - col_range_info.min) /
739  std::max(col_range_info.bucket, int64_t(1)) >
740  130000000))) {
741  throw WatchdogException("Query would use too much memory");
742  }
743  try {
744  return QueryMemoryDescriptor::init(executor_,
745  ra_exe_unit_,
746  query_infos_,
747  col_range_info,
748  keyless_info,
749  allow_multifrag,
750  device_type_,
751  crt_min_byte_width,
752  sort_on_gpu_hint,
753  shard_count,
754  max_groups_buffer_entry_count,
755  render_info,
756  count_distinct_descriptors,
757  must_use_baseline_sort,
758  output_columnar_hint,
759  /*streaming_top_n_hint=*/true);
760  } catch (const StreamingTopNOOM& e) {
761  LOG(WARNING) << e.what() << " Disabling Streaming Top N.";
762  return QueryMemoryDescriptor::init(executor_,
763  ra_exe_unit_,
764  query_infos_,
765  col_range_info,
766  keyless_info,
767  allow_multifrag,
768  device_type_,
769  crt_min_byte_width,
770  sort_on_gpu_hint,
771  shard_count,
772  max_groups_buffer_entry_count,
773  render_info,
774  count_distinct_descriptors,
775  must_use_baseline_sort,
776  output_columnar_hint,
777  /*streaming_top_n_hint=*/false);
778  }
779 }
780 
781 bool GroupByAndAggregate::gpuCanHandleOrderEntries(
782  const std::list<Analyzer::OrderEntry>& order_entries) {
783  if (order_entries.size() > 1) { // TODO(alex): lift this restriction
784  return false;
785  }
786  for (const auto& order_entry : order_entries) {
787  CHECK_GE(order_entry.tle_no, 1);
788  CHECK_LE(static_cast<size_t>(order_entry.tle_no), ra_exe_unit_.target_exprs.size());
789  const auto target_expr = ra_exe_unit_.target_exprs[order_entry.tle_no - 1];
790  if (!dynamic_cast<Analyzer::AggExpr*>(target_expr)) {
791  return false;
792  }
793  // TODO(alex): relax the restrictions
794  auto agg_expr = static_cast<Analyzer::AggExpr*>(target_expr);
795  if (agg_expr->get_is_distinct() || agg_expr->get_aggtype() == kAVG ||
796  agg_expr->get_aggtype() == kMIN || agg_expr->get_aggtype() == kMAX ||
797  agg_expr->get_aggtype() == kAPPROX_COUNT_DISTINCT) {
798  return false;
799  }
800  if (agg_expr->get_arg()) {
801  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
802  if (arg_ti.is_fp()) {
803  return false;
804  }
805  auto expr_range_info =
806  get_expr_range_info(ra_exe_unit_, query_infos_, agg_expr->get_arg(), executor_);
807  // TODO(adb): QMD not actually initialized here?
808  if ((!(expr_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
809  /* query_mem_desc.getGroupbyColCount() == 1 */ false) ||
810  expr_range_info.has_nulls) &&
811  order_entry.is_desc == order_entry.nulls_first) {
812  return false;
813  }
814  }
815  const auto& target_ti = target_expr->get_type_info();
816  CHECK(!target_ti.is_buffer());
817  if (!target_ti.is_integer()) {
818  return false;
819  }
820  }
821  return true;
822 }
823 
824 bool GroupByAndAggregate::codegen(llvm::Value* filter_result,
825  llvm::BasicBlock* sc_false,
826  QueryMemoryDescriptor& query_mem_desc,
827  const CompilationOptions& co,
828  const GpuSharedMemoryContext& gpu_smem_context) {
829  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
830  CHECK(filter_result);
831 
832  bool can_return_error = false;
833  llvm::BasicBlock* filter_false{nullptr};
834 
835  {
836  const bool is_group_by = !ra_exe_unit_.groupby_exprs.empty();
837 
838  if (executor_->isArchMaxwell(co.device_type)) {
839  executor_->prependForceSync();
840  }
841  DiamondCodegen filter_cfg(filter_result,
842  executor_,
843  !is_group_by || query_mem_desc.usesGetGroupValueFast(),
844  "filter", // filter_true and filter_false basic blocks
845  nullptr,
846  false);
847  filter_false = filter_cfg.cond_false_;
848 
849  if (is_group_by) {
851  !query_mem_desc.useStreamingTopN()) {
852  const auto crt_matched = get_arg_by_name(ROW_FUNC, "crt_matched");
853  LL_BUILDER.CreateStore(LL_INT(int32_t(1)), crt_matched);
854  auto total_matched_ptr = get_arg_by_name(ROW_FUNC, "total_matched");
855  llvm::Value* old_total_matched_val{nullptr};
856  if (query_mem_desc.threadsShareMemory()) {
857  old_total_matched_val =
858  LL_BUILDER.CreateAtomicRMW(llvm::AtomicRMWInst::Add,
859  total_matched_ptr,
860  LL_INT(int32_t(1)),
861  llvm::AtomicOrdering::Monotonic);
862  } else {
863  old_total_matched_val = LL_BUILDER.CreateLoad(total_matched_ptr);
864  LL_BUILDER.CreateStore(
865  LL_BUILDER.CreateAdd(old_total_matched_val, LL_INT(int32_t(1))),
866  total_matched_ptr);
867  }
868  auto old_total_matched_ptr = get_arg_by_name(ROW_FUNC, "old_total_matched");
869  LL_BUILDER.CreateStore(old_total_matched_val, old_total_matched_ptr);
870  }
871 
872  auto agg_out_ptr_w_idx = codegenGroupBy(query_mem_desc, co, filter_cfg);
873  auto varlen_output_buffer = codegenVarlenOutputBuffer(query_mem_desc);
874  if (query_mem_desc.usesGetGroupValueFast() ||
875  query_mem_desc.getQueryDescriptionType() ==
876  QueryDescriptionType::GroupByPerfectHash) {
877  if (query_mem_desc.getGroupbyColCount() > 1) {
878  filter_cfg.setChainToNext();
879  }
880  // Don't generate null checks if the group slot is guaranteed to be non-null,
881  // as it's the case for get_group_value_fast* family.
882  can_return_error = codegenAggCalls(agg_out_ptr_w_idx,
883  varlen_output_buffer,
884  {},
885  query_mem_desc,
886  co,
887  gpu_smem_context,
888  filter_cfg);
889  } else {
890  {
891  llvm::Value* nullcheck_cond{nullptr};
892  if (query_mem_desc.didOutputColumnar()) {
893  nullcheck_cond = LL_BUILDER.CreateICmpSGE(std::get<1>(agg_out_ptr_w_idx),
894  LL_INT(int32_t(0)));
895  } else {
896  nullcheck_cond = LL_BUILDER.CreateICmpNE(
897  std::get<0>(agg_out_ptr_w_idx),
898  llvm::ConstantPointerNull::get(
899  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)));
900  }
901  DiamondCodegen nullcheck_cfg(
902  nullcheck_cond, executor_, false, "groupby_nullcheck", &filter_cfg, false);
903  codegenAggCalls(agg_out_ptr_w_idx,
904  varlen_output_buffer,
905  {},
906  query_mem_desc,
907  co,
908  gpu_smem_context,
909  filter_cfg);
910  }
911  can_return_error = true;
912  if (query_mem_desc.getQueryDescriptionType() ==
913  QueryDescriptionType::Projection &&
914  query_mem_desc.useStreamingTopN()) {
915  // Ignore rejection on pushing current row to top-K heap.
916  LL_BUILDER.CreateRet(LL_INT(int32_t(0)));
917  } else {
918  CodeGenerator code_generator(executor_);
919  LL_BUILDER.CreateRet(LL_BUILDER.CreateNeg(LL_BUILDER.CreateTrunc(
920  // TODO(alex): remove the trunc once pos is converted to 32 bits
921  code_generator.posArg(nullptr),
922  get_int_type(32, LL_CONTEXT))));
923  }
924  }
925  } else {
926  if (ra_exe_unit_.estimator) {
927  std::stack<llvm::BasicBlock*> array_loops;
928  codegenEstimator(array_loops, filter_cfg, query_mem_desc, co);
929  } else {
930  auto arg_it = ROW_FUNC->arg_begin();
931  std::vector<llvm::Value*> agg_out_vec;
932  for (int32_t i = 0; i < get_agg_count(ra_exe_unit_.target_exprs); ++i) {
933  agg_out_vec.push_back(&*arg_it++);
934  }
935  can_return_error = codegenAggCalls(std::make_tuple(nullptr, nullptr),
936  /*varlen_output_buffer=*/nullptr,
937  agg_out_vec,
938  query_mem_desc,
939  co,
940  gpu_smem_context,
941  filter_cfg);
942  }
943  }
944  }
945 
946  if (ra_exe_unit_.join_quals.empty()) {
947  executor_->cgen_state_->ir_builder_.CreateRet(LL_INT(int32_t(0)));
948  } else if (sc_false) {
949  const auto saved_insert_block = LL_BUILDER.GetInsertBlock();
950  LL_BUILDER.SetInsertPoint(sc_false);
951  LL_BUILDER.CreateBr(filter_false);
952  LL_BUILDER.SetInsertPoint(saved_insert_block);
953  }
954 
955  return can_return_error;
956 }
957 
958 llvm::Value* GroupByAndAggregate::codegenOutputSlot(
959  llvm::Value* groups_buffer,
960  const QueryMemoryDescriptor& query_mem_desc,
961  const CompilationOptions& co,
962  DiamondCodegen& diamond_codegen) {
963  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
964  CHECK(query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection);
965  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
966  const auto group_expr = ra_exe_unit_.groupby_exprs.front();
967  CHECK(!group_expr);
968  if (!query_mem_desc.didOutputColumnar()) {
969  CHECK_EQ(size_t(0), query_mem_desc.getRowSize() % sizeof(int64_t));
970  }
971  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
972  ? 0
973  : query_mem_desc.getRowSize() / sizeof(int64_t);
974  CodeGenerator code_generator(executor_);
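  // For streaming top-N, the output slot comes from a k-heap keyed on the single
  // ORDER BY target (one of the get_bin_from_k_heap_* runtime functions); the
  // non-top-N path below falls back to the generic scan output slot helpers.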
975  if (query_mem_desc.useStreamingTopN()) {
976  const auto& only_order_entry = ra_exe_unit_.sort_info.order_entries.front();
977  CHECK_GE(only_order_entry.tle_no, int(1));
978  const size_t target_idx = only_order_entry.tle_no - 1;
979  CHECK_LT(target_idx, ra_exe_unit_.target_exprs.size());
980  const auto order_entry_expr = ra_exe_unit_.target_exprs[target_idx];
981  const auto chosen_bytes =
982  static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(target_idx));
983  auto order_entry_lv = executor_->cgen_state_->castToTypeIn(
984  code_generator.codegen(order_entry_expr, true, co).front(), chosen_bytes * 8);
985  const uint32_t n = ra_exe_unit_.sort_info.offset + ra_exe_unit_.sort_info.limit;
986  std::string fname = "get_bin_from_k_heap";
987  const auto& oe_ti = order_entry_expr->get_type_info();
988  llvm::Value* null_key_lv = nullptr;
989  if (oe_ti.is_integer() || oe_ti.is_decimal() || oe_ti.is_time()) {
990  const size_t bit_width = order_entry_lv->getType()->getIntegerBitWidth();
991  switch (bit_width) {
992  case 32:
993  null_key_lv = LL_INT(static_cast<int32_t>(inline_int_null_val(oe_ti)));
994  break;
995  case 64:
996  null_key_lv = LL_INT(static_cast<int64_t>(inline_int_null_val(oe_ti)));
997  break;
998  default:
999  CHECK(false);
1000  }
1001  fname += "_int" + std::to_string(bit_width) + "_t";
1002  } else {
1003  CHECK(oe_ti.is_fp());
1004  if (order_entry_lv->getType()->isDoubleTy()) {
1005  null_key_lv = LL_FP(static_cast<double>(inline_fp_null_val(oe_ti)));
1006  } else {
1007  null_key_lv = LL_FP(static_cast<float>(inline_fp_null_val(oe_ti)));
1008  }
1009  fname += order_entry_lv->getType()->isDoubleTy() ? "_double" : "_float";
1010  }
1011  const auto key_slot_idx =
1012  get_heap_key_slot_index(ra_exe_unit_.target_exprs, target_idx);
1013  return emitCall(
1014  fname,
1015  {groups_buffer,
1016  LL_INT(n),
1017  LL_INT(row_size_quad),
1018  LL_INT(static_cast<uint32_t>(query_mem_desc.getColOffInBytes(key_slot_idx))),
1019  LL_BOOL(only_order_entry.is_desc),
1020  LL_BOOL(!order_entry_expr->get_type_info().get_notnull()),
1021  LL_BOOL(only_order_entry.nulls_first),
1022  null_key_lv,
1023  order_entry_lv});
1024  } else {
1025  const auto output_buffer_entry_count_lv =
1026  LL_BUILDER.CreateLoad(get_arg_by_name(ROW_FUNC, "max_matched"));
1027  const auto group_expr_lv =
1028  LL_BUILDER.CreateLoad(get_arg_by_name(ROW_FUNC, "old_total_matched"));
1029  std::vector<llvm::Value*> args{groups_buffer,
1030  output_buffer_entry_count_lv,
1031  group_expr_lv,
1032  code_generator.posArg(nullptr)};
1033  if (query_mem_desc.didOutputColumnar()) {
1034  const auto columnar_output_offset =
1035  emitCall("get_columnar_scan_output_offset", args);
1036  return columnar_output_offset;
1037  }
1038  args.push_back(LL_INT(row_size_quad));
1039  return emitCall("get_scan_output_slot", args);
1040  }
1041 }
1042 
1043 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenGroupBy(
1044  const QueryMemoryDescriptor& query_mem_desc,
1045  const CompilationOptions& co,
1046  DiamondCodegen& diamond_codegen) {
1047  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1048  auto arg_it = ROW_FUNC->arg_begin();
1049  auto groups_buffer = arg_it++;
1050 
1051  std::stack<llvm::BasicBlock*> array_loops;
1052 
1053  // TODO(Saman): move this logic outside of this function.
1054  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1055  if (query_mem_desc.didOutputColumnar()) {
1056  return std::make_tuple(
1057  &*groups_buffer,
1058  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen));
1059  } else {
1060  return std::make_tuple(
1061  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen),
1062  nullptr);
1063  }
1064  }
1065 
1066  CHECK(query_mem_desc.getQueryDescriptionType() ==
1067  QueryDescriptionType::GroupByBaselineHash ||
1068  query_mem_desc.getQueryDescriptionType() ==
1069  QueryDescriptionType::GroupByPerfectHash);
1070 
1071  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1072  ? 0
1073  : query_mem_desc.getRowSize() / sizeof(int64_t);
1074 
1075  const auto col_width_size = query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1076  ? sizeof(int64_t)
1077  : query_mem_desc.getEffectiveKeyWidth();
1078  // for multi-column group by
1079  llvm::Value* group_key = nullptr;
1080  llvm::Value* key_size_lv = nullptr;
1081 
1082  if (!query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1083  key_size_lv = LL_INT(static_cast<int32_t>(query_mem_desc.getGroupbyColCount()));
1084  if (query_mem_desc.getQueryDescriptionType() ==
1085  QueryDescriptionType::GroupByPerfectHash) {
1086  group_key =
1087  LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1088  } else if (query_mem_desc.getQueryDescriptionType() ==
1089  QueryDescriptionType::GroupByBaselineHash) {
1090  group_key =
1091  col_width_size == sizeof(int32_t)
1092  ? LL_BUILDER.CreateAlloca(llvm::Type::getInt32Ty(LL_CONTEXT), key_size_lv)
1093  : LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1094  }
1095  CHECK(group_key);
1096  CHECK(key_size_lv);
1097  }
1098 
1099  int32_t subkey_idx = 0;
1100  CHECK(query_mem_desc.getGroupbyColCount() == ra_exe_unit_.groupby_exprs.size());
1101  for (const auto& group_expr : ra_exe_unit_.groupby_exprs) {
1102  const auto col_range_info =
1103  get_expr_range_info(ra_exe_unit_, query_infos_, group_expr.get(), executor_);
1104  const auto translated_null_value = static_cast<int64_t>(
1105  query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1106  ? checked_int64_t(query_mem_desc.getMaxVal()) +
1107  (query_mem_desc.getBucket() ? query_mem_desc.getBucket() : 1)
1108  : checked_int64_t(col_range_info.max) +
1109  (col_range_info.bucket ? col_range_info.bucket : 1));
1110 
1111  const bool col_has_nulls =
1112  query_mem_desc.getQueryDescriptionType() ==
1113  QueryDescriptionType::GroupByPerfectHash
1114  ? (query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1115  ? query_mem_desc.hasNulls()
1116  : col_range_info.has_nulls)
1117  : false;
1118 
1119  const auto group_expr_lvs =
1120  executor_->groupByColumnCodegen(group_expr.get(),
1121  col_width_size,
1122  co,
1123  col_has_nulls,
1124  translated_null_value,
1125  diamond_codegen,
1126  array_loops,
1127  query_mem_desc.threadsShareMemory());
1128  const auto group_expr_lv = group_expr_lvs.translated_value;
1129  if (query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1130  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1131  return codegenSingleColumnPerfectHash(query_mem_desc,
1132  co,
1133  &*groups_buffer,
1134  group_expr_lv,
1135  group_expr_lvs.original_value,
1136  row_size_quad);
1137  } else {
1138  // store the sub-key to the buffer
1139  LL_BUILDER.CreateStore(group_expr_lv,
1140  LL_BUILDER.CreateGEP(group_key, LL_INT(subkey_idx++)));
1141  }
1142  }
1143  if (query_mem_desc.getQueryDescriptionType() ==
1144  QueryDescriptionType::GroupByPerfectHash) {
1145  CHECK(ra_exe_unit_.groupby_exprs.size() != 1);
1146  return codegenMultiColumnPerfectHash(
1147  &*groups_buffer, group_key, key_size_lv, query_mem_desc, row_size_quad);
1148  } else if (query_mem_desc.getQueryDescriptionType() ==
1149  QueryDescriptionType::GroupByBaselineHash) {
1150  return codegenMultiColumnBaselineHash(co,
1151  &*groups_buffer,
1152  group_key,
1153  key_size_lv,
1154  query_mem_desc,
1155  col_width_size,
1156  row_size_quad);
1157  }
1158  CHECK(false);
1159  return std::make_tuple(nullptr, nullptr);
1160 }
1161 
1162 llvm::Value* GroupByAndAggregate::codegenVarlenOutputBuffer(
1163  const QueryMemoryDescriptor& query_mem_desc) {
1164  if (!query_mem_desc.hasVarlenOutput()) {
1165  return nullptr;
1166  }
1167 
1168  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1169  auto arg_it = ROW_FUNC->arg_begin();
1170  arg_it++; /* groups_buffer */
1171  auto varlen_output_buffer = arg_it++;
1172  CHECK(varlen_output_buffer->getType() == llvm::Type::getInt64PtrTy(LL_CONTEXT));
1173  return varlen_output_buffer;
1174 }
1175 
1176 std::tuple<llvm::Value*, llvm::Value*>
1177 GroupByAndAggregate::codegenSingleColumnPerfectHash(const QueryMemoryDescriptor& query_mem_desc,
1178  const CompilationOptions& co,
1180  llvm::Value* groups_buffer,
1181  llvm::Value* group_expr_lv_translated,
1182  llvm::Value* group_expr_lv_original,
1183  const int32_t row_size_quad) {
1184  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1185  CHECK(query_mem_desc.usesGetGroupValueFast());
1186  std::string get_group_fn_name{query_mem_desc.didOutputColumnar()
1187  ? "get_columnar_group_bin_offset"
1188  : "get_group_value_fast"};
1189  if (!query_mem_desc.didOutputColumnar() && query_mem_desc.hasKeylessHash()) {
1190  get_group_fn_name += "_keyless";
1191  }
1192  if (query_mem_desc.interleavedBins(co.device_type)) {
1193  CHECK(!query_mem_desc.didOutputColumnar());
1194  CHECK(query_mem_desc.hasKeylessHash());
1195  get_group_fn_name += "_semiprivate";
1196  }
1197  std::vector<llvm::Value*> get_group_fn_args{&*groups_buffer,
1198  &*group_expr_lv_translated};
1199  if (group_expr_lv_original && get_group_fn_name == "get_group_value_fast" &&
1200  query_mem_desc.mustUseBaselineSort()) {
1201  get_group_fn_name += "_with_original_key";
1202  get_group_fn_args.push_back(group_expr_lv_original);
1203  }
1204  get_group_fn_args.push_back(LL_INT(query_mem_desc.getMinVal()));
1205  get_group_fn_args.push_back(LL_INT(query_mem_desc.getBucket()));
1206  if (!query_mem_desc.hasKeylessHash()) {
1207  if (!query_mem_desc.didOutputColumnar()) {
1208  get_group_fn_args.push_back(LL_INT(row_size_quad));
1209  }
1210  } else {
1211  if (!query_mem_desc.didOutputColumnar()) {
1212  get_group_fn_args.push_back(LL_INT(row_size_quad));
1213  }
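  // With interleaved bins each warp owns a private copy of the group slots, so the
  // runtime helper also needs the warp index and warp size.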
1214  if (query_mem_desc.interleavedBins(co.device_type)) {
1215  auto warp_idx = emitCall("thread_warp_idx", {LL_INT(executor_->warpSize())});
1216  get_group_fn_args.push_back(warp_idx);
1217  get_group_fn_args.push_back(LL_INT(executor_->warpSize()));
1218  }
1219  }
1220  if (get_group_fn_name == "get_columnar_group_bin_offset") {
1221  return std::make_tuple(&*groups_buffer,
1222  emitCall(get_group_fn_name, get_group_fn_args));
1223  }
1224  return std::make_tuple(emitCall(get_group_fn_name, get_group_fn_args), nullptr);
1225 }
1226 
1227 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenMultiColumnPerfectHash(
1228  llvm::Value* groups_buffer,
1229  llvm::Value* group_key,
1230  llvm::Value* key_size_lv,
1231  const QueryMemoryDescriptor& query_mem_desc,
1232  const int32_t row_size_quad) {
1233  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1234  CHECK(query_mem_desc.getQueryDescriptionType() ==
1235  QueryDescriptionType::GroupByPerfectHash);
1236  // compute the index (perfect hash)
1237  auto perfect_hash_func = codegenPerfectHashFunction();
1238  auto hash_lv =
1239  LL_BUILDER.CreateCall(perfect_hash_func, std::vector<llvm::Value*>{group_key});
1240 
1241  if (query_mem_desc.didOutputColumnar()) {
1242  if (!query_mem_desc.hasKeylessHash()) {
1243  const std::string set_matching_func_name{
1244  "set_matching_group_value_perfect_hash_columnar"};
1245  const std::vector<llvm::Value*> set_matching_func_arg{
1246  groups_buffer,
1247  hash_lv,
1248  group_key,
1249  key_size_lv,
1250  llvm::ConstantInt::get(get_int_type(32, LL_CONTEXT),
1251  query_mem_desc.getEntryCount())};
1252  emitCall(set_matching_func_name, set_matching_func_arg);
1253  }
1254  return std::make_tuple(groups_buffer, hash_lv);
1255  } else {
1256  if (query_mem_desc.hasKeylessHash()) {
1257  return std::make_tuple(emitCall("get_matching_group_value_perfect_hash_keyless",
1258  {groups_buffer, hash_lv, LL_INT(row_size_quad)}),
1259  nullptr);
1260  } else {
1261  return std::make_tuple(
1262  emitCall(
1263  "get_matching_group_value_perfect_hash",
1264  {groups_buffer, hash_lv, group_key, key_size_lv, LL_INT(row_size_quad)}),
1265  nullptr);
1266  }
1267  }
1268 }
1269 
1270 std::tuple<llvm::Value*, llvm::Value*>
1271 GroupByAndAggregate::codegenMultiColumnBaselineHash(
1272  const CompilationOptions& co,
1273  llvm::Value* groups_buffer,
1274  llvm::Value* group_key,
1275  llvm::Value* key_size_lv,
1276  const QueryMemoryDescriptor& query_mem_desc,
1277  const size_t key_width,
1278  const int32_t row_size_quad) {
1279  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1280  if (group_key->getType() != llvm::Type::getInt64PtrTy(LL_CONTEXT)) {
1281  CHECK(key_width == sizeof(int32_t));
1282  group_key =
1283  LL_BUILDER.CreatePointerCast(group_key, llvm::Type::getInt64PtrTy(LL_CONTEXT));
1284  }
1285  std::vector<llvm::Value*> func_args{
1286  groups_buffer,
1287  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1288  &*group_key,
1289  &*key_size_lv,
1290  LL_INT(static_cast<int32_t>(key_width))};
1291  std::string func_name{"get_group_value"};
1292  if (query_mem_desc.didOutputColumnar()) {
1293  func_name += "_columnar_slot";
1294  } else {
1295  func_args.push_back(LL_INT(row_size_quad));
1296  }
1297  if (co.with_dynamic_watchdog) {
1298  func_name += "_with_watchdog";
1299  }
1300  if (query_mem_desc.didOutputColumnar()) {
1301  return std::make_tuple(groups_buffer, emitCall(func_name, func_args));
1302  } else {
1303  return std::make_tuple(emitCall(func_name, func_args), nullptr);
1304  }
1305 }
1306 
1307 llvm::Function* GroupByAndAggregate::codegenPerfectHashFunction() {
1308  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1309  CHECK_GT(ra_exe_unit_.groupby_exprs.size(), size_t(1));
1310  auto ft = llvm::FunctionType::get(
1311  get_int_type(32, LL_CONTEXT),
1312  std::vector<llvm::Type*>{llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)},
1313  false);
1314  auto key_hash_func = llvm::Function::Create(ft,
1315  llvm::Function::ExternalLinkage,
1316  "perfect_key_hash",
1317  executor_->cgen_state_->module_);
1318  executor_->cgen_state_->helper_functions_.push_back(key_hash_func);
1319  mark_function_always_inline(key_hash_func);
1320  auto& key_buff_arg = *key_hash_func->args().begin();
1321  llvm::Value* key_buff_lv = &key_buff_arg;
1322  auto bb = llvm::BasicBlock::Create(LL_CONTEXT, "entry", key_hash_func);
1323  llvm::IRBuilder<> key_hash_func_builder(bb);
1324  llvm::Value* hash_lv{llvm::ConstantInt::get(get_int_type(64, LL_CONTEXT), 0)};
1325  std::vector<int64_t> cardinalities;
1326  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1327  auto col_range_info =
1328  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1329  CHECK(col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash);
1330  cardinalities.push_back(getBucketedCardinality(col_range_info));
1331  }
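  // Fold the per-column bucketed keys into a single row-major index: each term is
  // offset by the column's min, divided by its bucket size, and scaled by the product
  // of the cardinalities of the preceding group-by columns.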
1332  size_t dim_idx = 0;
1333  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1334  auto key_comp_lv = key_hash_func_builder.CreateLoad(
1335  key_hash_func_builder.CreateGEP(key_buff_lv, LL_INT(dim_idx)));
1336  auto col_range_info =
1337  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1338  auto crt_term_lv =
1339  key_hash_func_builder.CreateSub(key_comp_lv, LL_INT(col_range_info.min));
1340  if (col_range_info.bucket) {
1341  crt_term_lv =
1342  key_hash_func_builder.CreateSDiv(crt_term_lv, LL_INT(col_range_info.bucket));
1343  }
1344  for (size_t prev_dim_idx = 0; prev_dim_idx < dim_idx; ++prev_dim_idx) {
1345  crt_term_lv = key_hash_func_builder.CreateMul(crt_term_lv,
1346  LL_INT(cardinalities[prev_dim_idx]));
1347  }
1348  hash_lv = key_hash_func_builder.CreateAdd(hash_lv, crt_term_lv);
1349  ++dim_idx;
1350  }
1351  key_hash_func_builder.CreateRet(
1352  key_hash_func_builder.CreateTrunc(hash_lv, get_int_type(32, LL_CONTEXT)));
1353  return key_hash_func;
1354 }
1355 
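// Converts a NULL sentinel of the argument type into the NULL sentinel of the aggregate
// type (casting the target value when the sentinels or widths differ); returns the
// target unchanged when no conversion is needed.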
1356 llvm::Value* GroupByAndAggregate::convertNullIfAny(const SQLTypeInfo& arg_type,
1357  const TargetInfo& agg_info,
1358  llvm::Value* target) {
1359  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1360  const auto& agg_type = agg_info.sql_type;
1361  const size_t chosen_bytes = agg_type.get_size();
1362 
1363  bool need_conversion{false};
1364  llvm::Value* arg_null{nullptr};
1365  llvm::Value* agg_null{nullptr};
1366  llvm::Value* target_to_cast{target};
1367  if (arg_type.is_fp()) {
1368  arg_null = executor_->cgen_state_->inlineFpNull(arg_type);
1369  if (agg_type.is_fp()) {
1370  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1371  if (!static_cast<llvm::ConstantFP*>(arg_null)->isExactlyValue(
1372  static_cast<llvm::ConstantFP*>(agg_null)->getValueAPF())) {
1373  need_conversion = true;
1374  }
1375  } else {
1376  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
1377  return target;
1378  }
1379  } else {
1380  arg_null = executor_->cgen_state_->inlineIntNull(arg_type);
1381  if (agg_type.is_fp()) {
1382  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1383  need_conversion = true;
1384  target_to_cast = executor_->castToFP(target, arg_type, agg_type);
1385  } else {
1386  agg_null = executor_->cgen_state_->inlineIntNull(agg_type);
1387  if ((static_cast<llvm::ConstantInt*>(arg_null)->getBitWidth() !=
1388  static_cast<llvm::ConstantInt*>(agg_null)->getBitWidth()) ||
1389  (static_cast<llvm::ConstantInt*>(arg_null)->getValue() !=
1390  static_cast<llvm::ConstantInt*>(agg_null)->getValue())) {
1391  need_conversion = true;
1392  }
1393  }
1394  }
1395  if (need_conversion) {
1396  auto cmp = arg_type.is_fp() ? LL_BUILDER.CreateFCmpOEQ(target, arg_null)
1397  : LL_BUILDER.CreateICmpEQ(target, arg_null);
1398  return LL_BUILDER.CreateSelect(
1399  cmp,
1400  agg_null,
1401  executor_->cgen_state_->castToTypeIn(target_to_cast, chosen_bytes << 3));
1402  } else {
1403  return target;
1404  }
1405 }
1406 
1407 llvm::Value* GroupByAndAggregate::codegenWindowRowPointer(
1408  const Analyzer::WindowFunction* window_func,
1409  const QueryMemoryDescriptor& query_mem_desc,
1410  const CompilationOptions& co,
1411  DiamondCodegen& diamond_codegen) {
1412  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1413  const auto window_func_context =
1414  WindowProjectNodeContext::getActiveWindowFunctionContext(executor_);
1415  if (window_func_context && window_function_is_aggregate(window_func->getKind())) {
1416  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1417  ? 0
1418  : query_mem_desc.getRowSize() / sizeof(int64_t);
1419  auto arg_it = ROW_FUNC->arg_begin();
1420  auto groups_buffer = arg_it++;
1421  CodeGenerator code_generator(executor_);
1422  auto window_pos_lv = code_generator.codegenWindowPosition(
1423  window_func_context, code_generator.posArg(nullptr));
1424  const auto pos_in_window =
1425  LL_BUILDER.CreateTrunc(window_pos_lv, get_int_type(32, LL_CONTEXT));
1426  llvm::Value* entry_count_lv =
1427  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount()));
1428  std::vector<llvm::Value*> args{
1429  &*groups_buffer, entry_count_lv, pos_in_window, code_generator.posArg(nullptr)};
1430  if (query_mem_desc.didOutputColumnar()) {
1431  const auto columnar_output_offset =
1432  emitCall("get_columnar_scan_output_offset", args);
1433  return LL_BUILDER.CreateSExt(columnar_output_offset, get_int_type(64, LL_CONTEXT));
1434  }
1435  args.push_back(LL_INT(row_size_quad));
1436  return emitCall("get_scan_output_slot", args);
1437  }
1438  auto arg_it = ROW_FUNC->arg_begin();
1439  auto groups_buffer = arg_it++;
1440  return codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen);
1441 }
1442 
1443 bool GroupByAndAggregate::codegenAggCalls(
1444  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx_in,
1445  llvm::Value* varlen_output_buffer,
1446  const std::vector<llvm::Value*>& agg_out_vec,
1447  const QueryMemoryDescriptor& query_mem_desc,
1448  const CompilationOptions& co,
1449  const GpuSharedMemoryContext& gpu_smem_context,
1450  DiamondCodegen& diamond_codegen) {
1451  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1452  auto agg_out_ptr_w_idx = agg_out_ptr_w_idx_in;
1453  // TODO(alex): unify the two cases, the output for non-group by queries
1454  // should be a contiguous buffer
1455  const bool is_group_by = std::get<0>(agg_out_ptr_w_idx);
1456  bool can_return_error = false;
1457  if (is_group_by) {
1458  CHECK(agg_out_vec.empty());
1459  } else {
1460  CHECK(!agg_out_vec.empty());
1461  }
1462 
1463  // output buffer is casted into a byte stream to be able to handle data elements of
1464  // different sizes (only used when actual column width sizes are used)
1465  llvm::Value* output_buffer_byte_stream{nullptr};
1466  llvm::Value* out_row_idx{nullptr};
1467  if (query_mem_desc.didOutputColumnar() && !g_cluster &&
1468  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1469  output_buffer_byte_stream = LL_BUILDER.CreateBitCast(
1470  std::get<0>(agg_out_ptr_w_idx),
1471  llvm::PointerType::get(llvm::Type::getInt8Ty(LL_CONTEXT), 0));
1472  output_buffer_byte_stream->setName("out_buff_b_stream");
1473  CHECK(std::get<1>(agg_out_ptr_w_idx));
1474  out_row_idx = LL_BUILDER.CreateZExt(std::get<1>(agg_out_ptr_w_idx),
1475  llvm::Type::getInt64Ty(LL_CONTEXT));
1476  out_row_idx->setName("out_row_idx");
1477  }
1478 
1479  TargetExprCodegenBuilder target_builder(query_mem_desc, ra_exe_unit_, is_group_by);
1480  for (size_t target_idx = 0; target_idx < ra_exe_unit_.target_exprs.size();
1481  ++target_idx) {
1482  auto target_expr = ra_exe_unit_.target_exprs[target_idx];
1483  CHECK(target_expr);
1484 
1485  target_builder(target_expr, executor_, co);
1486  }
1487 
1488  target_builder.codegen(this,
1489  executor_,
1490  query_mem_desc,
1491  co,
1492  gpu_smem_context,
1493  agg_out_ptr_w_idx,
1494  agg_out_vec,
1495  output_buffer_byte_stream,
1496  out_row_idx,
1497  varlen_output_buffer,
1498  diamond_codegen);
1499 
1500  for (auto target_expr : ra_exe_unit_.target_exprs) {
1501  CHECK(target_expr);
1502  executor_->plan_state_->isLazyFetchColumn(target_expr);
1503  }
1504 
1505  return can_return_error;
1506 }
1507 
1511 llvm::Value* GroupByAndAggregate::codegenAggColumnPtr(
1512  llvm::Value* output_buffer_byte_stream,
1513  llvm::Value* out_row_idx,
1514  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx,
1515  const QueryMemoryDescriptor& query_mem_desc,
1516  const size_t chosen_bytes,
1517  const size_t agg_out_off,
1518  const size_t target_idx) {
1519  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1520  llvm::Value* agg_col_ptr{nullptr};
1521  if (query_mem_desc.didOutputColumnar()) {
1522  // TODO(Saman): remove the second columnar branch, and support all query description
1523  // types through the first branch. Then, input arguments should also be cleaned up
1524  if (!g_cluster &&
1525  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1526  CHECK(chosen_bytes == 1 || chosen_bytes == 2 || chosen_bytes == 4 ||
1527  chosen_bytes == 8);
1528  CHECK(output_buffer_byte_stream);
1529  CHECK(out_row_idx);
1530  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1531  // multiplying by chosen_bytes, i.e., << log2(chosen_bytes)
1532  auto out_per_col_byte_idx =
1533 #ifdef _WIN32
1534  LL_BUILDER.CreateShl(out_row_idx, __lzcnt(chosen_bytes) - 1);
1535 #else
1536  LL_BUILDER.CreateShl(out_row_idx, __builtin_ffs(chosen_bytes) - 1);
1537 #endif
1538  auto byte_offset = LL_BUILDER.CreateAdd(out_per_col_byte_idx,
1539  LL_INT(static_cast<int64_t>(col_off)));
1540  byte_offset->setName("out_byte_off_target_" + std::to_string(target_idx));
1541  auto output_ptr = LL_BUILDER.CreateGEP(output_buffer_byte_stream, byte_offset);
1542  agg_col_ptr = LL_BUILDER.CreateBitCast(
1543  output_ptr,
1544  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1545  agg_col_ptr->setName("out_ptr_target_" + std::to_string(target_idx));
1546  } else {
1547  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1548  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1549  col_off /= chosen_bytes;
1550  CHECK(std::get<1>(agg_out_ptr_w_idx));
1551  auto offset = LL_BUILDER.CreateAdd(std::get<1>(agg_out_ptr_w_idx), LL_INT(col_off));
1552  agg_col_ptr = LL_BUILDER.CreateGEP(
1553  LL_BUILDER.CreateBitCast(
1554  std::get<0>(agg_out_ptr_w_idx),
1555  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0)),
1556  offset);
1557  }
1558  } else {
1559  uint32_t col_off = query_mem_desc.getColOnlyOffInBytes(agg_out_off);
1560  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1561  col_off /= chosen_bytes;
1562  agg_col_ptr = LL_BUILDER.CreateGEP(
1563  LL_BUILDER.CreateBitCast(
1564  std::get<0>(agg_out_ptr_w_idx),
1565  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0)),
1566  LL_INT(col_off));
1567  }
1568  CHECK(agg_col_ptr);
1569  return agg_col_ptr;
1570 }
1571 
1572 void GroupByAndAggregate::codegenEstimator(std::stack<llvm::BasicBlock*>& array_loops,
1573  DiamondCodegen& diamond_codegen,
1574  const QueryMemoryDescriptor& query_mem_desc,
1575  const CompilationOptions& co) {
1576  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1577  const auto& estimator_arg = ra_exe_unit_.estimator->getArgument();
1578  auto estimator_comp_count_lv = LL_INT(static_cast<int32_t>(estimator_arg.size()));
1579  auto estimator_key_lv = LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT),
1580  estimator_comp_count_lv);
1581  int32_t subkey_idx = 0;
1582  for (const auto& estimator_arg_comp : estimator_arg) {
1583  const auto estimator_arg_comp_lvs =
1584  executor_->groupByColumnCodegen(estimator_arg_comp.get(),
1585  query_mem_desc.getEffectiveKeyWidth(),
1586  co,
1587  false,
1588  0,
1589  diamond_codegen,
1590  array_loops,
1591  true);
1592  CHECK(!estimator_arg_comp_lvs.original_value);
1593  const auto estimator_arg_comp_lv = estimator_arg_comp_lvs.translated_value;
1594  // store the sub-key to the buffer
1595  LL_BUILDER.CreateStore(estimator_arg_comp_lv,
1596  LL_BUILDER.CreateGEP(estimator_key_lv, LL_INT(subkey_idx++)));
1597  }
1598  const auto int8_ptr_ty = llvm::PointerType::get(get_int_type(8, LL_CONTEXT), 0);
1599  const auto bitmap = LL_BUILDER.CreateBitCast(&*ROW_FUNC->arg_begin(), int8_ptr_ty);
1600  const auto key_bytes = LL_BUILDER.CreateBitCast(estimator_key_lv, int8_ptr_ty);
1601  const auto estimator_comp_bytes_lv =
1602  LL_INT(static_cast<int32_t>(estimator_arg.size() * sizeof(int64_t)));
1603  const auto bitmap_size_lv =
1604  LL_INT(static_cast<uint32_t>(ra_exe_unit_.estimator->getBufferSize()));
1605  emitCall(ra_exe_unit_.estimator->getRuntimeFunctionName(),
1606  {bitmap, &*bitmap_size_lv, key_bytes, &*estimator_comp_bytes_lv});
1607 }
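// Hedged sketch of the call contract assembled above: the group-by sub-keys are packed
// into a contiguous int64_t buffer and handed, together with the estimator bitmap, to the
// estimator's runtime function. hypothetical_estimator_update is a placeholder for
// whatever name getRuntimeFunctionName() resolves to; it is an assumption, not an API.
extern "C" void hypothetical_estimator_update(int8_t* bitmap,
                                              const uint32_t bitmap_bytes,
                                              const int8_t* key_bytes,
                                              const int32_t key_byte_count);
inline void update_estimator_for_row_sketch(int8_t* bitmap,
                                            const uint32_t bitmap_bytes,
                                            const int64_t* sub_keys,
                                            const int32_t sub_key_count) {
  hypothetical_estimator_update(bitmap,
                                bitmap_bytes,
                                reinterpret_cast<const int8_t*>(sub_keys),
                                sub_key_count * static_cast<int32_t>(sizeof(int64_t)));
}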
1608 
1609 extern "C" RUNTIME_EXPORT void agg_count_distinct(int64_t* agg, const int64_t val) {
1610  reinterpret_cast<std::set<int64_t>*>(*agg)->insert(val);
1611 }
1612 
1613 extern "C" RUNTIME_EXPORT void agg_count_distinct_skip_val(int64_t* agg,
1614  const int64_t val,
1615  const int64_t skip_val) {
1616  if (val != skip_val) {
1617  agg_count_distinct(agg, val);
1618  }
1619 }
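// Illustrative use of the two runtime functions above (a sketch, assuming <set> is
// already reachable in this translation unit): the aggregate slot holds a pointer to a
// std::set<int64_t>, reinterpreted as int64_t so it fits the generic 64-bit agg column.
inline void count_distinct_usage_sketch() {
  std::set<int64_t> distinct_vals;
  int64_t agg_slot = reinterpret_cast<int64_t>(&distinct_vals);
  agg_count_distinct(&agg_slot, 7);
  agg_count_distinct_skip_val(&agg_slot, 7, /*skip_val=*/-1);   // duplicate; set unchanged
  agg_count_distinct_skip_val(&agg_slot, -1, /*skip_val=*/-1);  // matches skip_val; ignored
  // distinct_vals.size() == 1 at this point
}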
1620 
1621 extern "C" RUNTIME_EXPORT void agg_approx_quantile(int64_t* agg, const double val) {
1622  auto* t_digest = reinterpret_cast<quantile::TDigest*>(*agg);
1623  t_digest->allocate();
1624  t_digest->add(val);
1625 }
1626 
1627 void GroupByAndAggregate::codegenCountDistinct(
1628  const size_t target_idx,
1629  const Analyzer::Expr* target_expr,
1630  std::vector<llvm::Value*>& agg_args,
1631  const QueryMemoryDescriptor& query_mem_desc,
1632  const ExecutorDeviceType device_type) {
1633  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1634  const auto agg_info = get_target_info(target_expr, g_bigint_count);
1635  const auto& arg_ti =
1636  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1637  if (arg_ti.is_fp()) {
1638  agg_args.back() = executor_->cgen_state_->ir_builder_.CreateBitCast(
1639  agg_args.back(), get_int_type(64, executor_->cgen_state_->context_));
1640  }
1641  const auto& count_distinct_descriptor =
1642  query_mem_desc.getCountDistinctDescriptor(target_idx);
1643  CHECK(count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid);
1644  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
1645  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1646  agg_args.push_back(LL_INT(int32_t(count_distinct_descriptor.bitmap_sz_bits)));
1647  if (device_type == ExecutorDeviceType::GPU) {
1648  const auto base_dev_addr = getAdditionalLiteral(-1);
1649  const auto base_host_addr = getAdditionalLiteral(-2);
1650  agg_args.push_back(base_dev_addr);
1651  agg_args.push_back(base_host_addr);
1652  emitCall("agg_approximate_count_distinct_gpu", agg_args);
1653  } else {
1654  emitCall("agg_approximate_count_distinct", agg_args);
1655  }
1656  return;
1657  }
1658  std::string agg_fname{"agg_count_distinct"};
1659  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1660  agg_fname += "_bitmap";
1661  agg_args.push_back(LL_INT(static_cast<int64_t>(count_distinct_descriptor.min_val)));
1662  }
1663  if (agg_info.skip_null_val) {
1664  auto null_lv = executor_->cgen_state_->castToTypeIn(
1665  (arg_ti.is_fp()
1666  ? static_cast<llvm::Value*>(executor_->cgen_state_->inlineFpNull(arg_ti))
1667  : static_cast<llvm::Value*>(executor_->cgen_state_->inlineIntNull(arg_ti))),
1668  64);
1669  null_lv = executor_->cgen_state_->ir_builder_.CreateBitCast(
1670  null_lv, get_int_type(64, executor_->cgen_state_->context_));
1671  agg_fname += "_skip_val";
1672  agg_args.push_back(null_lv);
1673  }
1674  if (device_type == ExecutorDeviceType::GPU) {
1675  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1676  agg_fname += "_gpu";
1677  const auto base_dev_addr = getAdditionalLiteral(-1);
1678  const auto base_host_addr = getAdditionalLiteral(-2);
1679  agg_args.push_back(base_dev_addr);
1680  agg_args.push_back(base_host_addr);
1681  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.sub_bitmap_count)));
1682  CHECK_EQ(size_t(0),
1683  count_distinct_descriptor.bitmapPaddedSizeBytes() %
1684  count_distinct_descriptor.sub_bitmap_count);
1685  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.bitmapPaddedSizeBytes() /
1686  count_distinct_descriptor.sub_bitmap_count)));
1687  }
1688  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1689  emitCall(agg_fname, agg_args);
1690  } else {
1691  executor_->cgen_state_->emitExternalCall(
1692  agg_fname, llvm::Type::getVoidTy(LL_CONTEXT), agg_args);
1693  }
1694 }
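// Sketch of how the runtime function name above is assembled; the bools stand in for the
// CountDistinctDescriptor / TargetInfo fields the real code consults and are assumptions
// made purely for illustration.
inline std::string count_distinct_fname_sketch(const bool bitmap_impl,
                                               const bool skip_null_val,
                                               const bool on_gpu) {
  std::string fname{"agg_count_distinct"};
  if (bitmap_impl) {
    fname += "_bitmap";    // bitmap variants also receive min_val as an extra argument
  }
  if (skip_null_val) {
    fname += "_skip_val";  // a 64-bit null sentinel is appended to the argument list
  }
  if (on_gpu) {
    fname += "_gpu";       // GPU adds base addresses and sub-bitmap sizing arguments
  }
  return fname;
}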
1695 
1696 void GroupByAndAggregate::codegenApproxQuantile(
1697  const size_t target_idx,
1698  const Analyzer::Expr* target_expr,
1699  std::vector<llvm::Value*>& agg_args,
1700  const QueryMemoryDescriptor& query_mem_desc,
1701  const ExecutorDeviceType device_type) {
1702  if (device_type == ExecutorDeviceType::GPU) {
1703  throw QueryMustRunOnCpu();
1704  }
1705  llvm::BasicBlock *calc, *skip;
1706  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1707  auto const arg_ti =
1708  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1709  bool const nullable = !arg_ti.get_notnull();
1710 
1711  auto* cs = executor_->cgen_state_.get();
1712  auto& irb = cs->ir_builder_;
1713  if (nullable) {
1714  auto* const null_value = cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
1715  auto* const skip_cond = arg_ti.is_fp()
1716  ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
1717  : irb.CreateICmpEQ(agg_args.back(), null_value);
1718  calc = llvm::BasicBlock::Create(cs->context_, "calc_approx_quantile");
1719  skip = llvm::BasicBlock::Create(cs->context_, "skip_approx_quantile");
1720  irb.CreateCondBr(skip_cond, skip, calc);
1721  cs->current_func_->getBasicBlockList().push_back(calc);
1722  irb.SetInsertPoint(calc);
1723  }
1724  if (!arg_ti.is_fp()) {
1725  auto const agg_info = get_target_info(target_expr, g_bigint_count);
1726  agg_args.back() = executor_->castToFP(agg_args.back(), arg_ti, agg_info.sql_type);
1727  }
1728  cs->emitExternalCall(
1729  "agg_approx_quantile", llvm::Type::getVoidTy(cs->context_), agg_args);
1730  if (nullable) {
1731  irb.CreateBr(skip);
1732  cs->current_func_->getBasicBlockList().push_back(skip);
1733  irb.SetInsertPoint(skip);
1734  }
1735 }
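// Scalar sketch of the diamond generated above for a nullable argument: null inputs
// branch around the t-digest update. The double sentinel is a placeholder for the
// inlined null value; integer arguments are cast to fp64 before the update, as in the
// code above.
inline void approx_quantile_update_sketch(int64_t* agg_slot,
                                          const double val,
                                          const double null_sentinel) {
  if (val == null_sentinel) {
    return;  // "skip_approx_quantile" block
  }
  agg_approx_quantile(agg_slot, val);  // "calc_approx_quantile" block
}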
1736 
1737 llvm::Value* GroupByAndAggregate::getAdditionalLiteral(const int32_t off) {
1738  CHECK_LT(off, 0);
1739  const auto lit_buff_lv = get_arg_by_name(ROW_FUNC, "literals");
1740  return LL_BUILDER.CreateLoad(LL_BUILDER.CreateGEP(
1741  LL_BUILDER.CreateBitCast(lit_buff_lv,
1742  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)),
1743  LL_INT(off)));
1744 }
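// What the load above computes, in scalar form: additional literals (e.g. the device and
// host base addresses used by the GPU count-distinct path) are read at negative 64-bit
// offsets from the "literals" argument. The helper name is illustrative only.
inline int64_t read_additional_literal_sketch(const int8_t* literal_buffer,
                                              const int32_t off) {
  // off < 0, so this indexes the slots laid out just before the nominal buffer start
  return reinterpret_cast<const int64_t*>(literal_buffer)[off];
}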
1745 
1746 std::vector<llvm::Value*> GroupByAndAggregate::codegenAggArg(
1747  const Analyzer::Expr* target_expr,
1748  const CompilationOptions& co) {
1749  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1750  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
1751  const auto func_expr = dynamic_cast<const Analyzer::FunctionOper*>(target_expr);
1752  const auto arr_expr = dynamic_cast<const Analyzer::ArrayExpr*>(target_expr);
1753 
1754  // TODO(alex): handle arrays uniformly?
1755  CodeGenerator code_generator(executor_);
1756  if (target_expr) {
1757  const auto& target_ti = target_expr->get_type_info();
1758  if (target_ti.is_buffer() &&
1759  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
1760  const auto target_lvs =
1761  agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
1762  : code_generator.codegen(
1763  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
1764  if (!func_expr && !arr_expr) {
1765  // The target uses the chunk transport, i.e. it was generated from a source
1766  // other than an ARRAY[] expression.
1767  if (target_ti.is_bytes()) {
1768  CHECK_EQ(size_t(3), target_lvs.size());
1769  return {target_lvs[1], target_lvs[2]};
1770  }
1771  CHECK(target_ti.is_array());
1772  CHECK_EQ(size_t(1), target_lvs.size());
1773  CHECK(!agg_expr || agg_expr->get_aggtype() == kSAMPLE);
1774  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
1775  const auto i8p_ty =
1776  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
1777  const auto& elem_ti = target_ti.get_elem_type();
1778  return {
1779  executor_->cgen_state_->emitExternalCall(
1780  "array_buff",
1781  i8p_ty,
1782  {target_lvs.front(), code_generator.posArg(target_expr)}),
1783  executor_->cgen_state_->emitExternalCall(
1784  "array_size",
1785  i32_ty,
1786  {target_lvs.front(),
1787  code_generator.posArg(target_expr),
1788  executor_->cgen_state_->llInt(log2_bytes(elem_ti.get_logical_size()))})};
1789  } else {
1790  if (agg_expr) {
1791  throw std::runtime_error(
1792  "Using array[] operator as argument to an aggregate operator is not "
1793  "supported");
1794  }
1795  CHECK(func_expr || arr_expr);
1796  if (dynamic_cast<const Analyzer::FunctionOper*>(target_expr)) {
1797  CHECK_EQ(size_t(1), target_lvs.size());
1798  const auto prefix = target_ti.get_buffer_name();
1799  CHECK(target_ti.is_array() || target_ti.is_bytes());
1800  const auto target_lv = LL_BUILDER.CreateLoad(target_lvs[0]);
1801  // const auto target_lv_type = target_lvs[0]->getType();
1802  // CHECK(target_lv_type->isStructTy());
1803  // CHECK_EQ(target_lv_type->getNumContainedTypes(), 3u);
1804  const auto i8p_ty = llvm::PointerType::get(
1805  get_int_type(8, executor_->cgen_state_->context_), 0);
1806  const auto ptr = LL_BUILDER.CreatePointerCast(
1807  LL_BUILDER.CreateExtractValue(target_lv, 0), i8p_ty);
1808  const auto size = LL_BUILDER.CreateExtractValue(target_lv, 1);
1809  const auto null_flag = LL_BUILDER.CreateExtractValue(target_lv, 2);
1810  const auto nullcheck_ok_bb =
1811  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_nullcheck_ok_bb", CUR_FUNC);
1812  const auto nullcheck_fail_bb = llvm::BasicBlock::Create(
1813  LL_CONTEXT, prefix + "_nullcheck_fail_bb", CUR_FUNC);
1814 
1815  // TODO(adb): probably better to zext the bool
1816  const auto nullcheck = LL_BUILDER.CreateICmpEQ(
1817  null_flag, executor_->cgen_state_->llInt(static_cast<int8_t>(1)));
1818  LL_BUILDER.CreateCondBr(nullcheck, nullcheck_fail_bb, nullcheck_ok_bb);
1819 
1820  const auto ret_bb =
1821  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_return", CUR_FUNC);
1822  LL_BUILDER.SetInsertPoint(ret_bb);
1823  auto result_phi = LL_BUILDER.CreatePHI(i8p_ty, 2, prefix + "_ptr_return");
1824  result_phi->addIncoming(ptr, nullcheck_ok_bb);
1825  const auto null_arr_sentinel = LL_BUILDER.CreateIntToPtr(
1826  executor_->cgen_state_->llInt(static_cast<int8_t>(0)), i8p_ty);
1827  result_phi->addIncoming(null_arr_sentinel, nullcheck_fail_bb);
1828  LL_BUILDER.SetInsertPoint(nullcheck_ok_bb);
1829  executor_->cgen_state_->emitExternalCall(
1830  "register_buffer_with_executor_rsm",
1831  llvm::Type::getVoidTy(executor_->cgen_state_->context_),
1832  {executor_->cgen_state_->llInt(reinterpret_cast<int64_t>(executor_)), ptr});
1833  LL_BUILDER.CreateBr(ret_bb);
1834  LL_BUILDER.SetInsertPoint(nullcheck_fail_bb);
1835  LL_BUILDER.CreateBr(ret_bb);
1836 
1837  LL_BUILDER.SetInsertPoint(ret_bb);
1838  return {result_phi, size};
1839  }
1840  CHECK_EQ(size_t(2), target_lvs.size());
1841  return {target_lvs[0], target_lvs[1]};
1842  }
1843  }
1844  if (target_ti.is_geometry() &&
1845  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
1846  auto generate_coord_lvs =
1847  [&](auto* selected_target_expr,
1848  bool const fetch_columns) -> std::vector<llvm::Value*> {
1849  const auto target_lvs =
1850  code_generator.codegen(selected_target_expr, fetch_columns, co);
1851  if (dynamic_cast<const Analyzer::GeoOperator*>(target_expr) &&
1852  target_expr->get_type_info().is_geometry()) {
1853  // return a pointer to the temporary alloca
1854  return target_lvs;
1855  }
1856  const auto geo_uoper = dynamic_cast<const Analyzer::GeoUOper*>(target_expr);
1857  const auto geo_binoper = dynamic_cast<const Analyzer::GeoBinOper*>(target_expr);
1858  if (geo_uoper || geo_binoper) {
1859  CHECK(target_expr->get_type_info().is_geometry());
1860  CHECK_EQ(2 * static_cast<size_t>(target_ti.get_physical_coord_cols()),
1861  target_lvs.size());
1862  return target_lvs;
1863  }
1864  CHECK_EQ(static_cast<size_t>(target_ti.get_physical_coord_cols()),
1865  target_lvs.size());
1866 
1867  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
1868  const auto i8p_ty =
1869  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
1870  std::vector<llvm::Value*> coords;
1871  size_t ctr = 0;
1872  for (const auto& target_lv : target_lvs) {
1873  // TODO(adb): consider adding a utility to sqltypes so we can get the types of
1874  // the physical coords cols based on the sqltype (e.g. TINYINT for col 0, INT
1875  // for col 1 for pols / mpolys, etc). Hardcoding for now. The first array is the
1876  // coords array (TINYINT); subsequent arrays are regular INT.
1877 
1878  const size_t elem_sz = ctr == 0 ? 1 : 4;
1879  ctr++;
1880  int32_t fixlen = -1;
1881  if (target_ti.get_type() == kPOINT) {
1882  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
1883  if (col_var) {
1884  const auto coords_cd = executor_->getPhysicalColumnDescriptor(col_var, 1);
1885  if (coords_cd && coords_cd->columnType.get_type() == kARRAY) {
1886  fixlen = coords_cd->columnType.get_size();
1887  }
1888  }
1889  }
1890  if (fixlen > 0) {
1891  coords.push_back(executor_->cgen_state_->emitExternalCall(
1892  "fast_fixlen_array_buff",
1893  i8p_ty,
1894  {target_lv, code_generator.posArg(selected_target_expr)}));
1895  coords.push_back(executor_->cgen_state_->llInt(int64_t(fixlen)));
1896  continue;
1897  }
1898  coords.push_back(executor_->cgen_state_->emitExternalCall(
1899  "array_buff",
1900  i8p_ty,
1901  {target_lv, code_generator.posArg(selected_target_expr)}));
1902  coords.push_back(executor_->cgen_state_->emitExternalCall(
1903  "array_size",
1904  i32_ty,
1905  {target_lv,
1906  code_generator.posArg(selected_target_expr),
1907  executor_->cgen_state_->llInt(log2_bytes(elem_sz))}));
1908  }
1909  return coords;
1910  };
1911 
1912  if (agg_expr) {
1913  return generate_coord_lvs(agg_expr->get_arg(), true);
1914  } else {
1915  return generate_coord_lvs(target_expr,
1916  !executor_->plan_state_->allow_lazy_fetch_);
1917  }
1918  }
1919  }
1920  return agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
1921  : code_generator.codegen(
1922  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
1923 }
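// Sketch of the value conventions the function above returns: scalar targets yield a
// single llvm::Value*, varlen (array / bytes) targets yield a {buffer, size} pair, and
// geometry targets yield one such pair per physical coord column. The struct below is a
// stand-in for those pairs, not a type used by the engine.
struct VarlenTargetSketch {
  int8_t* buffer;  // from array_buff() / fast_fixlen_array_buff()
  int32_t size;    // from array_size(), or the fixed length for fixlen POINT coords
};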
1924 
1925 llvm::Value* GroupByAndAggregate::emitCall(const std::string& fname,
1926  const std::vector<llvm::Value*>& args) {
1927  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1928  return executor_->cgen_state_->emitCall(fname, args);
1929 }
1930 
1931 void GroupByAndAggregate::checkErrorCode(llvm::Value* retCode) {
1932  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1933  auto zero_const = llvm::ConstantInt::get(retCode->getType(), 0, true);
1934  auto rc_check_condition = executor_->cgen_state_->ir_builder_.CreateICmp(
1935  llvm::ICmpInst::ICMP_EQ, retCode, zero_const);
1936 
1937  executor_->cgen_state_->emitErrorCheck(rc_check_condition, retCode, "rc");
1938 }
1939 
1940 #undef CUR_FUNC
1941 #undef ROW_FUNC
1942 #undef LL_FP
1943 #undef LL_INT
1944 #undef LL_BOOL
1945 #undef LL_BUILDER
1946 #undef LL_CONTEXT
1947 
1948 size_t GroupByAndAggregate::shard_count_for_top_groups(
1949  const RelAlgExecutionUnit& ra_exe_unit,
1950  const Catalog_Namespace::Catalog& catalog) {
1951  if (ra_exe_unit.sort_info.order_entries.size() != 1 || !ra_exe_unit.sort_info.limit) {
1952  return 0;
1953  }
1954  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
1955  const auto grouped_col_expr =
1956  dynamic_cast<const Analyzer::ColumnVar*>(group_expr.get());
1957  if (!grouped_col_expr) {
1958  continue;
1959  }
1960  if (grouped_col_expr->get_table_id() <= 0) {
1961  return 0;
1962  }
1963  const auto td = catalog.getMetadataForTable(grouped_col_expr->get_table_id());
1964  if (td->shardedColumnId == grouped_col_expr->get_column_id()) {
1965  return td->nShards;
1966  }
1967  }
1968  return 0;
1969 }
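// Hedged restatement of the predicate above as a standalone helper: the shard count is
// only propagated for single-column ORDER BY queries with a LIMIT that group by the
// table's shard column; every other query shape falls back to 0. Parameter names are
// illustrative assumptions.
inline size_t shard_count_if_applicable_sketch(const bool single_order_entry,
                                               const bool has_limit,
                                               const bool grouped_on_shard_column,
                                               const size_t n_shards) {
  return (single_order_entry && has_limit && grouped_on_shard_column) ? n_shards : 0;
}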