OmniSciDB  8fa3bf436f
GroupByAndAggregate.cpp
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "GroupByAndAggregate.h"
18 #include "AggregateUtils.h"
19 
20 #include "CardinalityEstimator.h"
21 #include "CodeGenerator.h"
23 #include "ExpressionRange.h"
24 #include "ExpressionRewrite.h"
25 #include "GpuInitGroups.h"
26 #include "InPlaceSort.h"
28 #include "MaxwellCodegenPatch.h"
30 #include "TargetExprBuilder.h"
31 
32 #include "../CudaMgr/CudaMgr.h"
33 #include "../Shared/checked_alloc.h"
34 #include "../Shared/funcannotations.h"
35 #include "../Utils/ChunkIter.h"
37 #include "Execute.h"
38 #include "QueryTemplateGenerator.h"
39 #include "RuntimeFunctions.h"
40 #include "StreamingTopN.h"
41 #include "TopKSort.h"
42 #include "WindowContext.h"
43 
44 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
45 
46 #include <cstring> // strcat()
47 #include <numeric>
48 #include <string_view>
49 #include <thread>
50 
51 bool g_cluster{false};
52 bool g_bigint_count{false};
54 extern size_t g_leaf_count;
55 
56 namespace {
57 
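// Counts the aggregate output slots needed by the target expressions.
// Worked example (illustrative, not from the original source):
//   SELECT AVG(x), COUNT(*), APPROX_COUNT_DISTINCT(y) FROM t;
// AVG(x) needs 2 slots (running sum and count), COUNT(*) needs 1, and the
// approximate count distinct needs 1, so get_agg_count() returns 4. Buffer-like
// targets (arrays, none-encoded strings) take 2 slots (pointer and length) and
// geometry targets take 2 slots per physical coordinate column.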
58 int32_t get_agg_count(const std::vector<Analyzer::Expr*>& target_exprs) {
59  int32_t agg_count{0};
60  for (auto target_expr : target_exprs) {
61  CHECK(target_expr);
62  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
63  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
64  const auto& ti = target_expr->get_type_info();
65  // TODO(pavan): or if is_geometry()
66  if (ti.is_buffer()) {
67  agg_count += 2;
68  } else if (ti.is_geometry()) {
69  agg_count += ti.get_physical_coord_cols() * 2;
70  } else {
71  ++agg_count;
72  }
73  continue;
74  }
75  if (agg_expr && agg_expr->get_aggtype() == kAVG) {
76  agg_count += 2;
77  } else {
78  ++agg_count;
79  }
80  }
81  return agg_count;
82 }
83 
84 bool expr_is_rowid(const Analyzer::Expr* expr, const Catalog_Namespace::Catalog& cat) {
85  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
86  if (!col) {
87  return false;
88  }
89  const auto cd =
90  get_column_descriptor_maybe(col->get_column_id(), col->get_table_id(), cat);
91  if (!cd || !cd->isVirtualCol) {
92  return false;
93  }
94  CHECK_EQ("rowid", cd->columnName);
95  return true;
96 }
97 
98 bool has_count_distinct(const RelAlgExecutionUnit& ra_exe_unit) {
99  for (const auto& target_expr : ra_exe_unit.target_exprs) {
100  const auto agg_info = get_target_info(target_expr, g_bigint_count);
101  if (agg_info.is_agg && is_distinct_target(agg_info)) {
102  return true;
103  }
104  }
105  return false;
106 }
107 
108 bool is_column_range_too_big_for_perfect_hash(const ColRangeInfo& col_range_info,
109  const int64_t max_entry_count) {
110  try {
111  return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
112  checked_int64_t(col_range_info.min)) >= max_entry_count;
113  } catch (...) {
114  return true;
115  }
116 }
117 
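// Decides whether the estimated cardinality of the filtered input is small
// enough, relative to the column range, to justify a baseline hash table.
// Illustrative numbers (not from the original source): with a column range of
// [0, 1,000,000) and a cardinality estimate of 200,000, 2 * 200,000 = 400,000
// is less than 1,000,000, so baseline hash is considered; the in-body comment
// below explains how this amounts to an overall 4x penalty on baseline hash.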
118 bool cardinality_estimate_less_than_column_range(const int64_t cardinality_estimate,
119  const ColRangeInfo& col_range_info) {
120  try {
121  // the cardinality estimate is the size of the baseline hash table. further penalize
122  // the baseline hash table by a factor of 2x due to overhead in computing baseline
123  // hash. This has the overall effect of penalizing baseline hash over perfect hash by
124  // 4x; i.e. if the cardinality of the filtered data is less than 25% of the entry
125  // count of the column, we use baseline hash on the filtered set
126  return checked_int64_t(cardinality_estimate) * 2 <
127  static_cast<int64_t>(checked_int64_t(col_range_info.max) -
128  checked_int64_t(col_range_info.min));
129  } catch (...) {
130  return false;
131  }
132 }
133 
134 ColRangeInfo get_expr_range_info(const RelAlgExecutionUnit& ra_exe_unit,
135  const std::vector<InputTableInfo>& query_infos,
136  const Analyzer::Expr* expr,
137  Executor* executor) {
138  if (!expr) {
139  return {QueryDescriptionType::Projection, 0, 0, 0, false};
140  }
141 
142  const auto expr_range = getExpressionRange(
143  expr, query_infos, executor, boost::make_optional(ra_exe_unit.simple_quals));
144  switch (expr_range.getType()) {
145  case ExpressionRangeType::Integer: {
146  if (expr_range.getIntMin() > expr_range.getIntMax()) {
147  return {
148  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
149  }
150  return {QueryDescriptionType::GroupByPerfectHash,
151  expr_range.getIntMin(),
152  expr_range.getIntMax(),
153  expr_range.getBucket(),
154  expr_range.hasNulls()};
155  }
156  case ExpressionRangeType::Float:
157  case ExpressionRangeType::Double: {
158  if (expr_range.getFpMin() > expr_range.getFpMax()) {
159  return {
160  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
161  }
162  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
163  }
164  case ExpressionRangeType::Invalid:
165  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
166  default:
167  CHECK(false);
168  }
169  CHECK(false);
170  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
171 }
172 
173 } // namespace
174 
175 ColRangeInfo GroupByAndAggregate::getColRangeInfo() {
176  // Use baseline layout more eagerly on the GPU if the query uses count distinct,
177  // because our HyperLogLog implementation is 4x less memory efficient on GPU.
178  // Technically, this only applies to APPROX_COUNT_DISTINCT, but in practice we
179  // can expect this to be true anyway for grouped queries since the precise version
180  // uses significantly more memory.
181  const int64_t baseline_threshold =
182  has_count_distinct(ra_exe_unit_)
183  ? (device_type_ == ExecutorDeviceType::GPU ? (Executor::baseline_threshold / 4)
184  : Executor::baseline_threshold)
185  : Executor::baseline_threshold;
186  if (ra_exe_unit_.groupby_exprs.size() != 1) {
187  try {
188  checked_int64_t cardinality{1};
189  bool has_nulls{false};
190  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
191  auto col_range_info = get_expr_range_info(
192  ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
193  if (col_range_info.hash_type_ != QueryDescriptionType::GroupByPerfectHash) {
194  // going through baseline hash if a non-integer type is encountered
195  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
196  }
197  auto crt_col_cardinality = getBucketedCardinality(col_range_info);
198  CHECK_GE(crt_col_cardinality, 0);
199  cardinality *= crt_col_cardinality;
200  if (col_range_info.has_nulls) {
201  has_nulls = true;
202  }
203  }
204  // For zero or high cardinalities, use baseline layout.
205  if (!cardinality || cardinality > baseline_threshold) {
206  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
207  }
208  return {QueryDescriptionType::GroupByPerfectHash,
209  0,
210  int64_t(cardinality),
211  0,
212  has_nulls};
213  } catch (...) { // overflow when computing cardinality
214  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
215  }
216  }
217  // For single column groupby on high timestamps, force baseline hash due to wide ranges
218  // we are likely to encounter when applying quals to the expression range
219  // TODO: consider allowing TIMESTAMP(9) (nanoseconds) with quals to use perfect hash if
220  // the range is small enough
221  if (ra_exe_unit_.groupby_exprs.front() &&
222  ra_exe_unit_.groupby_exprs.front()->get_type_info().is_high_precision_timestamp() &&
223  ra_exe_unit_.simple_quals.size() > 0) {
224  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
225  }
226  const auto col_range_info = get_expr_range_info(
227  ra_exe_unit_, query_infos_, ra_exe_unit_.groupby_exprs.front().get(), executor_);
228  if (!ra_exe_unit_.groupby_exprs.front()) {
229  return col_range_info;
230  }
231  static const int64_t MAX_BUFFER_SIZE = 1 << 30;
232  const int64_t col_count =
233  ra_exe_unit_.groupby_exprs.size() + ra_exe_unit_.target_exprs.size();
234  int64_t max_entry_count = MAX_BUFFER_SIZE / (col_count * sizeof(int64_t));
235  if (has_count_distinct(ra_exe_unit_)) {
236  max_entry_count = std::min(max_entry_count, baseline_threshold);
237  }
238  const auto& groupby_expr_ti = ra_exe_unit_.groupby_exprs.front()->get_type_info();
239  if (groupby_expr_ti.is_string() && !col_range_info.bucket) {
240  CHECK(groupby_expr_ti.get_compression() == kENCODING_DICT);
241 
242  const bool has_filters =
243  !ra_exe_unit_.quals.empty() || !ra_exe_unit_.simple_quals.empty();
244  if (has_filters &&
245  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
246  // if filters are present, we can use the filter to narrow the cardinality of the
247  // group by in the case of ranges too big for perfect hash. Otherwise, we are better
248  // off attempting perfect hash (since we know the range will be made of
249  // monotonically increasing numbers from min to max for dictionary encoded strings)
250  // and failing later due to excessive memory use.
251  // Check the conditions where baseline hash can provide a performance increase and
252  // return baseline hash (potentially forcing an estimator query) as the range type.
253  // Otherwise, return col_range_info which will likely be perfect hash, though could
254  // be baseline from a previous call of this function prior to the estimator query.
255  if (!ra_exe_unit_.sort_info.order_entries.empty()) {
256  // TODO(adb): allow some sorts to pass through this block by centralizing sort
257  // algorithm decision making
258  if (has_count_distinct(ra_exe_unit_) &&
259  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
260  // always use baseline hash for column range too big for perfect hash with count
261  // distinct descriptors. We will need 8GB of CPU memory minimum for the perfect
262  // hash group by in this case.
263  return {QueryDescriptionType::GroupByBaselineHash,
264  col_range_info.min,
265  col_range_info.max,
266  0,
267  col_range_info.has_nulls};
268  } else {
269  // use original col range for sort
270  return col_range_info;
271  }
272  }
273  // if filters are present and the filtered range is less than the cardinality of
274  // the column, consider baseline hash
275  if (group_cardinality_estimation_ &&
276  cardinality_estimate_less_than_column_range(*group_cardinality_estimation_,
277  col_range_info)) {
278  return {QueryDescriptionType::GroupByBaselineHash,
279  col_range_info.min,
280  col_range_info.max,
281  0,
282  col_range_info.has_nulls};
283  }
284  }
285  } else if ((!expr_is_rowid(ra_exe_unit_.groupby_exprs.front().get(),
286  *executor_->catalog_)) &&
287  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count) &&
288  !col_range_info.bucket) {
289  return {QueryDescriptionType::GroupByBaselineHash,
290  col_range_info.min,
291  col_range_info.max,
292  0,
293  col_range_info.has_nulls};
294  }
295  return col_range_info;
296 }
297 
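// Bucketed cardinality of a column range: ((max - min) / bucket) + 1, plus one
// extra entry when the range contains NULLs. Illustrative example: a range of
// [16, 64] with bucket 8 and nulls gives (64 - 16) / 8 + 1 + 1 = 8 entries.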
298 int64_t GroupByAndAggregate::getBucketedCardinality(const ColRangeInfo& col_range_info) {
299  checked_int64_t crt_col_cardinality =
300  checked_int64_t(col_range_info.max) - checked_int64_t(col_range_info.min);
301  if (col_range_info.bucket) {
302  crt_col_cardinality /= col_range_info.bucket;
303  }
304  return static_cast<int64_t>(crt_col_cardinality +
305  (1 + (col_range_info.has_nulls ? 1 : 0)));
306 }
307 
308 #define LL_CONTEXT executor_->cgen_state_->context_
309 #define LL_BUILDER executor_->cgen_state_->ir_builder_
310 #define LL_BOOL(v) executor_->cgen_state_->llBool(v)
311 #define LL_INT(v) executor_->cgen_state_->llInt(v)
312 #define LL_FP(v) executor_->cgen_state_->llFp(v)
313 #define ROW_FUNC executor_->cgen_state_->row_func_
314 #define CUR_FUNC executor_->cgen_state_->current_func_
315 
316 GroupByAndAggregate::GroupByAndAggregate(
317  Executor* executor,
318  const ExecutorDeviceType device_type,
319  const RelAlgExecutionUnit& ra_exe_unit,
320  const std::vector<InputTableInfo>& query_infos,
321  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
322  const std::optional<int64_t>& group_cardinality_estimation)
323  : executor_(executor)
324  , ra_exe_unit_(ra_exe_unit)
325  , query_infos_(query_infos)
326  , row_set_mem_owner_(row_set_mem_owner)
327  , device_type_(device_type)
328  , group_cardinality_estimation_(group_cardinality_estimation) {
329  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
330  if (!groupby_expr) {
331  continue;
332  }
333  const auto& groupby_ti = groupby_expr->get_type_info();
334  if (groupby_ti.is_bytes()) {
335  throw std::runtime_error(
336  "Cannot group by string columns which are not dictionary encoded.");
337  }
338  if (groupby_ti.is_buffer()) {
339  throw std::runtime_error("Group by buffer not supported");
340  }
341  if (groupby_ti.is_geometry()) {
342  throw std::runtime_error("Group by geometry not supported");
343  }
344  }
345 }
346 
347 int64_t GroupByAndAggregate::getShardedTopBucket(const ColRangeInfo& col_range_info,
348  const size_t shard_count) const {
349  size_t device_count{0};
350  if (device_type_ == ExecutorDeviceType::GPU) {
351  auto cuda_mgr = executor_->getCatalog()->getDataMgr().getCudaMgr();
352  CHECK(cuda_mgr);
353  device_count = executor_->getCatalog()->getDataMgr().getCudaMgr()->getDeviceCount();
354  CHECK_GT(device_count, 0u);
355  }
356 
357  int64_t bucket{col_range_info.bucket};
358 
359  if (shard_count) {
360  CHECK(!col_range_info.bucket);
361  /*
362  when a node has fewer devices than shard count,
363  a) In a distributed setup, the minimum distance between two keys would be
364  device_count because shards are stored consecutively across the physical tables,
365  i.e if a shard column has values 0 to 9, and 3 shards on each leaf, then node 1
366  would have values: 0,1,2,6,7,8 and node 2 would have values: 3,4,5,9. If each leaf
367  node has only 1 device, in this case, all the keys from each node are loaded on
368  the device each.
369 
370  b) In a single node setup, the distance would be minimum of device_count or
371  difference of device_count - shard_count. For example: If a single node server
372  running on 3 devices a shard column has values 0 to 9 in a table with 4 shards,
373  device to fragment keys mapping would be: device 1 - 4,8,3,7 device 2 - 1,5,9
374  device 3 - 2, 6 The bucket value would be 4(shards) - 3(devices) = 1 i.e. minimum
375  of device_count or difference.
376 
377  When a node has device count equal to or more than shard count then the
378  minimum distance is always at least shard_count * no of leaf nodes.
379  */
380  if (device_count < shard_count) {
381  bucket = g_leaf_count ? std::max(device_count, static_cast<size_t>(1))
382  : std::min(device_count, shard_count - device_count);
383  } else {
384  bucket = shard_count * std::max(g_leaf_count, static_cast<size_t>(1));
385  }
386  }
387 
388  return bucket;
389 }
390 
391 namespace {
392 
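// Determines whether the query can use a "keyless" hash layout, where the
// group key is not materialized and one aggregate slot doubles as the
// empty-entry marker; the returned index is the position of that aggregate.
// This only works when the chosen aggregate has an initial value that real
// data can never produce (e.g. COUNT over a non-null argument, or MIN/MAX
// whose expression range excludes the init value), as checked per target below.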
402 KeylessInfo get_keyless_info(const RelAlgExecutionUnit& ra_exe_unit,
403  const std::vector<InputTableInfo>& query_infos,
404  const bool is_group_by,
405  Executor* executor) {
406  bool keyless{true}, found{false};
407  int32_t num_agg_expr{0};
408  int32_t index{0};
409  for (const auto target_expr : ra_exe_unit.target_exprs) {
410  const auto agg_info = get_target_info(target_expr, g_bigint_count);
411  const auto chosen_type = get_compact_type(agg_info);
412  if (agg_info.is_agg) {
413  num_agg_expr++;
414  }
415  if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
416  auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
417  CHECK(agg_expr);
418  const auto arg_expr = agg_arg(target_expr);
419  const bool float_argument_input = takes_float_argument(agg_info);
420  switch (agg_info.agg_kind) {
421  case kAVG:
422  ++index;
423  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
424  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
425  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
426  expr_range_info.hasNulls()) {
427  break;
428  }
429  }
430  found = true;
431  break;
432  case kCOUNT:
433  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
434  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
435  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
436  expr_range_info.hasNulls()) {
437  break;
438  }
439  }
440  found = true;
441  break;
442  case kSUM: {
443  auto arg_ti = arg_expr->get_type_info();
444  if (constrained_not_null(arg_expr, ra_exe_unit.quals)) {
445  arg_ti.set_notnull(true);
446  }
447  if (!arg_ti.get_notnull()) {
448  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
449  if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
450  !expr_range_info.hasNulls()) {
451  found = true;
452  }
453  } else {
454  auto expr_range_info = getExpressionRange(arg_expr, query_infos, executor);
455  switch (expr_range_info.getType()) {
456  case ExpressionRangeType::Float:
457  case ExpressionRangeType::Double:
458  if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
459  found = true;
460  }
461  break;
462  case ExpressionRangeType::Integer:
463  if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
464  found = true;
465  }
466  break;
467  default:
468  break;
469  }
470  }
471  break;
472  }
473  case kMIN: {
474  CHECK(agg_expr && agg_expr->get_arg());
475  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
476  if (arg_ti.is_string() || arg_ti.is_buffer()) {
477  break;
478  }
479  auto expr_range_info =
480  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
481  auto init_max = get_agg_initial_val(agg_info.agg_kind,
482  chosen_type,
483  is_group_by || float_argument_input,
484  float_argument_input ? sizeof(float) : 8);
485  switch (expr_range_info.getType()) {
486  case ExpressionRangeType::Float:
487  case ExpressionRangeType::Double: {
488  auto double_max =
489  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
490  if (expr_range_info.getFpMax() < double_max) {
491  found = true;
492  }
493  break;
494  }
495  case ExpressionRangeType::Integer:
496  if (expr_range_info.getIntMax() < init_max) {
497  found = true;
498  }
499  break;
500  default:
501  break;
502  }
503  break;
504  }
505  case kMAX: {
506  CHECK(agg_expr && agg_expr->get_arg());
507  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
508  if (arg_ti.is_string() || arg_ti.is_buffer()) {
509  break;
510  }
511  auto expr_range_info =
512  getExpressionRange(agg_expr->get_arg(), query_infos, executor);
513  // NULL sentinel and init value for kMAX are identical, which results in
514  // ambiguity in detecting empty keys in presence of nulls.
515  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
516  expr_range_info.hasNulls()) {
517  break;
518  }
519  auto init_min = get_agg_initial_val(agg_info.agg_kind,
520  chosen_type,
521  is_group_by || float_argument_input,
522  float_argument_input ? sizeof(float) : 8);
523  switch (expr_range_info.getType()) {
524  case ExpressionRangeType::Float:
525  case ExpressionRangeType::Double: {
526  auto double_min =
527  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
528  if (expr_range_info.getFpMin() > double_min) {
529  found = true;
530  }
531  break;
532  }
533  case ExpressionRangeType::Integer:
534  if (expr_range_info.getIntMin() > init_min) {
535  found = true;
536  }
537  break;
538  default:
539  break;
540  }
541  break;
542  }
543  default:
544  keyless = false;
545  break;
546  }
547  }
548  if (!keyless) {
549  break;
550  }
551  if (!found) {
552  ++index;
553  }
554  }
555 
556  // shouldn't use keyless for projection only
557  return {
558  keyless && found,
559  index,
560  };
561 }
562 
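// Builds one CountDistinctDescriptor per target expression. COUNT(DISTINCT x)
// over a perfect-hashable integer range uses a bitmap sized to that range,
// falling back to the StdSet implementation when the range is non-positive or
// exceeds the ~8-billion-bit cap; APPROX_COUNT_DISTINCT uses an HLL bitmap
// whose size comes from hll_size_for_rate() for an explicit error rate, or
// g_hll_precision_bits otherwise. Non-distinct targets get an Invalid
// placeholder descriptor, and with the watchdog enabled a StdSet choice for a
// non-empty range raises a WatchdogException.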
563 CountDistinctDescriptors init_count_distinct_descriptors(
564  const RelAlgExecutionUnit& ra_exe_unit,
565  const std::vector<InputTableInfo>& query_infos,
566  const ExecutorDeviceType device_type,
567  Executor* executor) {
568  CountDistinctDescriptors count_distinct_descriptors;
569  for (const auto target_expr : ra_exe_unit.target_exprs) {
570  auto agg_info = get_target_info(target_expr, g_bigint_count);
571  if (is_distinct_target(agg_info)) {
572  CHECK(agg_info.is_agg);
573  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
574  const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
575  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
576  if (arg_ti.is_bytes()) {
577  throw std::runtime_error(
578  "Strings must be dictionary-encoded for COUNT(DISTINCT).");
579  }
580  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_buffer()) {
581  throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
582  }
583  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
584  throw std::runtime_error(
585  "APPROX_COUNT_DISTINCT on geometry columns not supported");
586  }
587  if (agg_info.is_distinct && arg_ti.is_geometry()) {
588  throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
589  }
590  ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
591  auto arg_range_info =
592  arg_ti.is_fp() ? no_range_info
593  : get_expr_range_info(
594  ra_exe_unit, query_infos, agg_expr->get_arg(), executor);
595  CountDistinctImplType count_distinct_impl_type{CountDistinctImplType::StdSet};
596  int64_t bitmap_sz_bits{0};
597  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
598  const auto error_rate = agg_expr->get_error_rate();
599  if (error_rate) {
600  CHECK(error_rate->get_type_info().get_type() == kINT);
601  CHECK_GE(error_rate->get_constval().intval, 1);
602  bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
603  } else {
604  bitmap_sz_bits = g_hll_precision_bits;
605  }
606  }
607  if (arg_range_info.isEmpty()) {
608  count_distinct_descriptors.emplace_back(
609  CountDistinctDescriptor{CountDistinctImplType::Bitmap,
610  0,
611  64,
612  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
613  device_type,
614  1});
615  continue;
616  }
617  if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
618  !(arg_ti.is_buffer() || arg_ti.is_geometry())) { // TODO(alex): allow bitmap
619  // implementation for arrays
620  count_distinct_impl_type = CountDistinctImplType::Bitmap;
621  if (agg_info.agg_kind == kCOUNT) {
622  bitmap_sz_bits = arg_range_info.max - arg_range_info.min + 1;
623  const int64_t MAX_BITMAP_BITS{8 * 1000 * 1000 * 1000LL};
624  if (bitmap_sz_bits <= 0 || bitmap_sz_bits > MAX_BITMAP_BITS) {
625  count_distinct_impl_type = CountDistinctImplType::StdSet;
626  }
627  }
628  }
629  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
630  count_distinct_impl_type == CountDistinctImplType::StdSet &&
631  !(arg_ti.is_array() || arg_ti.is_geometry())) {
632  count_distinct_impl_type = CountDistinctImplType::Bitmap;
633  }
634 
635  if (g_enable_watchdog && !(arg_range_info.isEmpty()) &&
636  count_distinct_impl_type == CountDistinctImplType::StdSet) {
637  throw WatchdogException("Cannot use a fast path for COUNT distinct");
638  }
639  const auto sub_bitmap_count =
640  get_count_distinct_sub_bitmap_count(bitmap_sz_bits, ra_exe_unit, device_type);
641  count_distinct_descriptors.emplace_back(
642  CountDistinctDescriptor{count_distinct_impl_type,
643  arg_range_info.min,
644  bitmap_sz_bits,
645  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
646  device_type,
647  sub_bitmap_count});
648  } else {
649  count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
650  CountDistinctImplType::Invalid, 0, 0, false, device_type, 0});
651  }
652  }
653  return count_distinct_descriptors;
654 }
655 
656 } // namespace
657 
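// Entry point for building the QueryMemoryDescriptor. It first asks for a
// GPU-sortable layout when that looks possible; if the resulting output buffer
// plus the int32 index buffer used for GPU sort would exceed 2GB, it retries
// with sort_on_gpu_hint disabled and must_use_baseline_sort forced on.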
658 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptor(
659  const bool allow_multifrag,
660  const size_t max_groups_buffer_entry_count,
661  const int8_t crt_min_byte_width,
662  RenderInfo* render_info,
663  const bool output_columnar_hint) {
664  const auto shard_count =
667  : 0;
668  bool sort_on_gpu_hint =
669  device_type_ == ExecutorDeviceType::GPU && allow_multifrag &&
670  !ra_exe_unit_.sort_info.order_entries.empty() &&
671  gpuCanHandleOrderEntries(ra_exe_unit_.sort_info.order_entries) && !shard_count;
672  // must_use_baseline_sort is true iff we'd sort on GPU with the old algorithm
673  // but the total output buffer size would be too big or it's a sharded top query.
674  // For the sake of managing risk, use the new result set way very selectively for
675  // this case only (alongside the baseline layout we've enabled for a while now).
676  bool must_use_baseline_sort = shard_count;
677  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
678  while (true) {
679  query_mem_desc = initQueryMemoryDescriptorImpl(allow_multifrag,
680  max_groups_buffer_entry_count,
681  crt_min_byte_width,
682  sort_on_gpu_hint,
683  render_info,
684  must_use_baseline_sort,
685  output_columnar_hint);
686  CHECK(query_mem_desc);
687  if (query_mem_desc->sortOnGpu() &&
688  (query_mem_desc->getBufferSizeBytes(device_type_) +
689  align_to_int64(query_mem_desc->getEntryCount() * sizeof(int32_t))) >
690  2 * 1024 * 1024 * 1024LL) {
691  must_use_baseline_sort = true;
692  sort_on_gpu_hint = false;
693  } else {
694  break;
695  }
696  }
697  return query_mem_desc;
698 }
699 
700 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptorImpl(
701  const bool allow_multifrag,
702  const size_t max_groups_buffer_entry_count,
703  const int8_t crt_min_byte_width,
704  const bool sort_on_gpu_hint,
705  RenderInfo* render_info,
706  const bool must_use_baseline_sort,
707  const bool output_columnar_hint) {
709 
710  const auto count_distinct_descriptors = init_count_distinct_descriptors(
711  ra_exe_unit_, query_infos_, device_type_, executor_);
712 
713  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
714 
715  auto col_range_info_nosharding = getColRangeInfo();
716 
717  const auto shard_count =
718  device_type_ == ExecutorDeviceType::GPU
719  ? shard_count_for_top_groups(ra_exe_unit_, *executor_->getCatalog())
720  : 0;
721 
722  const auto col_range_info =
723  ColRangeInfo{col_range_info_nosharding.hash_type_,
724  col_range_info_nosharding.min,
725  col_range_info_nosharding.max,
726  getShardedTopBucket(col_range_info_nosharding, shard_count),
727  col_range_info_nosharding.has_nulls};
728 
729  // Non-grouped aggregates do not support accessing aggregated ranges
730  // Keyless hash is currently only supported with single-column perfect hash
731  const auto keyless_info =
732  !(is_group_by &&
733  col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash)
734  ? KeylessInfo{false, -1}
735  : get_keyless_info(ra_exe_unit_, query_infos_, is_group_by, executor_);
736 
737  if (g_enable_watchdog &&
738  ((col_range_info.hash_type_ == QueryDescriptionType::GroupByBaselineHash &&
739  max_groups_buffer_entry_count > 120000000) ||
740  (col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
741  ra_exe_unit_.groupby_exprs.size() == 1 &&
742  (col_range_info.max - col_range_info.min) /
743  std::max(col_range_info.bucket, int64_t(1)) >
744  130000000))) {
745  throw WatchdogException("Query would use too much memory");
746  }
747  try {
748  return QueryMemoryDescriptor::init(executor_,
749  ra_exe_unit_,
750  query_infos_,
751  col_range_info,
752  keyless_info,
753  allow_multifrag,
754  device_type_,
755  crt_min_byte_width,
756  sort_on_gpu_hint,
757  shard_count,
758  max_groups_buffer_entry_count,
759  render_info,
760  count_distinct_descriptors,
761  must_use_baseline_sort,
762  output_columnar_hint,
763  /*streaming_top_n_hint=*/true);
764  } catch (const StreamingTopNOOM& e) {
765  LOG(WARNING) << e.what() << " Disabling Streaming Top N.";
766  return QueryMemoryDescriptor::init(executor_,
767  ra_exe_unit_,
768  query_infos_,
769  col_range_info,
770  keyless_info,
771  allow_multifrag,
772  device_type_,
773  crt_min_byte_width,
774  sort_on_gpu_hint,
775  shard_count,
776  max_groups_buffer_entry_count,
777  render_info,
778  count_distinct_descriptors,
779  must_use_baseline_sort,
780  output_columnar_hint,
781  /*streaming_top_n_hint=*/false);
782  }
783 }
784 
785 void GroupByAndAggregate::addTransientStringLiterals() {
786  addTransientStringLiterals(ra_exe_unit_, executor_, row_set_mem_owner_);
787 }
788 
789 namespace {
790 
791 void add_transient_string_literals_for_expression(
792  const Analyzer::Expr* expr,
793  Executor* executor,
794  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner) {
795  if (!expr) {
796  return;
797  }
798 
799  const auto array_expr = dynamic_cast<const Analyzer::ArrayExpr*>(expr);
800  if (array_expr) {
801  for (size_t i = 0; i < array_expr->getElementCount(); i++) {
802  add_transient_string_literals_for_expression(
803  array_expr->getElement(i), executor, row_set_mem_owner);
804  }
805  return;
806  }
807 
808  const auto cast_expr = dynamic_cast<const Analyzer::UOper*>(expr);
809  const auto& expr_ti = expr->get_type_info();
810  if (cast_expr && cast_expr->get_optype() == kCAST && expr_ti.is_string()) {
811  CHECK_EQ(kENCODING_DICT, expr_ti.get_compression());
812  auto sdp = executor->getStringDictionaryProxy(
813  expr_ti.get_comp_param(), row_set_mem_owner, true);
814  CHECK(sdp);
815  const auto str_lit_expr =
816  dynamic_cast<const Analyzer::Constant*>(cast_expr->get_operand());
817  if (str_lit_expr && str_lit_expr->get_constval().stringval) {
818  sdp->getOrAddTransient(*str_lit_expr->get_constval().stringval);
819  }
820  return;
821  }
822  const auto case_expr = dynamic_cast<const Analyzer::CaseExpr*>(expr);
823  if (!case_expr) {
824  return;
825  }
826  Analyzer::DomainSet domain_set;
827  case_expr->get_domain(domain_set);
828  if (domain_set.empty()) {
829  return;
830  }
831  if (expr_ti.is_string()) {
832  CHECK_EQ(kENCODING_DICT, expr_ti.get_compression());
833  auto sdp = executor->getStringDictionaryProxy(
834  expr_ti.get_comp_param(), row_set_mem_owner, true);
835  CHECK(sdp);
836  for (const auto domain_expr : domain_set) {
837  const auto cast_expr = dynamic_cast<const Analyzer::UOper*>(domain_expr);
838  const auto str_lit_expr =
839  cast_expr && cast_expr->get_optype() == kCAST
840  ? dynamic_cast<const Analyzer::Constant*>(cast_expr->get_operand())
841  : dynamic_cast<const Analyzer::Constant*>(domain_expr);
842  if (str_lit_expr && str_lit_expr->get_constval().stringval) {
843  sdp->getOrAddTransient(*str_lit_expr->get_constval().stringval);
844  }
845  }
846  }
847 }
848 
849 } // namespace
850 
851 void GroupByAndAggregate::addTransientStringLiterals(
852  const RelAlgExecutionUnit& ra_exe_unit,
853  Executor* executor,
854  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner) {
855  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
856  add_transient_string_literals_for_expression(
857  group_expr.get(), executor, row_set_mem_owner);
858  }
859  for (const auto target_expr : ra_exe_unit.target_exprs) {
860  const auto& target_type = target_expr->get_type_info();
861  if (target_type.is_string() && target_type.get_compression() != kENCODING_DICT) {
862  continue;
863  }
864  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
865  if (agg_expr) {
866  if (agg_expr->get_aggtype() == kSINGLE_VALUE ||
867  agg_expr->get_aggtype() == kSAMPLE) {
868  add_transient_string_literals_for_expression(
869  agg_expr->get_arg(), executor, row_set_mem_owner);
870  }
871  } else {
872  add_transient_string_literals_for_expression(
873  target_expr, executor, row_set_mem_owner);
874  }
875  }
876 }
877 
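// Checks whether the requested ORDER BY can be handled by the GPU top-k sort
// path. Only a single order entry over an integer-typed, non-distinct
// aggregate qualifies; DISTINCT, AVG, MIN, MAX, APPROX_COUNT_DISTINCT,
// floating point arguments, and (when NULLs may appear) DESC NULLS FIRST or
// ASC NULLS LAST orderings push the sort back to the CPU path.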
878 bool GroupByAndAggregate::gpuCanHandleOrderEntries(
879  const std::list<Analyzer::OrderEntry>& order_entries) {
880  if (order_entries.size() > 1) { // TODO(alex): lift this restriction
881  return false;
882  }
883  for (const auto& order_entry : order_entries) {
884  CHECK_GE(order_entry.tle_no, 1);
885  CHECK_LE(static_cast<size_t>(order_entry.tle_no), ra_exe_unit_.target_exprs.size());
886  const auto target_expr = ra_exe_unit_.target_exprs[order_entry.tle_no - 1];
887  if (!dynamic_cast<Analyzer::AggExpr*>(target_expr)) {
888  return false;
889  }
890  // TODO(alex): relax the restrictions
891  auto agg_expr = static_cast<Analyzer::AggExpr*>(target_expr);
892  if (agg_expr->get_is_distinct() || agg_expr->get_aggtype() == kAVG ||
893  agg_expr->get_aggtype() == kMIN || agg_expr->get_aggtype() == kMAX ||
894  agg_expr->get_aggtype() == kAPPROX_COUNT_DISTINCT) {
895  return false;
896  }
897  if (agg_expr->get_arg()) {
898  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
899  if (arg_ti.is_fp()) {
900  return false;
901  }
902  auto expr_range_info =
903  get_expr_range_info(ra_exe_unit_, query_infos_, agg_expr->get_arg(), executor_);
904  // TODO(adb): QMD not actually initialized here?
905  if ((!(expr_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
906  /* query_mem_desc.getGroupbyColCount() == 1 */ false) ||
907  expr_range_info.has_nulls) &&
908  order_entry.is_desc == order_entry.nulls_first) {
909  return false;
910  }
911  }
912  const auto& target_ti = target_expr->get_type_info();
913  CHECK(!target_ti.is_buffer());
914  if (!target_ti.is_integer()) {
915  return false;
916  }
917  }
918  return true;
919 }
920 
921 bool GroupByAndAggregate::codegen(llvm::Value* filter_result,
922  llvm::BasicBlock* sc_false,
923  QueryMemoryDescriptor& query_mem_desc,
924  const CompilationOptions& co,
925  const GpuSharedMemoryContext& gpu_smem_context) {
926  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
927  CHECK(filter_result);
928 
929  bool can_return_error = false;
930  llvm::BasicBlock* filter_false{nullptr};
931 
932  {
933  const bool is_group_by = !ra_exe_unit_.groupby_exprs.empty();
934 
935  if (executor_->isArchMaxwell(co.device_type)) {
936  prependForceSync();
937  }
938  DiamondCodegen filter_cfg(filter_result,
939  executor_,
940  !is_group_by || query_mem_desc.usesGetGroupValueFast(),
941  "filter", // filter_true and filter_false basic blocks
942  nullptr,
943  false);
944  filter_false = filter_cfg.cond_false_;
945 
946  if (is_group_by) {
947  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection &&
948  !query_mem_desc.useStreamingTopN()) {
949  const auto crt_matched = get_arg_by_name(ROW_FUNC, "crt_matched");
950  LL_BUILDER.CreateStore(LL_INT(int32_t(1)), crt_matched);
951  auto total_matched_ptr = get_arg_by_name(ROW_FUNC, "total_matched");
952  llvm::Value* old_total_matched_val{nullptr};
953  if (query_mem_desc.threadsShareMemory()) {
954  old_total_matched_val =
955  LL_BUILDER.CreateAtomicRMW(llvm::AtomicRMWInst::Add,
956  total_matched_ptr,
957  LL_INT(int32_t(1)),
958  llvm::AtomicOrdering::Monotonic);
959  } else {
960  old_total_matched_val = LL_BUILDER.CreateLoad(total_matched_ptr);
961  LL_BUILDER.CreateStore(
962  LL_BUILDER.CreateAdd(old_total_matched_val, LL_INT(int32_t(1))),
963  total_matched_ptr);
964  }
965  auto old_total_matched_ptr = get_arg_by_name(ROW_FUNC, "old_total_matched");
966  LL_BUILDER.CreateStore(old_total_matched_val, old_total_matched_ptr);
967  }
968 
969  auto agg_out_ptr_w_idx = codegenGroupBy(query_mem_desc, co, filter_cfg);
970  if (query_mem_desc.usesGetGroupValueFast() ||
971  query_mem_desc.getQueryDescriptionType() ==
972  QueryDescriptionType::GroupByPerfectHash) {
973  if (query_mem_desc.getGroupbyColCount() > 1) {
974  filter_cfg.setChainToNext();
975  }
976  // Don't generate null checks if the group slot is guaranteed to be non-null,
977  // as it's the case for get_group_value_fast* family.
978  can_return_error = codegenAggCalls(
979  agg_out_ptr_w_idx, {}, query_mem_desc, co, gpu_smem_context, filter_cfg);
980  } else {
981  {
982  llvm::Value* nullcheck_cond{nullptr};
983  if (query_mem_desc.didOutputColumnar()) {
984  nullcheck_cond = LL_BUILDER.CreateICmpSGE(std::get<1>(agg_out_ptr_w_idx),
985  LL_INT(int32_t(0)));
986  } else {
987  nullcheck_cond = LL_BUILDER.CreateICmpNE(
988  std::get<0>(agg_out_ptr_w_idx),
989  llvm::ConstantPointerNull::get(
990  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)));
991  }
992  DiamondCodegen nullcheck_cfg(
993  nullcheck_cond, executor_, false, "groupby_nullcheck", &filter_cfg, false);
994  codegenAggCalls(
995  agg_out_ptr_w_idx, {}, query_mem_desc, co, gpu_smem_context, filter_cfg);
996  }
997  can_return_error = true;
998  if (query_mem_desc.getQueryDescriptionType() ==
999  QueryDescriptionType::Projection &&
1000  query_mem_desc.useStreamingTopN()) {
1001  // Ignore rejection on pushing current row to top-K heap.
1002  LL_BUILDER.CreateRet(LL_INT(int32_t(0)));
1003  } else {
1004  CodeGenerator code_generator(executor_);
1005  LL_BUILDER.CreateRet(LL_BUILDER.CreateNeg(LL_BUILDER.CreateTrunc(
1006  // TODO(alex): remove the trunc once pos is converted to 32 bits
1007  code_generator.posArg(nullptr),
1008  get_int_type(32, LL_CONTEXT))));
1009  }
1010  }
1011  } else {
1012  if (ra_exe_unit_.estimator) {
1013  std::stack<llvm::BasicBlock*> array_loops;
1014  codegenEstimator(array_loops, filter_cfg, query_mem_desc, co);
1015  } else {
1016  auto arg_it = ROW_FUNC->arg_begin();
1017  std::vector<llvm::Value*> agg_out_vec;
1018  for (int32_t i = 0; i < get_agg_count(ra_exe_unit_.target_exprs); ++i) {
1019  agg_out_vec.push_back(&*arg_it++);
1020  }
1021  can_return_error = codegenAggCalls(std::make_tuple(nullptr, nullptr),
1022  agg_out_vec,
1023  query_mem_desc,
1024  co,
1025  gpu_smem_context,
1026  filter_cfg);
1027  }
1028  }
1029  }
1030 
1031  if (ra_exe_unit_.join_quals.empty()) {
1032  executor_->cgen_state_->ir_builder_.CreateRet(LL_INT(int32_t(0)));
1033  } else if (sc_false) {
1034  const auto saved_insert_block = LL_BUILDER.GetInsertBlock();
1035  LL_BUILDER.SetInsertPoint(sc_false);
1036  LL_BUILDER.CreateBr(filter_false);
1037  LL_BUILDER.SetInsertPoint(saved_insert_block);
1038  }
1039 
1040  return can_return_error;
1041 }
1042 
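// Emits the code that picks an output slot for projection queries. With
// streaming top-N the row is pushed through one of the get_bin_from_k_heap*
// variants, keyed on the single ORDER BY expression with a type-appropriate
// NULL sentinel; otherwise the slot comes from get_scan_output_slot /
// get_columnar_scan_output_offset using the running total_matched counter as
// the row index.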
1043 llvm::Value* GroupByAndAggregate::codegenOutputSlot(
1044  llvm::Value* groups_buffer,
1045  const QueryMemoryDescriptor& query_mem_desc,
1046  const CompilationOptions& co,
1047  DiamondCodegen& diamond_codegen) {
1048  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1049  CHECK(query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection);
1050  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1051  const auto group_expr = ra_exe_unit_.groupby_exprs.front();
1052  CHECK(!group_expr);
1053  if (!query_mem_desc.didOutputColumnar()) {
1054  CHECK_EQ(size_t(0), query_mem_desc.getRowSize() % sizeof(int64_t));
1055  }
1056  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1057  ? 0
1058  : query_mem_desc.getRowSize() / sizeof(int64_t);
1059  CodeGenerator code_generator(executor_);
1060  if (query_mem_desc.useStreamingTopN()) {
1061  const auto& only_order_entry = ra_exe_unit_.sort_info.order_entries.front();
1062  CHECK_GE(only_order_entry.tle_no, int(1));
1063  const size_t target_idx = only_order_entry.tle_no - 1;
1064  CHECK_LT(target_idx, ra_exe_unit_.target_exprs.size());
1065  const auto order_entry_expr = ra_exe_unit_.target_exprs[target_idx];
1066  const auto chosen_bytes =
1067  static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(target_idx));
1068  auto order_entry_lv = executor_->cgen_state_->castToTypeIn(
1069  code_generator.codegen(order_entry_expr, true, co).front(), chosen_bytes * 8);
1070  const auto n = ra_exe_unit_.sort_info.offset + ra_exe_unit_.sort_info.limit;
1071  std::string fname = "get_bin_from_k_heap";
1072  const auto& oe_ti = order_entry_expr->get_type_info();
1073  llvm::Value* null_key_lv = nullptr;
1074  if (oe_ti.is_integer() || oe_ti.is_decimal() || oe_ti.is_time()) {
1075  const size_t bit_width = order_entry_lv->getType()->getIntegerBitWidth();
1076  switch (bit_width) {
1077  case 32:
1078  null_key_lv = LL_INT(static_cast<int32_t>(inline_int_null_val(oe_ti)));
1079  break;
1080  case 64:
1081  null_key_lv = LL_INT(static_cast<int64_t>(inline_int_null_val(oe_ti)));
1082  break;
1083  default:
1084  CHECK(false);
1085  }
1086  fname += "_int" + std::to_string(bit_width) + "_t";
1087  } else {
1088  CHECK(oe_ti.is_fp());
1089  if (order_entry_lv->getType()->isDoubleTy()) {
1090  null_key_lv = LL_FP(static_cast<double>(inline_fp_null_val(oe_ti)));
1091  } else {
1092  null_key_lv = LL_FP(static_cast<float>(inline_fp_null_val(oe_ti)));
1093  }
1094  fname += order_entry_lv->getType()->isDoubleTy() ? "_double" : "_float";
1095  }
1096  const auto key_slot_idx =
1097  get_heap_key_slot_index(ra_exe_unit_.target_exprs, target_idx);
1098  return emitCall(
1099  fname,
1100  {groups_buffer,
1101  LL_INT(n),
1102  LL_INT(row_size_quad),
1103  LL_INT(static_cast<uint32_t>(query_mem_desc.getColOffInBytes(key_slot_idx))),
1104  LL_BOOL(only_order_entry.is_desc),
1105  LL_BOOL(!order_entry_expr->get_type_info().get_notnull()),
1106  LL_BOOL(only_order_entry.nulls_first),
1107  null_key_lv,
1108  order_entry_lv});
1109  } else {
1110  llvm::Value* output_buffer_entry_count_lv{nullptr};
1111  if (ra_exe_unit_.use_bump_allocator) {
1112  output_buffer_entry_count_lv =
1113  LL_BUILDER.CreateLoad(get_arg_by_name(ROW_FUNC, "max_matched"));
1114  CHECK(output_buffer_entry_count_lv);
1115  }
1116  const auto group_expr_lv =
1117  LL_BUILDER.CreateLoad(get_arg_by_name(ROW_FUNC, "old_total_matched"));
1118  std::vector<llvm::Value*> args{
1119  groups_buffer,
1120  output_buffer_entry_count_lv
1121  ? output_buffer_entry_count_lv
1122  : LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1123  group_expr_lv,
1124  code_generator.posArg(nullptr)};
1125  if (query_mem_desc.didOutputColumnar()) {
1126  const auto columnar_output_offset =
1127  emitCall("get_columnar_scan_output_offset", args);
1128  return columnar_output_offset;
1129  }
1130  args.push_back(LL_INT(row_size_quad));
1131  return emitCall("get_scan_output_slot", args);
1132  }
1133 }
1134 
1135 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenGroupBy(
1136  const QueryMemoryDescriptor& query_mem_desc,
1137  const CompilationOptions& co,
1138  DiamondCodegen& diamond_codegen) {
1139  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1140  auto arg_it = ROW_FUNC->arg_begin();
1141  auto groups_buffer = arg_it++;
1142 
1143  std::stack<llvm::BasicBlock*> array_loops;
1144 
1145  // TODO(Saman): move this logic outside of this function.
1146  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1147  if (query_mem_desc.didOutputColumnar()) {
1148  return std::make_tuple(
1149  &*groups_buffer,
1150  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen));
1151  } else {
1152  return std::make_tuple(
1153  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen),
1154  nullptr);
1155  }
1156  }
1157 
1158  CHECK(query_mem_desc.getQueryDescriptionType() ==
1159  QueryDescriptionType::GroupByBaselineHash ||
1160  query_mem_desc.getQueryDescriptionType() ==
1161  QueryDescriptionType::GroupByPerfectHash);
1162 
1163  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1164  ? 0
1165  : query_mem_desc.getRowSize() / sizeof(int64_t);
1166 
1167  const auto col_width_size = query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1168  ? sizeof(int64_t)
1169  : query_mem_desc.getEffectiveKeyWidth();
1170  // for multi-column group by
1171  llvm::Value* group_key = nullptr;
1172  llvm::Value* key_size_lv = nullptr;
1173 
1174  if (!query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1175  key_size_lv = LL_INT(static_cast<int32_t>(query_mem_desc.getGroupbyColCount()));
1176  if (query_mem_desc.getQueryDescriptionType() ==
1177  QueryDescriptionType::GroupByPerfectHash) {
1178  group_key =
1179  LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1180  } else if (query_mem_desc.getQueryDescriptionType() ==
1181  QueryDescriptionType::GroupByBaselineHash) {
1182  group_key =
1183  col_width_size == sizeof(int32_t)
1184  ? LL_BUILDER.CreateAlloca(llvm::Type::getInt32Ty(LL_CONTEXT), key_size_lv)
1185  : LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1186  }
1187  CHECK(group_key);
1188  CHECK(key_size_lv);
1189  }
1190 
1191  int32_t subkey_idx = 0;
1192  CHECK(query_mem_desc.getGroupbyColCount() == ra_exe_unit_.groupby_exprs.size());
1193  for (const auto& group_expr : ra_exe_unit_.groupby_exprs) {
1194  const auto col_range_info =
1195  get_expr_range_info(ra_exe_unit_, query_infos_, group_expr.get(), executor_);
1196  const auto translated_null_value = static_cast<int64_t>(
1197  query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1198  ? checked_int64_t(query_mem_desc.getMaxVal()) +
1199  (query_mem_desc.getBucket() ? query_mem_desc.getBucket() : 1)
1200  : checked_int64_t(col_range_info.max) +
1201  (col_range_info.bucket ? col_range_info.bucket : 1));
1202 
1203  const bool col_has_nulls =
1204  query_mem_desc.getQueryDescriptionType() ==
1205  QueryDescriptionType::GroupByPerfectHash
1206  ? (query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1207  ? query_mem_desc.hasNulls()
1208  : col_range_info.has_nulls)
1209  : false;
1210 
1211  const auto group_expr_lvs =
1212  executor_->groupByColumnCodegen(group_expr.get(),
1213  col_width_size,
1214  co,
1215  col_has_nulls,
1216  translated_null_value,
1217  diamond_codegen,
1218  array_loops,
1219  query_mem_desc.threadsShareMemory());
1220  const auto group_expr_lv = group_expr_lvs.translated_value;
1221  if (query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1222  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1223  return codegenSingleColumnPerfectHash(query_mem_desc,
1224  co,
1225  &*groups_buffer,
1226  group_expr_lv,
1227  group_expr_lvs.original_value,
1228  row_size_quad);
1229  } else {
1230  // store the sub-key to the buffer
1231  LL_BUILDER.CreateStore(group_expr_lv,
1232  LL_BUILDER.CreateGEP(group_key, LL_INT(subkey_idx++)));
1233  }
1234  }
1235  if (query_mem_desc.getQueryDescriptionType() ==
1236  QueryDescriptionType::GroupByPerfectHash) {
1237  CHECK(ra_exe_unit_.groupby_exprs.size() != 1);
1238  return codegenMultiColumnPerfectHash(
1239  &*groups_buffer, group_key, key_size_lv, query_mem_desc, row_size_quad);
1240  } else if (query_mem_desc.getQueryDescriptionType() ==
1241  QueryDescriptionType::GroupByBaselineHash) {
1242  return codegenMultiColumnBaselineHash(co,
1243  &*groups_buffer,
1244  group_key,
1245  key_size_lv,
1246  query_mem_desc,
1247  col_width_size,
1248  row_size_quad);
1249  }
1250  CHECK(false);
1251  return std::make_tuple(nullptr, nullptr);
1252 }
1253 
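// Single-column perfect hash: the group slot is addressed directly as
// (key - min_val) / bucket, so no probing is needed. The emitted runtime call
// is one of the get_group_value_fast* / get_columnar_group_bin_offset variants
// selected below from the layout (columnar, keyless, interleaved bins).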
1254 std::tuple<llvm::Value*, llvm::Value*>
1255 GroupByAndAggregate::codegenSingleColumnPerfectHash(
1256  const QueryMemoryDescriptor& query_mem_desc,
1257  const CompilationOptions& co,
1258  llvm::Value* groups_buffer,
1259  llvm::Value* group_expr_lv_translated,
1260  llvm::Value* group_expr_lv_original,
1261  const int32_t row_size_quad) {
1262  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1263  CHECK(query_mem_desc.usesGetGroupValueFast());
1264  std::string get_group_fn_name{query_mem_desc.didOutputColumnar()
1265  ? "get_columnar_group_bin_offset"
1266  : "get_group_value_fast"};
1267  if (!query_mem_desc.didOutputColumnar() && query_mem_desc.hasKeylessHash()) {
1268  get_group_fn_name += "_keyless";
1269  }
1270  if (query_mem_desc.interleavedBins(co.device_type)) {
1271  CHECK(!query_mem_desc.didOutputColumnar());
1272  CHECK(query_mem_desc.hasKeylessHash());
1273  get_group_fn_name += "_semiprivate";
1274  }
1275  std::vector<llvm::Value*> get_group_fn_args{&*groups_buffer,
1276  &*group_expr_lv_translated};
1277  if (group_expr_lv_original && get_group_fn_name == "get_group_value_fast" &&
1278  query_mem_desc.mustUseBaselineSort()) {
1279  get_group_fn_name += "_with_original_key";
1280  get_group_fn_args.push_back(group_expr_lv_original);
1281  }
1282  get_group_fn_args.push_back(LL_INT(query_mem_desc.getMinVal()));
1283  get_group_fn_args.push_back(LL_INT(query_mem_desc.getBucket()));
1284  if (!query_mem_desc.hasKeylessHash()) {
1285  if (!query_mem_desc.didOutputColumnar()) {
1286  get_group_fn_args.push_back(LL_INT(row_size_quad));
1287  }
1288  } else {
1289  if (!query_mem_desc.didOutputColumnar()) {
1290  get_group_fn_args.push_back(LL_INT(row_size_quad));
1291  }
1292  if (query_mem_desc.interleavedBins(co.device_type)) {
1293  auto warp_idx = emitCall("thread_warp_idx", {LL_INT(executor_->warpSize())});
1294  get_group_fn_args.push_back(warp_idx);
1295  get_group_fn_args.push_back(LL_INT(executor_->warpSize()));
1296  }
1297  }
1298  if (get_group_fn_name == "get_columnar_group_bin_offset") {
1299  return std::make_tuple(&*groups_buffer,
1300  emitCall(get_group_fn_name, get_group_fn_args));
1301  }
1302  return std::make_tuple(emitCall(get_group_fn_name, get_group_fn_args), nullptr);
1303 }
1304 
1305 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenMultiColumnPerfectHash(
1306  llvm::Value* groups_buffer,
1307  llvm::Value* group_key,
1308  llvm::Value* key_size_lv,
1309  const QueryMemoryDescriptor& query_mem_desc,
1310  const int32_t row_size_quad) {
1311  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1312  CHECK(query_mem_desc.getQueryDescriptionType() ==
1314  // compute the index (perfect hash)
1315  auto perfect_hash_func = codegenPerfectHashFunction();
1316  auto hash_lv =
1317  LL_BUILDER.CreateCall(perfect_hash_func, std::vector<llvm::Value*>{group_key});
1318 
1319  if (query_mem_desc.didOutputColumnar()) {
1320  if (!query_mem_desc.hasKeylessHash()) {
1321  const std::string set_matching_func_name{
1322  "set_matching_group_value_perfect_hash_columnar"};
1323  const std::vector<llvm::Value*> set_matching_func_arg{
1324  groups_buffer,
1325  hash_lv,
1326  group_key,
1327  key_size_lv,
1328  llvm::ConstantInt::get(get_int_type(32, LL_CONTEXT),
1329  query_mem_desc.getEntryCount())};
1330  emitCall(set_matching_func_name, set_matching_func_arg);
1331  }
1332  return std::make_tuple(groups_buffer, hash_lv);
1333  } else {
1334  if (query_mem_desc.hasKeylessHash()) {
1335  return std::make_tuple(emitCall("get_matching_group_value_perfect_hash_keyless",
1336  {groups_buffer, hash_lv, LL_INT(row_size_quad)}),
1337  nullptr);
1338  } else {
1339  return std::make_tuple(
1340  emitCall(
1341  "get_matching_group_value_perfect_hash",
1342  {groups_buffer, hash_lv, group_key, key_size_lv, LL_INT(row_size_quad)}),
1343  nullptr);
1344  }
1345  }
1346 }
1347 
1348 std::tuple<llvm::Value*, llvm::Value*>
1349 GroupByAndAggregate::codegenMultiColumnBaselineHash(
1350  const CompilationOptions& co,
1351  llvm::Value* groups_buffer,
1352  llvm::Value* group_key,
1353  llvm::Value* key_size_lv,
1354  const QueryMemoryDescriptor& query_mem_desc,
1355  const size_t key_width,
1356  const int32_t row_size_quad) {
1357  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1358  if (group_key->getType() != llvm::Type::getInt64PtrTy(LL_CONTEXT)) {
1359  CHECK(key_width == sizeof(int32_t));
1360  group_key =
1361  LL_BUILDER.CreatePointerCast(group_key, llvm::Type::getInt64PtrTy(LL_CONTEXT));
1362  }
1363  std::vector<llvm::Value*> func_args{
1364  groups_buffer,
1365  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1366  &*group_key,
1367  &*key_size_lv,
1368  LL_INT(static_cast<int32_t>(key_width))};
1369  std::string func_name{"get_group_value"};
1370  if (query_mem_desc.didOutputColumnar()) {
1371  func_name += "_columnar_slot";
1372  } else {
1373  func_args.push_back(LL_INT(row_size_quad));
1374  }
1375  if (co.with_dynamic_watchdog) {
1376  func_name += "_with_watchdog";
1377  }
1378  if (query_mem_desc.didOutputColumnar()) {
1379  return std::make_tuple(groups_buffer, emitCall(func_name, func_args));
1380  } else {
1381  return std::make_tuple(emitCall(func_name, func_args), nullptr);
1382  }
1383 }
1384 
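// Emits the "perfect_key_hash" helper for multi-column perfect hash. The hash
// is a mixed-radix index over the group-by columns: each key component is
// shifted by its column minimum, divided by its bucket size, multiplied by the
// product of the cardinalities of the preceding components, and accumulated.
// Illustrative example (bucket 1): with two columns of cardinality 10 and 7,
// hash = (k0 - min0) + (k1 - min1) * 10.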
1385 llvm::Function* GroupByAndAggregate::codegenPerfectHashFunction() {
1386  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1387  CHECK_GT(ra_exe_unit_.groupby_exprs.size(), size_t(1));
1388  auto ft = llvm::FunctionType::get(
1389  get_int_type(32, LL_CONTEXT),
1390  std::vector<llvm::Type*>{llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)},
1391  false);
1392  auto key_hash_func = llvm::Function::Create(ft,
1393  llvm::Function::ExternalLinkage,
1394  "perfect_key_hash",
1395  executor_->cgen_state_->module_);
1396  executor_->cgen_state_->helper_functions_.push_back(key_hash_func);
1397  mark_function_always_inline(key_hash_func);
1398  auto& key_buff_arg = *key_hash_func->args().begin();
1399  llvm::Value* key_buff_lv = &key_buff_arg;
1400  auto bb = llvm::BasicBlock::Create(LL_CONTEXT, "entry", key_hash_func);
1401  llvm::IRBuilder<> key_hash_func_builder(bb);
1402  llvm::Value* hash_lv{llvm::ConstantInt::get(get_int_type(64, LL_CONTEXT), 0)};
1403  std::vector<int64_t> cardinalities;
1404  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1405  auto col_range_info =
1406  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1407  CHECK(col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash);
1408  cardinalities.push_back(getBucketedCardinality(col_range_info));
1409  }
1410  size_t dim_idx = 0;
1411  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1412  auto key_comp_lv = key_hash_func_builder.CreateLoad(
1413  key_hash_func_builder.CreateGEP(key_buff_lv, LL_INT(dim_idx)));
1414  auto col_range_info =
1415  get_expr_range_info(ra_exe_unit_, query_infos_, groupby_expr.get(), executor_);
1416  auto crt_term_lv =
1417  key_hash_func_builder.CreateSub(key_comp_lv, LL_INT(col_range_info.min));
1418  if (col_range_info.bucket) {
1419  crt_term_lv =
1420  key_hash_func_builder.CreateSDiv(crt_term_lv, LL_INT(col_range_info.bucket));
1421  }
1422  for (size_t prev_dim_idx = 0; prev_dim_idx < dim_idx; ++prev_dim_idx) {
1423  crt_term_lv = key_hash_func_builder.CreateMul(crt_term_lv,
1424  LL_INT(cardinalities[prev_dim_idx]));
1425  }
1426  hash_lv = key_hash_func_builder.CreateAdd(hash_lv, crt_term_lv);
1427  ++dim_idx;
1428  }
1429  key_hash_func_builder.CreateRet(
1430  key_hash_func_builder.CreateTrunc(hash_lv, get_int_type(32, LL_CONTEXT)));
1431  return key_hash_func;
1432 }
1433 
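// When the argument type and the aggregate type use different NULL sentinels
// (e.g. an int32 input aggregated into an int64 slot, or an integer input
// aggregated into a double), emit a select that maps the argument's NULL
// sentinel to the aggregate's NULL sentinel and casts everything else to the
// aggregate width; if the sentinels already match, the value passes through.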
1434 llvm::Value* GroupByAndAggregate::convertNullIfAny(const SQLTypeInfo& arg_type,
1435  const TargetInfo& agg_info,
1436  llvm::Value* target) {
1437  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1438  const auto& agg_type = agg_info.sql_type;
1439  const size_t chosen_bytes = agg_type.get_size();
1440 
1441  bool need_conversion{false};
1442  llvm::Value* arg_null{nullptr};
1443  llvm::Value* agg_null{nullptr};
1444  llvm::Value* target_to_cast{target};
1445  if (arg_type.is_fp()) {
1446  arg_null = executor_->cgen_state_->inlineFpNull(arg_type);
1447  if (agg_type.is_fp()) {
1448  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1449  if (!static_cast<llvm::ConstantFP*>(arg_null)->isExactlyValue(
1450  static_cast<llvm::ConstantFP*>(agg_null)->getValueAPF())) {
1451  need_conversion = true;
1452  }
1453  } else {
1454  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
1455  return target;
1456  }
1457  } else {
1458  arg_null = executor_->cgen_state_->inlineIntNull(arg_type);
1459  if (agg_type.is_fp()) {
1460  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1461  need_conversion = true;
1462  target_to_cast = executor_->castToFP(target, arg_type, agg_type);
1463  } else {
1464  agg_null = executor_->cgen_state_->inlineIntNull(agg_type);
1465  if ((static_cast<llvm::ConstantInt*>(arg_null)->getBitWidth() !=
1466  static_cast<llvm::ConstantInt*>(agg_null)->getBitWidth()) ||
1467  (static_cast<llvm::ConstantInt*>(arg_null)->getValue() !=
1468  static_cast<llvm::ConstantInt*>(agg_null)->getValue())) {
1469  need_conversion = true;
1470  }
1471  }
1472  }
1473  if (need_conversion) {
1474  auto cmp = arg_type.is_fp() ? LL_BUILDER.CreateFCmpOEQ(target, arg_null)
1475  : LL_BUILDER.CreateICmpEQ(target, arg_null);
1476  return LL_BUILDER.CreateSelect(
1477  cmp,
1478  agg_null,
1479  executor_->cgen_state_->castToTypeIn(target_to_cast, chosen_bytes << 3));
1480  } else {
1481  return target;
1482  }
1483 }
1484 
1485 llvm::Value* GroupByAndAggregate::codegenWindowRowPointer(
1486  const Analyzer::WindowFunction* window_func,
1487  const QueryMemoryDescriptor& query_mem_desc,
1488  const CompilationOptions& co,
1489  DiamondCodegen& diamond_codegen) {
1490  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1491  const auto window_func_context =
1492  WindowProjectNodeContext::getActiveWindowFunctionContext(executor_);
1493  if (window_func_context && window_function_is_aggregate(window_func->getKind())) {
1494  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1495  ? 0
1496  : query_mem_desc.getRowSize() / sizeof(int64_t);
1497  auto arg_it = ROW_FUNC->arg_begin();
1498  auto groups_buffer = arg_it++;
1499  CodeGenerator code_generator(executor_);
1500  auto window_pos_lv = code_generator.codegenWindowPosition(
1501  window_func_context, code_generator.posArg(nullptr));
1502  const auto pos_in_window =
1503  LL_BUILDER.CreateTrunc(window_pos_lv, get_int_type(32, LL_CONTEXT));
1504  llvm::Value* entry_count_lv =
1505  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount()));
1506  std::vector<llvm::Value*> args{
1507  &*groups_buffer, entry_count_lv, pos_in_window, code_generator.posArg(nullptr)};
1508  if (query_mem_desc.didOutputColumnar()) {
1509  const auto columnar_output_offset =
1510  emitCall("get_columnar_scan_output_offset", args);
1511  return LL_BUILDER.CreateSExt(columnar_output_offset, get_int_type(64, LL_CONTEXT));
1512  }
1513  args.push_back(LL_INT(row_size_quad));
1514  return emitCall("get_scan_output_slot", args);
1515  }
1516  auto arg_it = ROW_FUNC->arg_begin();
1517  auto groups_buffer = arg_it++;
1518  return codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen);
1519 }
1520 
1521 bool GroupByAndAggregate::codegenAggCalls(
1522  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx_in,
1523  const std::vector<llvm::Value*>& agg_out_vec,
1524  const QueryMemoryDescriptor& query_mem_desc,
1525  const CompilationOptions& co,
1526  const GpuSharedMemoryContext& gpu_smem_context,
1527  DiamondCodegen& diamond_codegen) {
1528  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1529  auto agg_out_ptr_w_idx = agg_out_ptr_w_idx_in;
1530  // TODO(alex): unify the two cases, the output for non-group by queries
1531  // should be a contiguous buffer
1532  const bool is_group_by = std::get<0>(agg_out_ptr_w_idx);
1533  bool can_return_error = false;
1534  if (is_group_by) {
1535  CHECK(agg_out_vec.empty());
1536  } else {
1537  CHECK(!agg_out_vec.empty());
1538  }
1539 
1540  // output buffer is casted into a byte stream to be able to handle data elements of
1541  // different sizes (only used when actual column width sizes are used)
1542  llvm::Value* output_buffer_byte_stream{nullptr};
1543  llvm::Value* out_row_idx{nullptr};
1544  if (query_mem_desc.didOutputColumnar() && !g_cluster &&
1545  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1546  output_buffer_byte_stream = LL_BUILDER.CreateBitCast(
1547  std::get<0>(agg_out_ptr_w_idx),
1548  llvm::PointerType::get(llvm::Type::getInt8Ty(LL_CONTEXT), 0));
1549  output_buffer_byte_stream->setName("out_buff_b_stream");
1550  CHECK(std::get<1>(agg_out_ptr_w_idx));
1551  out_row_idx = LL_BUILDER.CreateZExt(std::get<1>(agg_out_ptr_w_idx),
1552  llvm::Type::getInt64Ty(LL_CONTEXT));
1553  out_row_idx->setName("out_row_idx");
1554  }
1555 
1556  TargetExprCodegenBuilder target_builder(query_mem_desc, ra_exe_unit_, is_group_by);
1557  for (size_t target_idx = 0; target_idx < ra_exe_unit_.target_exprs.size();
1558  ++target_idx) {
1559  auto target_expr = ra_exe_unit_.target_exprs[target_idx];
1560  CHECK(target_expr);
1561 
1562  target_builder(target_expr, executor_, co);
1563  }
1564 
1565  target_builder.codegen(this,
1566  executor_,
1567  query_mem_desc,
1568  co,
1569  gpu_smem_context,
1570  agg_out_ptr_w_idx,
1571  agg_out_vec,
1572  output_buffer_byte_stream,
1573  out_row_idx,
1574  diamond_codegen);
1575 
1576  for (auto target_expr : ra_exe_unit_.target_exprs) {
1577  CHECK(target_expr);
1578  executor_->plan_state_->isLazyFetchColumn(target_expr);
1579  }
1580 
1581  return can_return_error;
1582 }
1583 
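// Computes the pointer to the output slot for one aggregate. In the columnar
// projection fast path the byte offset is col_off + row_idx * chosen_bytes,
// with the multiply emitted as a shift by log2(chosen_bytes); in the other
// layouts the column offset is expressed in units of chosen_bytes and added
// to the row pointer through a typed GEP.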
1587 llvm::Value* GroupByAndAggregate::codegenAggColumnPtr(
1588  llvm::Value* output_buffer_byte_stream,
1589  llvm::Value* out_row_idx,
1590  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx,
1591  const QueryMemoryDescriptor& query_mem_desc,
1592  const size_t chosen_bytes,
1593  const size_t agg_out_off,
1594  const size_t target_idx) {
1595  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1596  llvm::Value* agg_col_ptr{nullptr};
1597  if (query_mem_desc.didOutputColumnar()) {
1598  // TODO(Saman): remove the second columnar branch, and support all query description
1599  // types through the first branch. Then, input arguments should also be cleaned up
1600  if (!g_cluster &&
1601      query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1602  CHECK(chosen_bytes == 1 || chosen_bytes == 2 || chosen_bytes == 4 ||
1603  chosen_bytes == 8);
1604  CHECK(output_buffer_byte_stream);
1605  CHECK(out_row_idx);
1606  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1607  // multiplying by chosen_bytes, i.e., << log2(chosen_bytes)
1608  auto out_per_col_byte_idx =
1609  LL_BUILDER.CreateShl(out_row_idx, __builtin_ffs(chosen_bytes) - 1);
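      // Note: for the power-of-two widths checked above, __builtin_ffs(chosen_bytes) - 1
      // equals log2(chosen_bytes), so the shift is the strength-reduced form of
      // row_idx * chosen_bytes; e.g. chosen_bytes == 8 gives __builtin_ffs(8) - 1 == 3
      // and row_idx << 3 == row_idx * 8.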
1610  auto byte_offset = LL_BUILDER.CreateAdd(out_per_col_byte_idx,
1611  LL_INT(static_cast<int64_t>(col_off)));
1612  byte_offset->setName("out_byte_off_target_" + std::to_string(target_idx));
1613  auto output_ptr = LL_BUILDER.CreateGEP(output_buffer_byte_stream, byte_offset);
1614  agg_col_ptr = LL_BUILDER.CreateBitCast(
1615  output_ptr,
1616  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1617  agg_col_ptr->setName("out_ptr_target_" + std::to_string(target_idx));
1618  } else {
1619  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1620  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1621  col_off /= chosen_bytes;
1622  CHECK(std::get<1>(agg_out_ptr_w_idx));
1623  auto offset = LL_BUILDER.CreateAdd(std::get<1>(agg_out_ptr_w_idx), LL_INT(col_off));
1624  agg_col_ptr = LL_BUILDER.CreateGEP(
1625  LL_BUILDER.CreateBitCast(
1626  std::get<0>(agg_out_ptr_w_idx),
1627  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0)),
1628  offset);
1629  }
1630  } else {
1631  uint32_t col_off = query_mem_desc.getColOnlyOffInBytes(agg_out_off);
1632  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1633  col_off /= chosen_bytes;
1634  agg_col_ptr = LL_BUILDER.CreateGEP(
1635  LL_BUILDER.CreateBitCast(
1636  std::get<0>(agg_out_ptr_w_idx),
1637  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0)),
1638  LL_INT(col_off));
1639  }
1640  CHECK(agg_col_ptr);
1641  return agg_col_ptr;
1642 }
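// Note: per the brief above, the returned agg_col_ptr is the slot the aggregate
// update writes to. A rough caller-side sketch (illustrative only; the argument
// values are placeholders):
//
//   llvm::Value* slot = codegenAggColumnPtr(output_buffer_byte_stream, out_row_idx,
//                                           agg_out_ptr_w_idx, query_mem_desc,
//                                           /*chosen_bytes=*/8, agg_out_off, target_idx);
//   emitCall("agg_count", {slot, crt_val});  // e.g. a COUNT update into that slot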
1643 
1644 void GroupByAndAggregate::codegenEstimator(std::stack<llvm::BasicBlock*>& array_loops,
1645  DiamondCodegen& diamond_codegen,
1646  const QueryMemoryDescriptor& query_mem_desc,
1647  const CompilationOptions& co) {
1648  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1649  const auto& estimator_arg = ra_exe_unit_.estimator->getArgument();
1650  auto estimator_comp_count_lv = LL_INT(static_cast<int32_t>(estimator_arg.size()));
1651  auto estimator_key_lv = LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT),
1652  estimator_comp_count_lv);
1653  int32_t subkey_idx = 0;
1654  for (const auto& estimator_arg_comp : estimator_arg) {
1655  const auto estimator_arg_comp_lvs =
1656  executor_->groupByColumnCodegen(estimator_arg_comp.get(),
1657  query_mem_desc.getEffectiveKeyWidth(),
1658  co,
1659  false,
1660  0,
1661  diamond_codegen,
1662  array_loops,
1663  true);
1664  CHECK(!estimator_arg_comp_lvs.original_value);
1665  const auto estimator_arg_comp_lv = estimator_arg_comp_lvs.translated_value;
1666  // store the sub-key to the buffer
1667  LL_BUILDER.CreateStore(estimator_arg_comp_lv,
1668  LL_BUILDER.CreateGEP(estimator_key_lv, LL_INT(subkey_idx++)));
1669  }
1670  const auto int8_ptr_ty = llvm::PointerType::get(get_int_type(8, LL_CONTEXT), 0);
1671  const auto bitmap = LL_BUILDER.CreateBitCast(&*ROW_FUNC->arg_begin(), int8_ptr_ty);
1672  const auto key_bytes = LL_BUILDER.CreateBitCast(estimator_key_lv, int8_ptr_ty);
1673  const auto estimator_comp_bytes_lv =
1674  LL_INT(static_cast<int32_t>(estimator_arg.size() * sizeof(int64_t)));
1675  const auto bitmap_size_lv =
1676  LL_INT(static_cast<uint32_t>(ra_exe_unit_.estimator->getBufferSize()));
1677  emitCall(ra_exe_unit_.estimator->getRuntimeFunctionName(),
1678  {bitmap, &*bitmap_size_lv, key_bytes, &*estimator_comp_bytes_lv});
1679 }
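// Note: the IR emitted above amounts to packing one int64 per estimator argument
// into a stack buffer and handing that key to the estimator's runtime function
// together with the output bitmap. In C++ terms (illustrative only; estimator_fn
// stands in for getRuntimeFunctionName()):
//
//   std::vector<int64_t> key(comp_count);
//   for (int32_t i = 0; i < comp_count; ++i) {
//     key[i] = /* codegen'd value of estimator argument i */;
//   }
//   estimator_fn(bitmap, bitmap_size,
//                reinterpret_cast<int8_t*>(key.data()), comp_count * sizeof(int64_t));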
1680 
1681 extern "C" RUNTIME_EXPORT void agg_count_distinct(int64_t* agg, const int64_t val) {
1682  reinterpret_cast<std::set<int64_t>*>(*agg)->insert(val);
1683 }
1684 
1685 extern "C" RUNTIME_EXPORT void agg_count_distinct_skip_val(int64_t* agg,
1686  const int64_t val,
1687  const int64_t skip_val) {
1688  if (val != skip_val) {
1689  agg_count_distinct(agg, val);
1690  }
1691 }
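// Note: these two helpers implement the exact (std::set-backed) COUNT(DISTINCT)
// path: the 64-bit aggregate slot holds a pointer to a std::set<int64_t>. A
// hypothetical host-side sketch (the engine allocates the set through its
// row-set memory owner, not like this):
//
//   std::set<int64_t> distinct_vals;
//   int64_t slot = reinterpret_cast<int64_t>(&distinct_vals);
//   agg_count_distinct(&slot, 42);                            // inserts 42
//   agg_count_distinct_skip_val(&slot, -1, /*skip_val=*/-1);  // null sentinel, skipped
//   // distinct_vals.size() is the exact distinct count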
1692 
1693 extern "C" RUNTIME_EXPORT void agg_approx_median(int64_t* agg, const double val) {
1694  auto* t_digest = reinterpret_cast<quantile::TDigest*>(*agg);
1695  t_digest->allocate();
1696  t_digest->add(val);
1697 }
1698 
1699 void GroupByAndAggregate::codegenCountDistinct(
1700     const size_t target_idx,
1701  const Analyzer::Expr* target_expr,
1702  std::vector<llvm::Value*>& agg_args,
1703  const QueryMemoryDescriptor& query_mem_desc,
1704  const ExecutorDeviceType device_type) {
1705  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1706  const auto agg_info = get_target_info(target_expr, g_bigint_count);
1707  const auto& arg_ti =
1708  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1709  if (arg_ti.is_fp()) {
1710  agg_args.back() = executor_->cgen_state_->ir_builder_.CreateBitCast(
1711  agg_args.back(), get_int_type(64, executor_->cgen_state_->context_));
1712  }
1713  const auto& count_distinct_descriptor =
1714  query_mem_desc.getCountDistinctDescriptor(target_idx);
1715  CHECK(count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid);
1716  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
1717  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1718  agg_args.push_back(LL_INT(int32_t(count_distinct_descriptor.bitmap_sz_bits)));
1719  if (device_type == ExecutorDeviceType::GPU) {
1720  const auto base_dev_addr = getAdditionalLiteral(-1);
1721  const auto base_host_addr = getAdditionalLiteral(-2);
1722  agg_args.push_back(base_dev_addr);
1723  agg_args.push_back(base_host_addr);
1724  emitCall("agg_approximate_count_distinct_gpu", agg_args);
1725  } else {
1726  emitCall("agg_approximate_count_distinct", agg_args);
1727  }
1728  return;
1729  }
1730  std::string agg_fname{"agg_count_distinct"};
1731  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1732  agg_fname += "_bitmap";
1733  agg_args.push_back(LL_INT(static_cast<int64_t>(count_distinct_descriptor.min_val)));
1734  }
1735  if (agg_info.skip_null_val) {
1736  auto null_lv = executor_->cgen_state_->castToTypeIn(
1737  (arg_ti.is_fp()
1738  ? static_cast<llvm::Value*>(executor_->cgen_state_->inlineFpNull(arg_ti))
1739  : static_cast<llvm::Value*>(executor_->cgen_state_->inlineIntNull(arg_ti))),
1740  64);
1741  null_lv = executor_->cgen_state_->ir_builder_.CreateBitCast(
1742  null_lv, get_int_type(64, executor_->cgen_state_->context_));
1743  agg_fname += "_skip_val";
1744  agg_args.push_back(null_lv);
1745  }
1746  if (device_type == ExecutorDeviceType::GPU) {
1747  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1748  agg_fname += "_gpu";
1749  const auto base_dev_addr = getAdditionalLiteral(-1);
1750  const auto base_host_addr = getAdditionalLiteral(-2);
1751  agg_args.push_back(base_dev_addr);
1752  agg_args.push_back(base_host_addr);
1753  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.sub_bitmap_count)));
1754  CHECK_EQ(size_t(0),
1755  count_distinct_descriptor.bitmapPaddedSizeBytes() %
1756  count_distinct_descriptor.sub_bitmap_count);
1757  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.bitmapPaddedSizeBytes() /
1758  count_distinct_descriptor.sub_bitmap_count)));
1759  }
1760  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1761  emitCall(agg_fname, agg_args);
1762  } else {
1763  executor_->cgen_state_->emitExternalCall(
1764  agg_fname, llvm::Type::getVoidTy(LL_CONTEXT), agg_args);
1765  }
1766 }
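// Note: summarizing the dispatch above, the emitted runtime call is one of
// (the name is assembled from the flags checked in this function):
//
//   agg_approximate_count_distinct[_gpu](...)         // APPROX_COUNT_DISTINCT
//   agg_count_distinct_bitmap[_skip_val][_gpu](...)   // bitmap implementation
//   agg_count_distinct[_skip_val](...)                // std::set fallback, CPU only
//
// with the extra GPU arguments (device/host base addresses, sub-bitmap count and
// per-sub-bitmap size) appended for the *_gpu variants.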
1767 
1768 void GroupByAndAggregate::codegenApproxMedian(const size_t target_idx,
1769  const Analyzer::Expr* target_expr,
1770  std::vector<llvm::Value*>& agg_args,
1771  const QueryMemoryDescriptor& query_mem_desc,
1772  const ExecutorDeviceType device_type) {
1773  if (device_type == ExecutorDeviceType::GPU) {
1774  throw QueryMustRunOnCpu();
1775  }
1776  llvm::BasicBlock *calc, *skip;
1777  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1778  auto const arg_ti =
1779  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1780  bool const nullable = !arg_ti.get_notnull();
1781 
1782  auto* cs = executor_->cgen_state_.get();
1783  auto& irb = cs->ir_builder_;
1784  if (nullable) {
1785  auto* const null_value = cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
1786  auto* const skip_cond = arg_ti.is_fp()
1787  ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
1788  : irb.CreateICmpEQ(agg_args.back(), null_value);
1789  calc = llvm::BasicBlock::Create(cs->context_, "calc_approx_median");
1790  skip = llvm::BasicBlock::Create(cs->context_, "skip_approx_median");
1791  irb.CreateCondBr(skip_cond, skip, calc);
1792  cs->current_func_->getBasicBlockList().push_back(calc);
1793  irb.SetInsertPoint(calc);
1794  }
1795  if (!arg_ti.is_fp()) {
1796  auto const agg_info = get_target_info(target_expr, g_bigint_count);
1797  agg_args.back() = executor_->castToFP(agg_args.back(), arg_ti, agg_info.sql_type);
1798  }
1799  cs->emitExternalCall(
1800  "agg_approx_median", llvm::Type::getVoidTy(cs->context_), agg_args);
1801  if (nullable) {
1802  irb.CreateBr(skip);
1803  cs->current_func_->getBasicBlockList().push_back(skip);
1804  irb.SetInsertPoint(skip);
1805  }
1806 }
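// Note: for a nullable argument the blocks created above give the generated
// code roughly this shape (pseudocode of the emitted IR):
//
//   if (val == inline_null_value)        // FCmpOEQ / ICmpEQ depending on the type
//     goto skip_approx_median;
//   calc_approx_median:
//     agg_approx_median(agg_slot, (double)val);  // val cast to fp first if needed
//   skip_approx_median:
//     ;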
1807 
1808 llvm::Value* GroupByAndAggregate::getAdditionalLiteral(const int32_t off) {
1809  CHECK_LT(off, 0);
1810  const auto lit_buff_lv = get_arg_by_name(ROW_FUNC, "literals");
1811  return LL_BUILDER.CreateLoad(LL_BUILDER.CreateGEP(
1812  LL_BUILDER.CreateBitCast(lit_buff_lv,
1813  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)),
1814  LL_INT(off)));
1815 }
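// Note: off is negative, so this loads the off-th 64-bit word *before* the start
// of the literal buffer; codegenCountDistinct above passes -1 for the device
// bitmap base address and -2 for the host base address. Equivalent C++ for the
// emitted load (illustrative only):
//
//   const int64_t* lits = reinterpret_cast<const int64_t*>(literal_buffer);
//   int64_t extra = lits[off];  // off < 0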
1816 
1817 std::vector<llvm::Value*> GroupByAndAggregate::codegenAggArg(
1818  const Analyzer::Expr* target_expr,
1819  const CompilationOptions& co) {
1820  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1821  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
1822  const auto func_expr = dynamic_cast<const Analyzer::FunctionOper*>(target_expr);
1823  const auto arr_expr = dynamic_cast<const Analyzer::ArrayExpr*>(target_expr);
1824 
1825  // TODO(alex): handle arrays uniformly?
1826  CodeGenerator code_generator(executor_);
1827  if (target_expr) {
1828  const auto& target_ti = target_expr->get_type_info();
1829  if (target_ti.is_buffer() &&
1830  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
1831  const auto target_lvs =
1832  agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
1833  : code_generator.codegen(
1834  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
1835  if (!func_expr && !arr_expr) {
1836  // Anything coming through the chunk transport here was generated from a
1837  // source other than an ARRAY[] expression
1838  if (target_ti.is_bytes()) {
1839  CHECK_EQ(size_t(3), target_lvs.size());
1840  return {target_lvs[1], target_lvs[2]};
1841  }
1842  CHECK(target_ti.is_array());
1843  CHECK_EQ(size_t(1), target_lvs.size());
1844  CHECK(!agg_expr || agg_expr->get_aggtype() == kSAMPLE);
1845  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
1846  const auto i8p_ty =
1847  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
1848  const auto& elem_ti = target_ti.get_elem_type();
1849  return {
1850  executor_->cgen_state_->emitExternalCall(
1851  "array_buff",
1852  i8p_ty,
1853  {target_lvs.front(), code_generator.posArg(target_expr)}),
1854  executor_->cgen_state_->emitExternalCall(
1855  "array_size",
1856  i32_ty,
1857  {target_lvs.front(),
1858  code_generator.posArg(target_expr),
1859  executor_->cgen_state_->llInt(log2_bytes(elem_ti.get_logical_size()))})};
1860  } else {
1861  if (agg_expr) {
1862  throw std::runtime_error(
1863  "Using array[] operator as argument to an aggregate operator is not "
1864  "supported");
1865  }
1866  CHECK(func_expr || arr_expr);
1867  if (dynamic_cast<const Analyzer::FunctionOper*>(target_expr)) {
1868  CHECK_EQ(size_t(1), target_lvs.size());
1869  const auto prefix = target_ti.get_buffer_name();
1870  CHECK(target_ti.is_array() || target_ti.is_bytes());
1871  const auto target_lv = LL_BUILDER.CreateLoad(target_lvs[0]);
1872  // const auto target_lv_type = target_lvs[0]->getType();
1873  // CHECK(target_lv_type->isStructTy());
1874  // CHECK_EQ(target_lv_type->getNumContainedTypes(), 3u);
1875  const auto i8p_ty = llvm::PointerType::get(
1876  get_int_type(8, executor_->cgen_state_->context_), 0);
1877  const auto ptr = LL_BUILDER.CreatePointerCast(
1878  LL_BUILDER.CreateExtractValue(target_lv, 0), i8p_ty);
1879  const auto size = LL_BUILDER.CreateExtractValue(target_lv, 1);
1880  const auto null_flag = LL_BUILDER.CreateExtractValue(target_lv, 2);
1881  const auto nullcheck_ok_bb =
1882  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_nullcheck_ok_bb", CUR_FUNC);
1883  const auto nullcheck_fail_bb = llvm::BasicBlock::Create(
1884  LL_CONTEXT, prefix + "_nullcheck_fail_bb", CUR_FUNC);
1885 
1886  // TODO(adb): probably better to zext the bool
1887  const auto nullcheck = LL_BUILDER.CreateICmpEQ(
1888  null_flag, executor_->cgen_state_->llInt(static_cast<int8_t>(1)));
1889  LL_BUILDER.CreateCondBr(nullcheck, nullcheck_fail_bb, nullcheck_ok_bb);
1890 
1891  const auto ret_bb =
1892  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_return", CUR_FUNC);
1893  LL_BUILDER.SetInsertPoint(ret_bb);
1894  auto result_phi = LL_BUILDER.CreatePHI(i8p_ty, 2, prefix + "_ptr_return");
1895  result_phi->addIncoming(ptr, nullcheck_ok_bb);
1896  const auto null_arr_sentinel = LL_BUILDER.CreateIntToPtr(
1897  executor_->cgen_state_->llInt(static_cast<int8_t>(0)), i8p_ty);
1898  result_phi->addIncoming(null_arr_sentinel, nullcheck_fail_bb);
1899  LL_BUILDER.SetInsertPoint(nullcheck_ok_bb);
1900  executor_->cgen_state_->emitExternalCall(
1901  "register_buffer_with_executor_rsm",
1902  llvm::Type::getVoidTy(executor_->cgen_state_->context_),
1903  {executor_->cgen_state_->llInt(reinterpret_cast<int64_t>(executor_)), ptr});
1904  LL_BUILDER.CreateBr(ret_bb);
1905  LL_BUILDER.SetInsertPoint(nullcheck_fail_bb);
1906  LL_BUILDER.CreateBr(ret_bb);
1907 
1908  LL_BUILDER.SetInsertPoint(ret_bb);
1909  return {result_phi, size};
1910  }
1911  CHECK_EQ(size_t(2), target_lvs.size());
1912  return {target_lvs[0], target_lvs[1]};
1913  }
1914  }
1915  if (target_ti.is_geometry() &&
1916  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
1917  auto generate_coord_lvs =
1918  [&](auto* selected_target_expr,
1919  bool const fetch_columns) -> std::vector<llvm::Value*> {
1920  const auto target_lvs =
1921  code_generator.codegen(selected_target_expr, fetch_columns, co);
1922  const auto geo_uoper = dynamic_cast<const Analyzer::GeoUOper*>(target_expr);
1923  const auto geo_binoper = dynamic_cast<const Analyzer::GeoBinOper*>(target_expr);
1924  if (geo_uoper || geo_binoper) {
1925  CHECK(target_expr->get_type_info().is_geometry());
1926  CHECK_EQ(2 * static_cast<size_t>(target_ti.get_physical_coord_cols()),
1927  target_lvs.size());
1928  return target_lvs;
1929  }
1930  CHECK_EQ(static_cast<size_t>(target_ti.get_physical_coord_cols()),
1931  target_lvs.size());
1932 
1933  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
1934  const auto i8p_ty =
1935  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
1936  std::vector<llvm::Value*> coords;
1937  size_t ctr = 0;
1938  for (const auto& target_lv : target_lvs) {
1939  // TODO(adb): consider adding a utility to sqltypes so we can get the types of
1940  // the physical coords cols based on the sqltype (e.g. TINYINT for col 0, INT
1941  // for col 1 for pols / mpolys, etc). Hardcoding for now: the first array is
1942  // the coords array (TINYINT); subsequent arrays are regular INT.
1943 
1944  const size_t elem_sz = ctr == 0 ? 1 : 4;
1945  ctr++;
1946  int32_t fixlen = -1;
1947  if (target_ti.get_type() == kPOINT) {
1948  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
1949  if (col_var) {
1950  const auto coords_cd = executor_->getPhysicalColumnDescriptor(col_var, 1);
1951  if (coords_cd && coords_cd->columnType.get_type() == kARRAY) {
1952  fixlen = coords_cd->columnType.get_size();
1953  }
1954  }
1955  }
1956  if (fixlen > 0) {
1957  coords.push_back(executor_->cgen_state_->emitExternalCall(
1958  "fast_fixlen_array_buff",
1959  i8p_ty,
1960  {target_lv, code_generator.posArg(selected_target_expr)}));
1961  coords.push_back(executor_->cgen_state_->llInt(int64_t(fixlen)));
1962  continue;
1963  }
1964  coords.push_back(executor_->cgen_state_->emitExternalCall(
1965  "array_buff",
1966  i8p_ty,
1967  {target_lv, code_generator.posArg(selected_target_expr)}));
1968  coords.push_back(executor_->cgen_state_->emitExternalCall(
1969  "array_size",
1970  i32_ty,
1971  {target_lv,
1972  code_generator.posArg(selected_target_expr),
1973  executor_->cgen_state_->llInt(log2_bytes(elem_sz))}));
1974  }
1975  return coords;
1976  };
1977 
1978  if (agg_expr) {
1979  return generate_coord_lvs(agg_expr->get_arg(), true);
1980  } else {
1981  return generate_coord_lvs(target_expr,
1982  !executor_->plan_state_->allow_lazy_fetch_);
1983  }
1984  }
1985  }
1986  return agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
1987  : code_generator.codegen(
1988  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
1989 }
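// Note: the shape of the returned vector depends on the target type: arrays from
// the chunk transport yield {buffer ptr, size}, none-encoded strings yield the
// {ptr, size} pair extracted from the 3-element chunk result, ARRAY[]/function
// results yield {ptr, size} with a null-pointer sentinel on the null branch,
// geometry yields one {coords ptr, size} pair per physical coordinate column,
// and everything else is simply the result of CodeGenerator::codegen on the
// expression.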
1990 
1991 llvm::Value* GroupByAndAggregate::emitCall(const std::string& fname,
1992  const std::vector<llvm::Value*>& args) {
1993  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1994  return executor_->cgen_state_->emitCall(fname, args);
1995 }
1996 
1997 void GroupByAndAggregate::checkErrorCode(llvm::Value* retCode) {
1998  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1999  auto zero_const = llvm::ConstantInt::get(retCode->getType(), 0, true);
2000  auto rc_check_condition = executor_->cgen_state_->ir_builder_.CreateICmp(
2001  llvm::ICmpInst::ICMP_EQ, retCode, zero_const);
2002 
2003  executor_->cgen_state_->emitErrorCheck(rc_check_condition, retCode, "rc");
2004 }
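// Note: emitErrorCheck branches on the comparison built here, so the generated
// code is roughly equivalent to:
//
//   if (retCode != 0) return retCode;  // propagate the runtime error code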
2005 
2006 #undef CUR_FUNC
2007 #undef ROW_FUNC
2008 #undef LL_FP
2009 #undef LL_INT
2010 #undef LL_BOOL
2011 #undef LL_BUILDER
2012 #undef LL_CONTEXT
2013 
2014 size_t GroupByAndAggregate::shard_count_for_top_groups(
2015     const RelAlgExecutionUnit& ra_exe_unit,
2016  const Catalog_Namespace::Catalog& catalog) {
2017  if (ra_exe_unit.sort_info.order_entries.size() != 1 || !ra_exe_unit.sort_info.limit) {
2018  return 0;
2019  }
2020  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
2021  const auto grouped_col_expr =
2022  dynamic_cast<const Analyzer::ColumnVar*>(group_expr.get());
2023  if (!grouped_col_expr) {
2024  continue;
2025  }
2026  if (grouped_col_expr->get_table_id() <= 0) {
2027  return 0;
2028  }
2029  const auto td = catalog.getMetadataForTable(grouped_col_expr->get_table_id());
2030  if (td->shardedColumnId == grouped_col_expr->get_column_id()) {
2031  return td->nShards;
2032  }
2033  }
2034  return 0;
2035 }
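// Note: a non-zero shard count is returned only for the sharded top-k pattern,
// i.e. exactly one ORDER BY entry, a LIMIT, and a GROUP BY on the table's shard
// column; every other shape falls through to 0 and gets no per-shard top-k
// treatment.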