OmniSciDB  a667adc9c8
GroupByAndAggregate.cpp
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "GroupByAndAggregate.h"
18 #include "AggregateUtils.h"
19 
20 #include "CardinalityEstimator.h"
21 #include "CodeGenerator.h"
23 #include "ExpressionRange.h"
24 #include "ExpressionRewrite.h"
25 #include "GpuInitGroups.h"
26 #include "InPlaceSort.h"
28 #include "MaxwellCodegenPatch.h"
30 #include "TargetExprBuilder.h"
31 
32 #include "../CudaMgr/CudaMgr.h"
33 #include "../Shared/checked_alloc.h"
34 #include "../Shared/funcannotations.h"
35 #include "../Utils/ChunkIter.h"
37 #include "Execute.h"
38 #include "QueryTemplateGenerator.h"
39 #include "RuntimeFunctions.h"
40 #include "StreamingTopN.h"
41 #include "TopKSort.h"
42 #include "WindowContext.h"
43 
44 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
45 
46 #include <cstring> // strcat()
47 #include <numeric>
48 #include <string_view>
49 #include <thread>
50 
51 bool g_cluster{false};
52 bool g_bigint_count{false};
54 extern size_t g_leaf_count;
55 
56 namespace {
57 
58 int32_t get_agg_count(const std::vector<Analyzer::Expr*>& target_exprs) {
59  int32_t agg_count{0};
60  for (auto target_expr : target_exprs) {
61  CHECK(target_expr);
62  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
63  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
64  const auto& ti = target_expr->get_type_info();
65  // TODO(pavan): or if is_geometry()
66  if (ti.is_buffer()) {
67  agg_count += 2;
68  } else if (ti.is_geometry()) {
69  agg_count += ti.get_physical_coord_cols() * 2;
70  } else {
71  ++agg_count;
72  }
73  continue;
74  }
75  if (agg_expr && agg_expr->get_aggtype() == kAVG) {
76  agg_count += 2;
77  } else {
78  ++agg_count;
79  }
80  }
81  return agg_count;
82 }
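A rough worked example of the counting rules above, using an assumed target list (illustration only, not part of this file):
 // Assumed targets: AVG(x), COUNT(*), SAMPLE(arr)
 //   AVG(x)      -> 2 slots (sum and count are accumulated separately)
 //   COUNT(*)    -> 1 slot
 //   SAMPLE(arr) -> 2 slots (the array type satisfies ti.is_buffer())
 // get_agg_count would therefore report 5 aggregate slots for this list.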
83 
84 bool expr_is_rowid(const Analyzer::Expr* expr, const Catalog_Namespace::Catalog& cat) {
85  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
86  if (!col) {
87  return false;
88  }
89  const auto cd =
90  get_column_descriptor_maybe(col->get_column_id(), col->get_table_id(), cat);
91  if (!cd || !cd->isVirtualCol) {
92  return false;
93  }
94  CHECK_EQ("rowid", cd->columnName);
95  return true;
96 }
97 
98 bool has_count_distinct(const RelAlgExecutionUnit& ra_exe_unit) {
99  for (const auto& target_expr : ra_exe_unit.target_exprs) {
100  const auto agg_info = get_target_info(target_expr, g_bigint_count);
101  if (agg_info.is_agg && is_distinct_target(agg_info)) {
102  return true;
103  }
104  }
105  return false;
106 }
107 
108 bool is_column_range_too_big_for_perfect_hash(const ColRangeInfo& col_range_info,
109  const int64_t max_entry_count) {
110  try {
111  return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
112  checked_int64_t(col_range_info.min)) >= max_entry_count;
113  } catch (...) {
114  return true;
115  }
116 }
117 
118 bool cardinality_estimate_less_than_column_range(const int64_t cardinality_estimate,
119  const ColRangeInfo& col_range_info) {
120  try {
121  // the cardinality estimate is the size of the baseline hash table. further penalize
122  // the baseline hash table by a factor of 2x due to overhead in computing baseline
123  // hash. This has the overall effect of penalizing baseline hash over perfect hash by
124  // 4x; i.e. if the cardinality of the filtered data is less than 25% of the entry
125  // count of the column, we use baseline hash on the filtered set
126  return checked_int64_t(cardinality_estimate) * 2 <
127  static_cast<int64_t>(checked_int64_t(col_range_info.max) -
128  checked_int64_t(col_range_info.min));
129  } catch (...) {
130  return false;
131  }
132 }
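A worked example of the comparison above, applying the penalty described in the comment (assumed numbers, not from the file):
 // Assumed column range: max - min = 1,000,000.
 // A cardinality estimate of 200,000 gives 200,000 * 2 = 400,000 < 1,000,000,
 // so the filtered set is considered small enough to prefer baseline hash.
 // An estimate of 600,000 gives 1,200,000 >= 1,000,000, so perfect hash is kept.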
133 
134 } // namespace
135 
136 ColRangeInfo GroupByAndAggregate::getColRangeInfo() {
137  // Use baseline layout more eagerly on the GPU if the query uses count distinct,
138  // because our HyperLogLog implementation is 4x less memory efficient on GPU.
139  // Technically, this only applies to APPROX_COUNT_DISTINCT, but in practice we
140  // can expect this to be true anyway for grouped queries since the precise version
141  // uses significantly more memory.
142  const int64_t baseline_threshold =
143  has_count_distinct(ra_exe_unit_)
144  ? (device_type_ == ExecutorDeviceType::GPU ? (Executor::baseline_threshold / 4)
145  : Executor::baseline_threshold)
146  : Executor::baseline_threshold;
147  if (ra_exe_unit_.groupby_exprs.size() != 1) {
148  try {
149  checked_int64_t cardinality{1};
150  bool has_nulls{false};
151  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
152  auto col_range_info = getExprRangeInfo(groupby_expr.get());
153  if (col_range_info.hash_type_ != QueryDescriptionType::GroupByPerfectHash) {
154  // going through baseline hash if a non-integer type is encountered
155  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
156  }
157  auto crt_col_cardinality = getBucketedCardinality(col_range_info);
158  CHECK_GE(crt_col_cardinality, 0);
159  cardinality *= crt_col_cardinality;
160  if (col_range_info.has_nulls) {
161  has_nulls = true;
162  }
163  }
164  // For zero or high cardinalities, use baseline layout.
165  if (!cardinality || cardinality > baseline_threshold) {
166  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
167  }
168  return {QueryDescriptionType::GroupByPerfectHash,
169  0,
170  int64_t(cardinality),
171  0,
172  has_nulls};
173  } catch (...) { // overflow when computing cardinality
174  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
175  }
176  }
177  // For single column groupby on high timestamps, force baseline hash due to wide ranges
178  // we are likely to encounter when applying quals to the expression range
179  // TODO: consider allowing TIMESTAMP(9) (nanoseconds) with quals to use perfect hash if
180  // the range is small enough
181  if (ra_exe_unit_.groupby_exprs.front() &&
182  ra_exe_unit_.groupby_exprs.front()->get_type_info().is_high_precision_timestamp() &&
183  ra_exe_unit_.simple_quals.size() > 0) {
184  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
185  }
186  const auto col_range_info = getExprRangeInfo(ra_exe_unit_.groupby_exprs.front().get());
187  if (!ra_exe_unit_.groupby_exprs.front()) {
188  return col_range_info;
189  }
190  static const int64_t MAX_BUFFER_SIZE = 1 << 30;
191  const int64_t col_count =
192  ra_exe_unit_.groupby_exprs.size() + ra_exe_unit_.target_exprs.size();
193  int64_t max_entry_count = MAX_BUFFER_SIZE / (col_count * sizeof(int64_t));
194  if (has_count_distinct(ra_exe_unit_)) {
195  max_entry_count = std::min(max_entry_count, baseline_threshold);
196  }
197  const auto& groupby_expr_ti = ra_exe_unit_.groupby_exprs.front()->get_type_info();
198  if (groupby_expr_ti.is_string() && !col_range_info.bucket) {
199  CHECK(groupby_expr_ti.get_compression() == kENCODING_DICT);
200 
201  const bool has_filters =
202  !ra_exe_unit_.quals.empty() || !ra_exe_unit_.simple_quals.empty();
203  if (has_filters &&
204  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
205  // if filters are present, we can use the filter to narrow the cardinality of the
206  // group by in the case of ranges too big for perfect hash. Otherwise, we are better
207  // off attempting perfect hash (since we know the range will be made of
208  // monotonically increasing numbers from min to max for dictionary encoded strings)
209  // and failing later due to excessive memory use.
210  // Check the conditions where baseline hash can provide a performance increase and
211  // return baseline hash (potentially forcing an estimator query) as the range type.
212  // Otherwise, return col_range_info which will likely be perfect hash, though could
213  // be baseline from a previous call of this function prior to the estimator query.
214  if (!ra_exe_unit_.sort_info.order_entries.empty()) {
215  // TODO(adb): allow some sorts to pass through this block by centralizing sort
216  // algorithm decision making
217  if (has_count_distinct(ra_exe_unit_) &&
218  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
219  // always use baseline hash for column range too big for perfect hash with count
220  // distinct descriptors. We will need 8GB of CPU memory minimum for the perfect
221  // hash group by in this case.
222  return {QueryDescriptionType::GroupByBaselineHash,
223  col_range_info.min,
224  col_range_info.max,
225  0,
226  col_range_info.has_nulls};
227  } else {
228  // use original col range for sort
229  return col_range_info;
230  }
231  }
232  // if filters are present and the filtered range is less than the cardinality of
233  // the column, consider baseline hash
234  if (group_cardinality_estimation_ &&
235  cardinality_estimate_less_than_column_range(*group_cardinality_estimation_,
236  col_range_info)) {
237  return {QueryDescriptionType::GroupByBaselineHash,
238  col_range_info.min,
239  col_range_info.max,
240  0,
241  col_range_info.has_nulls};
242  }
243  }
244  } else if ((!expr_is_rowid(ra_exe_unit_.groupby_exprs.front().get(),
245  *executor_->catalog_)) &&
246  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count) &&
247  !col_range_info.bucket) {
248  return {QueryDescriptionType::GroupByBaselineHash,
249  col_range_info.min,
250  col_range_info.max,
251  0,
252  col_range_info.has_nulls};
253  }
254  return col_range_info;
255 }
256 
257 ColRangeInfo GroupByAndAggregate::getExprRangeInfo(const Analyzer::Expr* expr) const {
258  if (!expr) {
259  return {QueryDescriptionType::Projection, 0, 0, 0, false};
260  }
261 
262  const auto expr_range = getExpressionRange(
263  expr, query_infos_, executor_, boost::make_optional(ra_exe_unit_.simple_quals));
264  switch (expr_range.getType()) {
265  case ExpressionRangeType::Integer: {
266  if (expr_range.getIntMin() > expr_range.getIntMax()) {
267  return {
268  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
269  }
270  return {QueryDescriptionType::GroupByPerfectHash,
271  expr_range.getIntMin(),
272  expr_range.getIntMax(),
273  expr_range.getBucket(),
274  expr_range.hasNulls()};
275  }
276  case ExpressionRangeType::Float:
277  case ExpressionRangeType::Double: {
278  if (expr_range.getFpMin() > expr_range.getFpMax()) {
279  return {
280  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
281  }
282  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
283  }
284  case ExpressionRangeType::Invalid:
285  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
286  default:
287  CHECK(false);
288  }
289  CHECK(false);
290  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
291 }
292 
293 int64_t GroupByAndAggregate::getBucketedCardinality(const ColRangeInfo& col_range_info) {
294  checked_int64_t crt_col_cardinality =
295  checked_int64_t(col_range_info.max) - checked_int64_t(col_range_info.min);
296  if (col_range_info.bucket) {
297  crt_col_cardinality /= col_range_info.bucket;
298  }
299  return static_cast<int64_t>(crt_col_cardinality +
300  (1 + (col_range_info.has_nulls ? 1 : 0)));
301 }
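A minimal standalone sketch of the arithmetic above, with assumed values (min = 10, max = 1000, bucket = 10, has_nulls = true); it mirrors the function but is not part of the file:
 #include <cstdint>
 #include <iostream>

 int main() {
   const int64_t min = 10, max = 1000, bucket = 10;
   const bool has_nulls = true;
   int64_t cardinality = (max - min) / bucket;   // 99 buckets between the bounds
   cardinality += 1 + (has_nulls ? 1 : 0);       // +1 for the inclusive upper bound, +1 for NULL
   std::cout << cardinality << '\n';             // prints 101
   return 0;
 }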
302 
303 #define LL_CONTEXT executor_->cgen_state_->context_
304 #define LL_BUILDER executor_->cgen_state_->ir_builder_
305 #define LL_BOOL(v) executor_->cgen_state_->llBool(v)
306 #define LL_INT(v) executor_->cgen_state_->llInt(v)
307 #define LL_FP(v) executor_->cgen_state_->llFp(v)
308 #define ROW_FUNC executor_->cgen_state_->row_func_
309 #define CUR_FUNC executor_->cgen_state_->current_func_
310 
311 GroupByAndAggregate::GroupByAndAggregate(
312  Executor* executor,
313  const ExecutorDeviceType device_type,
314  const RelAlgExecutionUnit& ra_exe_unit,
315  const std::vector<InputTableInfo>& query_infos,
316  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
317  const std::optional<int64_t>& group_cardinality_estimation)
318  : executor_(executor)
319  , ra_exe_unit_(ra_exe_unit)
320  , query_infos_(query_infos)
321  , row_set_mem_owner_(row_set_mem_owner)
322  , device_type_(device_type)
323  , group_cardinality_estimation_(group_cardinality_estimation) {
324  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
325  if (!groupby_expr) {
326  continue;
327  }
328  const auto& groupby_ti = groupby_expr->get_type_info();
329  if (groupby_ti.is_bytes()) {
330  throw std::runtime_error(
331  "Cannot group by string columns which are not dictionary encoded.");
332  }
333  if (groupby_ti.is_buffer()) {
334  throw std::runtime_error("Group by buffer not supported");
335  }
336  if (groupby_ti.is_geometry()) {
337  throw std::runtime_error("Group by geometry not supported");
338  }
339  }
340 }
341 
342 int64_t GroupByAndAggregate::getShardedTopBucket(const ColRangeInfo& col_range_info,
343  const size_t shard_count) const {
344  size_t device_count{0};
345  if (device_type_ == ExecutorDeviceType::GPU) {
346  auto cuda_mgr = executor_->getCatalog()->getDataMgr().getCudaMgr();
347  CHECK(cuda_mgr);
348  device_count = executor_->getCatalog()->getDataMgr().getCudaMgr()->getDeviceCount();
349  CHECK_GT(device_count, 0u);
350  }
351 
352  int64_t bucket{col_range_info.bucket};
353 
354  if (shard_count) {
355  CHECK(!col_range_info.bucket);
356  /*
357  when a node has fewer devices than shard count,
358  a) In a distributed setup, the minimum distance between two keys would be
359  device_count because shards are stored consecutively across the physical tables,
360  i.e if a shard column has values 0 to 9, and 3 shards on each leaf, then node 1
361  would have values: 0,1,2,6,7,8 and node 2 would have values: 3,4,5,9. If each leaf
362  node has only 1 device, then in this case all of a node's keys are loaded onto
363  that node's single device.
364 
365  b) In a single node setup, the distance would be the minimum of device_count or
366  the difference shard_count - device_count. For example: If a single node server
367  running on 3 devices a shard column has values 0 to 9 in a table with 4 shards,
368  device to fragment keys mapping would be: device 1 - 4,8,3,7 device 2 - 1,5,9
369  device 3 - 2, 6 The bucket value would be 4(shards) - 3(devices) = 1 i.e. minimum
370  of device_count or difference.
371 
372  When a node has device count equal to or more than shard count then the
373  minimum distance is always at least shard_count * no of leaf nodes.
374  */
375  if (device_count < shard_count) {
376  bucket = g_leaf_count ? std::max(device_count, static_cast<size_t>(1))
377  : std::min(device_count, shard_count - device_count);
378  } else {
379  bucket = shard_count * std::max(g_leaf_count, static_cast<size_t>(1));
380  }
381  }
382 
383  return bucket;
384 }
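Two worked examples of the bucket selection above, with assumed device and shard counts:
 // Single node (g_leaf_count == 0), 3 devices, 4 shards:
 //   device_count < shard_count, so bucket = min(3, 4 - 3) = 1.
 // Single node, 4 devices, 4 shards:
 //   device_count >= shard_count, so bucket = 4 * max(g_leaf_count, 1) = 4.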
385 
386 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptor(
387  const bool allow_multifrag,
388  const size_t max_groups_buffer_entry_count,
389  const int8_t crt_min_byte_width,
390  RenderInfo* render_info,
391  const bool output_columnar_hint) {
392  const auto shard_count =
393  device_type_ == ExecutorDeviceType::GPU
394  ? shard_count_for_top_groups(ra_exe_unit_, *executor_->getCatalog())
395  : 0;
396  bool sort_on_gpu_hint =
397  device_type_ == ExecutorDeviceType::GPU && allow_multifrag &&
400  // must_use_baseline_sort is true iff we'd sort on GPU with the old algorithm
401  // but the total output buffer size would be too big or it's a sharded top query.
402  // For the sake of managing risk, use the new result set way very selectively for
403  // this case only (alongside the baseline layout we've enabled for a while now).
404  bool must_use_baseline_sort = shard_count;
405  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
406  while (true) {
407  query_mem_desc = initQueryMemoryDescriptorImpl(allow_multifrag,
408  max_groups_buffer_entry_count,
409  crt_min_byte_width,
410  sort_on_gpu_hint,
411  render_info,
412  must_use_baseline_sort,
413  output_columnar_hint);
414  CHECK(query_mem_desc);
415  if (query_mem_desc->sortOnGpu() &&
416  (query_mem_desc->getBufferSizeBytes(device_type_) +
417  align_to_int64(query_mem_desc->getEntryCount() * sizeof(int32_t))) >
418  2 * 1024 * 1024 * 1024LL) {
419  must_use_baseline_sort = true;
420  sort_on_gpu_hint = false;
421  } else {
422  break;
423  }
424  }
425  return query_mem_desc;
426 }
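The retry loop above abandons GPU sort when the output would not fit in the 2GB budget; a rough example with assumed sizes:
 // Assume 200,000,000 entries with a 16-byte row:
 //   getBufferSizeBytes(...)          ~ 3.2 GB
 //   align_to_int64(entries * 4 B)    ~ 0.8 GB   (int32 index used for the sort)
 // 4.0 GB > 2 GB, so sort_on_gpu_hint is cleared, must_use_baseline_sort is set,
 // and the descriptor is rebuilt once more before the loop exits.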
427 
428 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptorImpl(
429  const bool allow_multifrag,
430  const size_t max_groups_buffer_entry_count,
431  const int8_t crt_min_byte_width,
432  const bool sort_on_gpu_hint,
433  RenderInfo* render_info,
434  const bool must_use_baseline_sort,
435  const bool output_columnar_hint) {
437 
438  const auto count_distinct_descriptors = initCountDistinctDescriptors();
439 
440  auto group_col_widths = get_col_byte_widths(ra_exe_unit_.groupby_exprs);
441 
442  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
443 
444  auto col_range_info_nosharding = getColRangeInfo();
445 
446  const auto shard_count =
447  device_type_ == ExecutorDeviceType::GPU
448  ? shard_count_for_top_groups(ra_exe_unit_, *executor_->getCatalog())
449  : 0;
450 
451  const auto col_range_info =
452  ColRangeInfo{col_range_info_nosharding.hash_type_,
453  col_range_info_nosharding.min,
454  col_range_info_nosharding.max,
455  getShardedTopBucket(col_range_info_nosharding, shard_count),
456  col_range_info_nosharding.has_nulls};
457 
458  // Non-grouped aggregates do not support accessing aggregated ranges
459  // Keyless hash is currently only supported with single-column perfect hash
460  const auto keyless_info = !(is_group_by && col_range_info.hash_type_ ==
461  QueryDescriptionType::GroupByPerfectHash)
462  ? KeylessInfo{false, -1}
463  : getKeylessInfo(ra_exe_unit_.target_exprs, is_group_by);
464 
465  if (g_enable_watchdog &&
466  ((col_range_info.hash_type_ == QueryDescriptionType::GroupByBaselineHash &&
467  max_groups_buffer_entry_count > 120000000) ||
468  (col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
469  ra_exe_unit_.groupby_exprs.size() == 1 &&
470  (col_range_info.max - col_range_info.min) /
471  std::max(col_range_info.bucket, int64_t(1)) >
472  130000000))) {
473  throw WatchdogException("Query would use too much memory");
474  }
475  try {
476  return QueryMemoryDescriptor::init(executor_,
477  ra_exe_unit_,
478  query_infos_,
479  col_range_info,
480  keyless_info,
481  allow_multifrag,
482  device_type_,
483  crt_min_byte_width,
484  sort_on_gpu_hint,
485  shard_count,
486  max_groups_buffer_entry_count,
487  render_info,
488  count_distinct_descriptors,
489  must_use_baseline_sort,
490  output_columnar_hint,
491  /*streaming_top_n_hint=*/true);
492  } catch (const StreamingTopNOOM& e) {
493  LOG(WARNING) << e.what() << " Disabling Streaming Top N.";
494  return QueryMemoryDescriptor::init(executor_,
495  ra_exe_unit_,
496  query_infos_,
497  col_range_info,
498  keyless_info,
499  allow_multifrag,
500  device_type_,
501  crt_min_byte_width,
502  sort_on_gpu_hint,
503  shard_count,
504  max_groups_buffer_entry_count,
505  render_info,
506  count_distinct_descriptors,
507  must_use_baseline_sort,
508  output_columnar_hint,
509  /*streaming_top_n_hint=*/false);
510  }
511 }
512 
515 }
516 
517 namespace {
518 
520  const Analyzer::Expr* expr,
521  Executor* executor,
522  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner) {
523  if (!expr) {
524  return;
525  }
526 
527  const auto array_expr = dynamic_cast<const Analyzer::ArrayExpr*>(expr);
528  if (array_expr) {
529  for (size_t i = 0; i < array_expr->getElementCount(); i++) {
531  array_expr->getElement(i), executor, row_set_mem_owner);
532  }
533  return;
534  }
535 
536  const auto cast_expr = dynamic_cast<const Analyzer::UOper*>(expr);
537  const auto& expr_ti = expr->get_type_info();
538  if (cast_expr && cast_expr->get_optype() == kCAST && expr_ti.is_string()) {
539  CHECK_EQ(kENCODING_DICT, expr_ti.get_compression());
540  auto sdp = executor->getStringDictionaryProxy(
541  expr_ti.get_comp_param(), row_set_mem_owner, true);
542  CHECK(sdp);
543  const auto str_lit_expr =
544  dynamic_cast<const Analyzer::Constant*>(cast_expr->get_operand());
545  if (str_lit_expr && str_lit_expr->get_constval().stringval) {
546  sdp->getOrAddTransient(*str_lit_expr->get_constval().stringval);
547  }
548  return;
549  }
550  const auto case_expr = dynamic_cast<const Analyzer::CaseExpr*>(expr);
551  if (!case_expr) {
552  return;
553  }
554  Analyzer::DomainSet domain_set;
555  case_expr->get_domain(domain_set);
556  if (domain_set.empty()) {
557  return;
558  }
559  if (expr_ti.is_string()) {
560  CHECK_EQ(kENCODING_DICT, expr_ti.get_compression());
561  auto sdp = executor->getStringDictionaryProxy(
562  expr_ti.get_comp_param(), row_set_mem_owner, true);
563  CHECK(sdp);
564  for (const auto domain_expr : domain_set) {
565  const auto cast_expr = dynamic_cast<const Analyzer::UOper*>(domain_expr);
566  const auto str_lit_expr =
567  cast_expr && cast_expr->get_optype() == kCAST
568  ? dynamic_cast<const Analyzer::Constant*>(cast_expr->get_operand())
569  : dynamic_cast<const Analyzer::Constant*>(domain_expr);
570  if (str_lit_expr && str_lit_expr->get_constval().stringval) {
571  sdp->getOrAddTransient(*str_lit_expr->get_constval().stringval);
572  }
573  }
574  }
575 }
576 
577 } // namespace
578 
580  const RelAlgExecutionUnit& ra_exe_unit,
581  Executor* executor,
582  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner) {
583  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
585  group_expr.get(), executor, row_set_mem_owner);
586  }
587  for (const auto target_expr : ra_exe_unit.target_exprs) {
588  const auto& target_type = target_expr->get_type_info();
589  if (target_type.is_string() && target_type.get_compression() != kENCODING_DICT) {
590  continue;
591  }
592  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
593  if (agg_expr) {
594  if (agg_expr->get_aggtype() == kSINGLE_VALUE ||
595  agg_expr->get_aggtype() == kSAMPLE) {
597  agg_expr->get_arg(), executor, row_set_mem_owner);
598  }
599  } else {
601  target_expr, executor, row_set_mem_owner);
602  }
603  }
604 }
605 
606 CountDistinctDescriptors GroupByAndAggregate::initCountDistinctDescriptors() {
607  CountDistinctDescriptors count_distinct_descriptors;
608  for (const auto target_expr : ra_exe_unit_.target_exprs) {
609  auto agg_info = get_target_info(target_expr, g_bigint_count);
610  if (is_distinct_target(agg_info)) {
611  CHECK(agg_info.is_agg);
612  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
613  const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
614  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
615  if (arg_ti.is_bytes()) {
616  throw std::runtime_error(
617  "Strings must be dictionary-encoded for COUNT(DISTINCT).");
618  }
619  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_buffer()) {
620  throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
621  }
622  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
623  throw std::runtime_error(
624  "APPROX_COUNT_DISTINCT on geometry columns not supported");
625  }
626  if (agg_info.is_distinct && arg_ti.is_geometry()) {
627  throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
628  }
629  ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
630  auto arg_range_info =
631  arg_ti.is_fp() ? no_range_info : getExprRangeInfo(agg_expr->get_arg());
632  CountDistinctImplType count_distinct_impl_type{CountDistinctImplType::StdSet};
633  int64_t bitmap_sz_bits{0};
634  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
635  const auto error_rate = agg_expr->get_error_rate();
636  if (error_rate) {
637  CHECK(error_rate->get_type_info().get_type() == kINT);
638  CHECK_GE(error_rate->get_constval().intval, 1);
639  bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
640  } else {
641  bitmap_sz_bits = g_hll_precision_bits;
642  }
643  }
644  if (arg_range_info.isEmpty()) {
645  count_distinct_descriptors.emplace_back(
646  CountDistinctDescriptor{CountDistinctImplType::Bitmap,
647  0,
648  64,
649  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
650  device_type_,
651  1});
652  continue;
653  }
654  if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
655  !(arg_ti.is_buffer() || arg_ti.is_geometry())) { // TODO(alex): allow bitmap
656  // implementation for arrays
657  count_distinct_impl_type = CountDistinctImplType::Bitmap;
658  if (agg_info.agg_kind == kCOUNT) {
659  bitmap_sz_bits = arg_range_info.max - arg_range_info.min + 1;
660  const int64_t MAX_BITMAP_BITS{8 * 1000 * 1000 * 1000LL};
661  if (bitmap_sz_bits <= 0 || bitmap_sz_bits > MAX_BITMAP_BITS) {
662  count_distinct_impl_type = CountDistinctImplType::StdSet;
663  }
664  }
665  }
666  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
667  count_distinct_impl_type == CountDistinctImplType::StdSet &&
668  !(arg_ti.is_array() || arg_ti.is_geometry())) {
669  count_distinct_impl_type = CountDistinctImplType::Bitmap;
670  }
671 
672  if (g_enable_watchdog && !(arg_range_info.isEmpty()) &&
673  count_distinct_impl_type == CountDistinctImplType::StdSet) {
674  throw WatchdogException("Cannot use a fast path for COUNT distinct");
675  }
676  const auto sub_bitmap_count =
677  get_count_distinct_sub_bitmap_count(bitmap_sz_bits, ra_exe_unit_, device_type_);
678  count_distinct_descriptors.emplace_back(
679  CountDistinctDescriptor{count_distinct_impl_type,
680  arg_range_info.min,
681  bitmap_sz_bits,
682  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
683  device_type_,
684  sub_bitmap_count});
685  } else {
686  count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
687  CountDistinctImplType::Invalid, 0, 0, false, device_type_, 0});
688  }
689  }
690  return count_distinct_descriptors;
691 }
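A few illustrative outcomes of the descriptor selection above, with assumed column ranges:
 // COUNT(DISTINCT i) over an integer range [0, 9,999]:
 //   perfect-hash range, so a Bitmap descriptor with 10,000 bits is chosen.
 // COUNT(DISTINCT i) over a range spanning more than 8 billion values:
 //   the bitmap would exceed MAX_BITMAP_BITS, so it falls back to StdSet
 //   (and throws a WatchdogException when the watchdog is enabled).
 // APPROX_COUNT_DISTINCT(i): a Bitmap descriptor sized via hll_size_for_rate,
 //   using g_hll_precision_bits unless an explicit error rate was supplied.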
692 
702 KeylessInfo GroupByAndAggregate::getKeylessInfo(
703  const std::vector<Analyzer::Expr*>& target_expr_list,
704  const bool is_group_by) const {
705  bool keyless{true}, found{false};
706  int32_t num_agg_expr{0};
707  int32_t index{0};
708  for (const auto target_expr : target_expr_list) {
709  const auto agg_info = get_target_info(target_expr, g_bigint_count);
710  const auto chosen_type = get_compact_type(agg_info);
711  if (agg_info.is_agg) {
712  num_agg_expr++;
713  }
714  if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
715  auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
716  CHECK(agg_expr);
717  const auto arg_expr = agg_arg(target_expr);
718  const bool float_argument_input = takes_float_argument(agg_info);
719  switch (agg_info.agg_kind) {
720  case kAVG:
721  ++index;
722  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
723  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
724  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
725  expr_range_info.hasNulls()) {
726  break;
727  }
728  }
729  found = true;
730  break;
731  case kCOUNT:
732  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
733  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
734  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
735  expr_range_info.hasNulls()) {
736  break;
737  }
738  }
739  found = true;
740  break;
741  case kSUM: {
742  auto arg_ti = arg_expr->get_type_info();
743  if (constrained_not_null(arg_expr, ra_exe_unit_.quals)) {
744  arg_ti.set_notnull(true);
745  }
746  if (!arg_ti.get_notnull()) {
747  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
748  if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
749  !expr_range_info.hasNulls()) {
750  found = true;
751  }
752  } else {
753  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
754  switch (expr_range_info.getType()) {
755  case ExpressionRangeType::Float:
756  case ExpressionRangeType::Double:
757  if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
758  found = true;
759  }
760  break;
761  case ExpressionRangeType::Integer:
762  if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
763  found = true;
764  }
765  break;
766  default:
767  break;
768  }
769  }
770  break;
771  }
772  case kMIN: {
773  CHECK(agg_expr && agg_expr->get_arg());
774  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
775  if (arg_ti.is_string() || arg_ti.is_buffer()) {
776  break;
777  }
778  auto expr_range_info =
779  getExpressionRange(agg_expr->get_arg(), query_infos_, executor_);
780  auto init_max = get_agg_initial_val(agg_info.agg_kind,
781  chosen_type,
782  is_group_by || float_argument_input,
783  float_argument_input ? sizeof(float) : 8);
784  switch (expr_range_info.getType()) {
785  case ExpressionRangeType::Float:
786  case ExpressionRangeType::Double: {
787  auto double_max =
788  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
789  if (expr_range_info.getFpMax() < double_max) {
790  found = true;
791  }
792  break;
793  }
794  case ExpressionRangeType::Integer:
795  if (expr_range_info.getIntMax() < init_max) {
796  found = true;
797  }
798  break;
799  default:
800  break;
801  }
802  break;
803  }
804  case kMAX: {
805  CHECK(agg_expr && agg_expr->get_arg());
806  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
807  if (arg_ti.is_string() || arg_ti.is_buffer()) {
808  break;
809  }
810  auto expr_range_info =
811  getExpressionRange(agg_expr->get_arg(), query_infos_, executor_);
812  // NULL sentinel and init value for kMAX are identical, which results in
813  // ambiguity in detecting empty keys in presence of nulls.
814  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
815  expr_range_info.hasNulls()) {
816  break;
817  }
818  auto init_min = get_agg_initial_val(agg_info.agg_kind,
819  chosen_type,
820  is_group_by || float_argument_input,
821  float_argument_input ? sizeof(float) : 8);
822  switch (expr_range_info.getType()) {
823  case ExpressionRangeType::Float:
824  case ExpressionRangeType::Double: {
825  auto double_min =
826  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
827  if (expr_range_info.getFpMin() > double_min) {
828  found = true;
829  }
830  break;
831  }
832  case ExpressionRangeType::Integer:
833  if (expr_range_info.getIntMin() > init_min) {
834  found = true;
835  }
836  break;
837  default:
838  break;
839  }
840  break;
841  }
842  default:
843  keyless = false;
844  break;
845  }
846  }
847  if (!keyless) {
848  break;
849  }
850  if (!found) {
851  ++index;
852  }
853  }
854 
855  // shouldn't use keyless for projection only
856  return {
857  keyless && found,
858  index,
859  };
860 }
861 
862 bool GroupByAndAggregate::gpuCanHandleOrderEntries(
863  const std::list<Analyzer::OrderEntry>& order_entries) {
864  if (order_entries.size() > 1) { // TODO(alex): lift this restriction
865  return false;
866  }
867  for (const auto& order_entry : order_entries) {
868  CHECK_GE(order_entry.tle_no, 1);
869  CHECK_LE(static_cast<size_t>(order_entry.tle_no), ra_exe_unit_.target_exprs.size());
870  const auto target_expr = ra_exe_unit_.target_exprs[order_entry.tle_no - 1];
871  if (!dynamic_cast<Analyzer::AggExpr*>(target_expr)) {
872  return false;
873  }
874  // TODO(alex): relax the restrictions
875  auto agg_expr = static_cast<Analyzer::AggExpr*>(target_expr);
876  if (agg_expr->get_is_distinct() || agg_expr->get_aggtype() == kAVG ||
877  agg_expr->get_aggtype() == kMIN || agg_expr->get_aggtype() == kMAX ||
878  agg_expr->get_aggtype() == kAPPROX_COUNT_DISTINCT) {
879  return false;
880  }
881  if (agg_expr->get_arg()) {
882  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
883  if (arg_ti.is_fp()) {
884  return false;
885  }
886  auto expr_range_info = getExprRangeInfo(agg_expr->get_arg());
887  // TODO(adb): QMD not actually initialized here?
888  if ((!(expr_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
889  /* query_mem_desc.getGroupbyColCount() == 1 */ false) ||
890  expr_range_info.has_nulls) &&
891  order_entry.is_desc == order_entry.nulls_first) {
892  return false;
893  }
894  }
895  const auto& target_ti = target_expr->get_type_info();
896  CHECK(!target_ti.is_buffer());
897  if (!target_ti.is_integer()) {
898  return false;
899  }
900  }
901  return true;
902 }
903 
904 GroupByAndAggregate::DiamondCodegen::DiamondCodegen(
905  llvm::Value* cond,
906  Executor* executor,
907  const bool chain_to_next,
908  const std::string& label_prefix,
909  DiamondCodegen* parent,
910  const bool share_false_edge_with_parent)
911  : executor_(executor), chain_to_next_(chain_to_next), parent_(parent) {
912  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
913  if (parent_) {
915  }
916  cond_true_ = llvm::BasicBlock::Create(LL_CONTEXT, label_prefix + "_true", CUR_FUNC);
917  if (share_false_edge_with_parent) {
918  CHECK(parent);
919  orig_cond_false_ = cond_false_ = parent_->cond_false_;
920  } else {
921  cond_false_ = orig_cond_false_ =
922  llvm::BasicBlock::Create(LL_CONTEXT, label_prefix + "_false", CUR_FUNC);
923  }
924 
925  LL_BUILDER.CreateCondBr(cond, cond_true_, cond_false_);
926  LL_BUILDER.SetInsertPoint(cond_true_);
927 }
928 
929 void GroupByAndAggregate::DiamondCodegen::setChainToNext() {
930  CHECK(!parent_);
931  chain_to_next_ = true;
932 }
933 
934 void GroupByAndAggregate::DiamondCodegen::setFalseTarget(llvm::BasicBlock* cond_false) {
935  CHECK(!parent_ || orig_cond_false_ != parent_->cond_false_);
936  cond_false_ = cond_false;
937 }
938 
939 GroupByAndAggregate::DiamondCodegen::~DiamondCodegen() {
940  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
941  if (parent_ && orig_cond_false_ != parent_->cond_false_) {
942  LL_BUILDER.CreateBr(parent_->cond_false_);
943  } else if (chain_to_next_) {
944  LL_BUILDER.CreateBr(cond_false_);
945  }
946  if (!parent_ || (!chain_to_next_ && cond_false_ != parent_->cond_false_)) {
947  LL_BUILDER.SetInsertPoint(orig_cond_false_);
948  }
949 }
950 
951 bool GroupByAndAggregate::codegen(llvm::Value* filter_result,
952  llvm::BasicBlock* sc_false,
953  const QueryMemoryDescriptor& query_mem_desc,
954  const CompilationOptions& co,
955  const GpuSharedMemoryContext& gpu_smem_context) {
956  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
957  CHECK(filter_result);
958 
959  bool can_return_error = false;
960  llvm::BasicBlock* filter_false{nullptr};
961 
962  {
963  const bool is_group_by = !ra_exe_unit_.groupby_exprs.empty();
964 
965  if (executor_->isArchMaxwell(co.device_type)) {
967  }
968  DiamondCodegen filter_cfg(filter_result,
969  executor_,
970  !is_group_by || query_mem_desc.usesGetGroupValueFast(),
971  "filter", // filter_true and filter_false basic blocks
972  nullptr,
973  false);
974  filter_false = filter_cfg.cond_false_;
975 
976  if (is_group_by) {
978  !query_mem_desc.useStreamingTopN()) {
979  const auto crt_matched = get_arg_by_name(ROW_FUNC, "crt_matched");
980  LL_BUILDER.CreateStore(LL_INT(int32_t(1)), crt_matched);
981  auto total_matched_ptr = get_arg_by_name(ROW_FUNC, "total_matched");
982  llvm::Value* old_total_matched_val{nullptr};
983  if (query_mem_desc.threadsShareMemory()) {
984  old_total_matched_val =
985  LL_BUILDER.CreateAtomicRMW(llvm::AtomicRMWInst::Add,
986  total_matched_ptr,
987  LL_INT(int32_t(1)),
988  llvm::AtomicOrdering::Monotonic);
989  } else {
990  old_total_matched_val = LL_BUILDER.CreateLoad(total_matched_ptr);
991  LL_BUILDER.CreateStore(
992  LL_BUILDER.CreateAdd(old_total_matched_val, LL_INT(int32_t(1))),
993  total_matched_ptr);
994  }
995  auto old_total_matched_ptr = get_arg_by_name(ROW_FUNC, "old_total_matched");
996  LL_BUILDER.CreateStore(old_total_matched_val, old_total_matched_ptr);
997  }
998 
999  auto agg_out_ptr_w_idx = codegenGroupBy(query_mem_desc, co, filter_cfg);
1000  if (query_mem_desc.usesGetGroupValueFast() ||
1001  query_mem_desc.getQueryDescriptionType() ==
1002  QueryDescriptionType::GroupByPerfectHash) {
1003  if (query_mem_desc.getGroupbyColCount() > 1) {
1004  filter_cfg.setChainToNext();
1005  }
1006  // Don't generate null checks if the group slot is guaranteed to be non-null,
1007  // as it's the case for get_group_value_fast* family.
1008  can_return_error = codegenAggCalls(
1009  agg_out_ptr_w_idx, {}, query_mem_desc, co, gpu_smem_context, filter_cfg);
1010  } else {
1011  {
1012  llvm::Value* nullcheck_cond{nullptr};
1013  if (query_mem_desc.didOutputColumnar()) {
1014  nullcheck_cond = LL_BUILDER.CreateICmpSGE(std::get<1>(agg_out_ptr_w_idx),
1015  LL_INT(int32_t(0)));
1016  } else {
1017  nullcheck_cond = LL_BUILDER.CreateICmpNE(
1018  std::get<0>(agg_out_ptr_w_idx),
1019  llvm::ConstantPointerNull::get(
1020  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)));
1021  }
1022  DiamondCodegen nullcheck_cfg(
1023  nullcheck_cond, executor_, false, "groupby_nullcheck", &filter_cfg, false);
1024  codegenAggCalls(
1025  agg_out_ptr_w_idx, {}, query_mem_desc, co, gpu_smem_context, filter_cfg);
1026  }
1027  can_return_error = true;
1028  if (query_mem_desc.getQueryDescriptionType() ==
1029  QueryDescriptionType::Projection &&
1030  query_mem_desc.useStreamingTopN()) {
1031  // Ignore rejection on pushing current row to top-K heap.
1032  LL_BUILDER.CreateRet(LL_INT(int32_t(0)));
1033  } else {
1034  CodeGenerator code_generator(executor_);
1035  LL_BUILDER.CreateRet(LL_BUILDER.CreateNeg(LL_BUILDER.CreateTrunc(
1036  // TODO(alex): remove the trunc once pos is converted to 32 bits
1037  code_generator.posArg(nullptr),
1038  get_int_type(32, LL_CONTEXT))));
1039  }
1040  }
1041  } else {
1042  if (ra_exe_unit_.estimator) {
1043  std::stack<llvm::BasicBlock*> array_loops;
1044  codegenEstimator(array_loops, filter_cfg, query_mem_desc, co);
1045  } else {
1046  auto arg_it = ROW_FUNC->arg_begin();
1047  std::vector<llvm::Value*> agg_out_vec;
1048  for (int32_t i = 0; i < get_agg_count(ra_exe_unit_.target_exprs); ++i) {
1049  agg_out_vec.push_back(&*arg_it++);
1050  }
1051  can_return_error = codegenAggCalls(std::make_tuple(nullptr, nullptr),
1052  agg_out_vec,
1053  query_mem_desc,
1054  co,
1055  gpu_smem_context,
1056  filter_cfg);
1057  }
1058  }
1059  }
1060 
1061  if (ra_exe_unit_.join_quals.empty()) {
1062  executor_->cgen_state_->ir_builder_.CreateRet(LL_INT(int32_t(0)));
1063  } else if (sc_false) {
1064  const auto saved_insert_block = LL_BUILDER.GetInsertBlock();
1065  LL_BUILDER.SetInsertPoint(sc_false);
1066  LL_BUILDER.CreateBr(filter_false);
1067  LL_BUILDER.SetInsertPoint(saved_insert_block);
1068  }
1069 
1070  return can_return_error;
1071 }
1072 
1073 llvm::Value* GroupByAndAggregate::codegenOutputSlot(
1074  llvm::Value* groups_buffer,
1075  const QueryMemoryDescriptor& query_mem_desc,
1076  const CompilationOptions& co,
1077  DiamondCodegen& diamond_codegen) {
1078  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1080  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1081  const auto group_expr = ra_exe_unit_.groupby_exprs.front();
1082  CHECK(!group_expr);
1083  if (!query_mem_desc.didOutputColumnar()) {
1084  CHECK_EQ(size_t(0), query_mem_desc.getRowSize() % sizeof(int64_t));
1085  }
1086  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1087  ? 0
1088  : query_mem_desc.getRowSize() / sizeof(int64_t);
1089  CodeGenerator code_generator(executor_);
1090  if (query_mem_desc.useStreamingTopN()) {
1091  const auto& only_order_entry = ra_exe_unit_.sort_info.order_entries.front();
1092  CHECK_GE(only_order_entry.tle_no, int(1));
1093  const size_t target_idx = only_order_entry.tle_no - 1;
1094  CHECK_LT(target_idx, ra_exe_unit_.target_exprs.size());
1095  const auto order_entry_expr = ra_exe_unit_.target_exprs[target_idx];
1096  const auto chosen_bytes =
1097  static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(target_idx));
1098  auto order_entry_lv = executor_->cgen_state_->castToTypeIn(
1099  code_generator.codegen(order_entry_expr, true, co).front(), chosen_bytes * 8);
1100  const auto n = ra_exe_unit_.sort_info.offset + ra_exe_unit_.sort_info.limit;
1101  std::string fname = "get_bin_from_k_heap";
1102  const auto& oe_ti = order_entry_expr->get_type_info();
1103  llvm::Value* null_key_lv = nullptr;
1104  if (oe_ti.is_integer() || oe_ti.is_decimal() || oe_ti.is_time()) {
1105  const size_t bit_width = order_entry_lv->getType()->getIntegerBitWidth();
1106  switch (bit_width) {
1107  case 32:
1108  null_key_lv = LL_INT(static_cast<int32_t>(inline_int_null_val(oe_ti)));
1109  break;
1110  case 64:
1111  null_key_lv = LL_INT(static_cast<int64_t>(inline_int_null_val(oe_ti)));
1112  break;
1113  default:
1114  CHECK(false);
1115  }
1116  fname += "_int" + std::to_string(bit_width) + "_t";
1117  } else {
1118  CHECK(oe_ti.is_fp());
1119  if (order_entry_lv->getType()->isDoubleTy()) {
1120  null_key_lv = LL_FP(static_cast<double>(inline_fp_null_val(oe_ti)));
1121  } else {
1122  null_key_lv = LL_FP(static_cast<float>(inline_fp_null_val(oe_ti)));
1123  }
1124  fname += order_entry_lv->getType()->isDoubleTy() ? "_double" : "_float";
1125  }
1126  const auto key_slot_idx =
1127  get_heap_key_slot_index(ra_exe_unit_.target_exprs, target_idx);
1128  return emitCall(
1129  fname,
1130  {groups_buffer,
1131  LL_INT(n),
1132  LL_INT(row_size_quad),
1133  LL_INT(static_cast<uint32_t>(query_mem_desc.getColOffInBytes(key_slot_idx))),
1134  LL_BOOL(only_order_entry.is_desc),
1135  LL_BOOL(!order_entry_expr->get_type_info().get_notnull()),
1136  LL_BOOL(only_order_entry.nulls_first),
1137  null_key_lv,
1138  order_entry_lv});
1139  } else {
1140  llvm::Value* output_buffer_entry_count_lv{nullptr};
1142  output_buffer_entry_count_lv =
1143  LL_BUILDER.CreateLoad(get_arg_by_name(ROW_FUNC, "max_matched"));
1144  CHECK(output_buffer_entry_count_lv);
1145  }
1146  const auto group_expr_lv =
1147  LL_BUILDER.CreateLoad(get_arg_by_name(ROW_FUNC, "old_total_matched"));
1148  std::vector<llvm::Value*> args{
1149  groups_buffer,
1150  output_buffer_entry_count_lv
1151  ? output_buffer_entry_count_lv
1152  : LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1153  group_expr_lv,
1154  code_generator.posArg(nullptr)};
1155  if (query_mem_desc.didOutputColumnar()) {
1156  const auto columnar_output_offset =
1157  emitCall("get_columnar_scan_output_offset", args);
1158  return columnar_output_offset;
1159  }
1160  args.push_back(LL_INT(row_size_quad));
1161  return emitCall("get_scan_output_slot", args);
1162  }
1163 }
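For the streaming top-N path above, the runtime function name is assembled from the ORDER BY key type; for example (assumed keys):
 // ORDER BY a BIGINT column -> fname == "get_bin_from_k_heap_int64_t",
 //   with null_key_lv holding the 64-bit integer null sentinel.
 // ORDER BY a FLOAT column  -> fname == "get_bin_from_k_heap_float",
 //   with null_key_lv holding the float null sentinel.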
1164 
1165 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenGroupBy(
1166  const QueryMemoryDescriptor& query_mem_desc,
1167  const CompilationOptions& co,
1168  DiamondCodegen& diamond_codegen) {
1169  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1170  auto arg_it = ROW_FUNC->arg_begin();
1171  auto groups_buffer = arg_it++;
1172 
1173  std::stack<llvm::BasicBlock*> array_loops;
1174 
1175  // TODO(Saman): move this logic outside of this function.
1176  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1177  if (query_mem_desc.didOutputColumnar()) {
1178  return std::make_tuple(
1179  &*groups_buffer,
1180  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen));
1181  } else {
1182  return std::make_tuple(
1183  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen),
1184  nullptr);
1185  }
1186  }
1187 
1188  CHECK(query_mem_desc.getQueryDescriptionType() ==
1189  QueryDescriptionType::GroupByBaselineHash ||
1190  query_mem_desc.getQueryDescriptionType() ==
1191  QueryDescriptionType::GroupByPerfectHash);
1192 
1193  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1194  ? 0
1195  : query_mem_desc.getRowSize() / sizeof(int64_t);
1196 
1197  const auto col_width_size = query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1198  ? sizeof(int64_t)
1199  : query_mem_desc.getEffectiveKeyWidth();
1200  // for multi-column group by
1201  llvm::Value* group_key = nullptr;
1202  llvm::Value* key_size_lv = nullptr;
1203 
1204  if (!query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1205  key_size_lv = LL_INT(static_cast<int32_t>(query_mem_desc.getGroupbyColCount()));
1206  if (query_mem_desc.getQueryDescriptionType() ==
1207  QueryDescriptionType::GroupByPerfectHash) {
1208  group_key =
1209  LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1210  } else if (query_mem_desc.getQueryDescriptionType() ==
1211  QueryDescriptionType::GroupByBaselineHash) {
1212  group_key =
1213  col_width_size == sizeof(int32_t)
1214  ? LL_BUILDER.CreateAlloca(llvm::Type::getInt32Ty(LL_CONTEXT), key_size_lv)
1215  : LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1216  }
1217  CHECK(group_key);
1218  CHECK(key_size_lv);
1219  }
1220 
1221  int32_t subkey_idx = 0;
1222  CHECK(query_mem_desc.getGroupbyColCount() == ra_exe_unit_.groupby_exprs.size());
1223  for (const auto& group_expr : ra_exe_unit_.groupby_exprs) {
1224  const auto col_range_info = getExprRangeInfo(group_expr.get());
1225  const auto translated_null_value = static_cast<int64_t>(
1226  query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1227  ? checked_int64_t(query_mem_desc.getMaxVal()) +
1228  (query_mem_desc.getBucket() ? query_mem_desc.getBucket() : 1)
1229  : checked_int64_t(col_range_info.max) +
1230  (col_range_info.bucket ? col_range_info.bucket : 1));
1231 
1232  const bool col_has_nulls =
1233  query_mem_desc.getQueryDescriptionType() ==
1234  QueryDescriptionType::GroupByPerfectHash
1235  ? (query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1236  ? query_mem_desc.hasNulls()
1237  : col_range_info.has_nulls)
1238  : false;
1239 
1240  const auto group_expr_lvs =
1241  executor_->groupByColumnCodegen(group_expr.get(),
1242  col_width_size,
1243  co,
1244  col_has_nulls,
1245  translated_null_value,
1246  diamond_codegen,
1247  array_loops,
1248  query_mem_desc.threadsShareMemory());
1249  const auto group_expr_lv = group_expr_lvs.translated_value;
1250  if (query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1251  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1252  return codegenSingleColumnPerfectHash(query_mem_desc,
1253  co,
1254  &*groups_buffer,
1255  group_expr_lv,
1256  group_expr_lvs.original_value,
1257  row_size_quad);
1258  } else {
1259  // store the sub-key to the buffer
1260  LL_BUILDER.CreateStore(group_expr_lv,
1261  LL_BUILDER.CreateGEP(group_key, LL_INT(subkey_idx++)));
1262  }
1263  }
1264  if (query_mem_desc.getQueryDescriptionType() ==
1265  QueryDescriptionType::GroupByPerfectHash) {
1266  CHECK(ra_exe_unit_.groupby_exprs.size() != 1);
1267  return codegenMultiColumnPerfectHash(
1268  &*groups_buffer, group_key, key_size_lv, query_mem_desc, row_size_quad);
1269  } else if (query_mem_desc.getQueryDescriptionType() ==
1270  QueryDescriptionType::GroupByBaselineHash) {
1271  return codegenMultiColumnBaselineHash(co,
1272  &*groups_buffer,
1273  group_key,
1274  key_size_lv,
1275  query_mem_desc,
1276  col_width_size,
1277  row_size_quad);
1278  }
1279  CHECK(false);
1280  return std::make_tuple(nullptr, nullptr);
1281 }
1282 
1283 std::tuple<llvm::Value*, llvm::Value*>
1284 GroupByAndAggregate::codegenSingleColumnPerfectHash(
1285  const QueryMemoryDescriptor& query_mem_desc,
1286  const CompilationOptions& co,
1287  llvm::Value* groups_buffer,
1288  llvm::Value* group_expr_lv_translated,
1289  llvm::Value* group_expr_lv_original,
1290  const int32_t row_size_quad) {
1291  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1292  CHECK(query_mem_desc.usesGetGroupValueFast());
1293  std::string get_group_fn_name{query_mem_desc.didOutputColumnar()
1294  ? "get_columnar_group_bin_offset"
1295  : "get_group_value_fast"};
1296  if (!query_mem_desc.didOutputColumnar() && query_mem_desc.hasKeylessHash()) {
1297  get_group_fn_name += "_keyless";
1298  }
1299  if (query_mem_desc.interleavedBins(co.device_type)) {
1300  CHECK(!query_mem_desc.didOutputColumnar());
1301  CHECK(query_mem_desc.hasKeylessHash());
1302  get_group_fn_name += "_semiprivate";
1303  }
1304  std::vector<llvm::Value*> get_group_fn_args{&*groups_buffer,
1305  &*group_expr_lv_translated};
1306  if (group_expr_lv_original && get_group_fn_name == "get_group_value_fast" &&
1307  query_mem_desc.mustUseBaselineSort()) {
1308  get_group_fn_name += "_with_original_key";
1309  get_group_fn_args.push_back(group_expr_lv_original);
1310  }
1311  get_group_fn_args.push_back(LL_INT(query_mem_desc.getMinVal()));
1312  get_group_fn_args.push_back(LL_INT(query_mem_desc.getBucket()));
1313  if (!query_mem_desc.hasKeylessHash()) {
1314  if (!query_mem_desc.didOutputColumnar()) {
1315  get_group_fn_args.push_back(LL_INT(row_size_quad));
1316  }
1317  } else {
1318  if (!query_mem_desc.didOutputColumnar()) {
1319  get_group_fn_args.push_back(LL_INT(row_size_quad));
1320  }
1321  if (query_mem_desc.interleavedBins(co.device_type)) {
1322  auto warp_idx = emitCall("thread_warp_idx", {LL_INT(executor_->warpSize())});
1323  get_group_fn_args.push_back(warp_idx);
1324  get_group_fn_args.push_back(LL_INT(executor_->warpSize()));
1325  }
1326  }
1327  if (get_group_fn_name == "get_columnar_group_bin_offset") {
1328  return std::make_tuple(&*groups_buffer,
1329  emitCall(get_group_fn_name, get_group_fn_args));
1330  }
1331  return std::make_tuple(emitCall(get_group_fn_name, get_group_fn_args), nullptr);
1332 }
1333 
1334 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenMultiColumnPerfectHash(
1335  llvm::Value* groups_buffer,
1336  llvm::Value* group_key,
1337  llvm::Value* key_size_lv,
1338  const QueryMemoryDescriptor& query_mem_desc,
1339  const int32_t row_size_quad) {
1340  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1341  CHECK(query_mem_desc.getQueryDescriptionType() ==
1342  QueryDescriptionType::GroupByPerfectHash);
1343  // compute the index (perfect hash)
1344  auto perfect_hash_func = codegenPerfectHashFunction();
1345  auto hash_lv =
1346  LL_BUILDER.CreateCall(perfect_hash_func, std::vector<llvm::Value*>{group_key});
1347 
1348  if (query_mem_desc.didOutputColumnar()) {
1349  if (!query_mem_desc.hasKeylessHash()) {
1350  const std::string set_matching_func_name{
1351  "set_matching_group_value_perfect_hash_columnar"};
1352  const std::vector<llvm::Value*> set_matching_func_arg{
1353  groups_buffer,
1354  hash_lv,
1355  group_key,
1356  key_size_lv,
1357  llvm::ConstantInt::get(get_int_type(32, LL_CONTEXT),
1358  query_mem_desc.getEntryCount())};
1359  emitCall(set_matching_func_name, set_matching_func_arg);
1360  }
1361  return std::make_tuple(groups_buffer, hash_lv);
1362  } else {
1363  if (query_mem_desc.hasKeylessHash()) {
1364  return std::make_tuple(emitCall("get_matching_group_value_perfect_hash_keyless",
1365  {groups_buffer, hash_lv, LL_INT(row_size_quad)}),
1366  nullptr);
1367  } else {
1368  return std::make_tuple(
1369  emitCall(
1370  "get_matching_group_value_perfect_hash",
1371  {groups_buffer, hash_lv, group_key, key_size_lv, LL_INT(row_size_quad)}),
1372  nullptr);
1373  }
1374  }
1375 }
1376 
1377 std::tuple<llvm::Value*, llvm::Value*>
1378 GroupByAndAggregate::codegenMultiColumnBaselineHash(
1379  const CompilationOptions& co,
1380  llvm::Value* groups_buffer,
1381  llvm::Value* group_key,
1382  llvm::Value* key_size_lv,
1383  const QueryMemoryDescriptor& query_mem_desc,
1384  const size_t key_width,
1385  const int32_t row_size_quad) {
1386  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1387  if (group_key->getType() != llvm::Type::getInt64PtrTy(LL_CONTEXT)) {
1388  CHECK(key_width == sizeof(int32_t));
1389  group_key =
1390  LL_BUILDER.CreatePointerCast(group_key, llvm::Type::getInt64PtrTy(LL_CONTEXT));
1391  }
1392  std::vector<llvm::Value*> func_args{
1393  groups_buffer,
1394  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1395  &*group_key,
1396  &*key_size_lv,
1397  LL_INT(static_cast<int32_t>(key_width))};
1398  std::string func_name{"get_group_value"};
1399  if (query_mem_desc.didOutputColumnar()) {
1400  func_name += "_columnar_slot";
1401  } else {
1402  func_args.push_back(LL_INT(row_size_quad));
1403  }
1404  if (co.with_dynamic_watchdog) {
1405  func_name += "_with_watchdog";
1406  }
1407  if (query_mem_desc.didOutputColumnar()) {
1408  return std::make_tuple(groups_buffer, emitCall(func_name, func_args));
1409  } else {
1410  return std::make_tuple(emitCall(func_name, func_args), nullptr);
1411  }
1412 }
1413 
1414 llvm::Function* GroupByAndAggregate::codegenPerfectHashFunction() {
1415  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1416  CHECK_GT(ra_exe_unit_.groupby_exprs.size(), size_t(1));
1417  auto ft = llvm::FunctionType::get(
1418  get_int_type(32, LL_CONTEXT),
1419  std::vector<llvm::Type*>{llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)},
1420  false);
1421  auto key_hash_func = llvm::Function::Create(ft,
1422  llvm::Function::ExternalLinkage,
1423  "perfect_key_hash",
1424  executor_->cgen_state_->module_);
1425  executor_->cgen_state_->helper_functions_.push_back(key_hash_func);
1426  mark_function_always_inline(key_hash_func);
1427  auto& key_buff_arg = *key_hash_func->args().begin();
1428  llvm::Value* key_buff_lv = &key_buff_arg;
1429  auto bb = llvm::BasicBlock::Create(LL_CONTEXT, "entry", key_hash_func);
1430  llvm::IRBuilder<> key_hash_func_builder(bb);
1431  llvm::Value* hash_lv{llvm::ConstantInt::get(get_int_type(64, LL_CONTEXT), 0)};
1432  std::vector<int64_t> cardinalities;
1433  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1434  auto col_range_info = getExprRangeInfo(groupby_expr.get());
1435  CHECK(col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash);
1436  cardinalities.push_back(getBucketedCardinality(col_range_info));
1437  }
1438  size_t dim_idx = 0;
1439  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1440  auto key_comp_lv = key_hash_func_builder.CreateLoad(
1441  key_hash_func_builder.CreateGEP(key_buff_lv, LL_INT(dim_idx)));
1442  auto col_range_info = getExprRangeInfo(groupby_expr.get());
1443  auto crt_term_lv =
1444  key_hash_func_builder.CreateSub(key_comp_lv, LL_INT(col_range_info.min));
1445  if (col_range_info.bucket) {
1446  crt_term_lv =
1447  key_hash_func_builder.CreateSDiv(crt_term_lv, LL_INT(col_range_info.bucket));
1448  }
1449  for (size_t prev_dim_idx = 0; prev_dim_idx < dim_idx; ++prev_dim_idx) {
1450  crt_term_lv = key_hash_func_builder.CreateMul(crt_term_lv,
1451  LL_INT(cardinalities[prev_dim_idx]));
1452  }
1453  hash_lv = key_hash_func_builder.CreateAdd(hash_lv, crt_term_lv);
1454  ++dim_idx;
1455  }
1456  key_hash_func_builder.CreateRet(
1457  key_hash_func_builder.CreateTrunc(hash_lv, get_int_type(32, LL_CONTEXT)));
1458  return key_hash_func;
1459 }
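A standalone sketch of the index computed by the generated perfect_key_hash function, for two assumed group-by columns with bucket size 1 in both dimensions; this mirrors the codegen above but is illustration only:
 #include <cstdint>
 #include <iostream>
 #include <vector>

 // key[dim] is shifted by its column's minimum, then weighted by the product of
 // the cardinalities of all previous dimensions, as in the IR built above.
 int64_t perfect_hash(const std::vector<int64_t>& key,
                      const std::vector<int64_t>& mins,
                      const std::vector<int64_t>& cards) {
   int64_t hash = 0;
   for (size_t dim = 0; dim < key.size(); ++dim) {
     int64_t term = key[dim] - mins[dim];
     for (size_t prev = 0; prev < dim; ++prev) {
       term *= cards[prev];
     }
     hash += term;
   }
   return hash;
 }

 int main() {
   // dim 0: range [1, 10] -> cardinality 10; dim 1: range [0, 4] -> cardinality 5
   // key (7, 3): (7 - 1) + (3 - 0) * 10 = 36
   std::cout << perfect_hash({7, 3}, {1, 0}, {10, 5}) << '\n';  // prints 36
   return 0;
 }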
1460 
1461 llvm::Value* GroupByAndAggregate::convertNullIfAny(const SQLTypeInfo& arg_type,
1462  const TargetInfo& agg_info,
1463  llvm::Value* target) {
1464  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1465  const auto& agg_type = agg_info.sql_type;
1466  const size_t chosen_bytes = agg_type.get_size();
1467 
1468  bool need_conversion{false};
1469  llvm::Value* arg_null{nullptr};
1470  llvm::Value* agg_null{nullptr};
1471  llvm::Value* target_to_cast{target};
1472  if (arg_type.is_fp()) {
1473  arg_null = executor_->cgen_state_->inlineFpNull(arg_type);
1474  if (agg_type.is_fp()) {
1475  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1476  if (!static_cast<llvm::ConstantFP*>(arg_null)->isExactlyValue(
1477  static_cast<llvm::ConstantFP*>(agg_null)->getValueAPF())) {
1478  need_conversion = true;
1479  }
1480  } else {
1481  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
1482  return target;
1483  }
1484  } else {
1485  arg_null = executor_->cgen_state_->inlineIntNull(arg_type);
1486  if (agg_type.is_fp()) {
1487  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1488  need_conversion = true;
1489  target_to_cast = executor_->castToFP(target, arg_type, agg_type);
1490  } else {
1491  agg_null = executor_->cgen_state_->inlineIntNull(agg_type);
1492  if ((static_cast<llvm::ConstantInt*>(arg_null)->getBitWidth() !=
1493  static_cast<llvm::ConstantInt*>(agg_null)->getBitWidth()) ||
1494  (static_cast<llvm::ConstantInt*>(arg_null)->getValue() !=
1495  static_cast<llvm::ConstantInt*>(agg_null)->getValue())) {
1496  need_conversion = true;
1497  }
1498  }
1499  }
1500  if (need_conversion) {
1501  auto cmp = arg_type.is_fp() ? LL_BUILDER.CreateFCmpOEQ(target, arg_null)
1502  : LL_BUILDER.CreateICmpEQ(target, arg_null);
1503  return LL_BUILDER.CreateSelect(
1504  cmp,
1505  agg_null,
1506  executor_->cgen_state_->castToTypeIn(target_to_cast, chosen_bytes << 3));
1507  } else {
1508  return target;
1509  }
1510 }
1511 
1513  const Analyzer::WindowFunction* window_func,
1514  const QueryMemoryDescriptor& query_mem_desc,
1515  const CompilationOptions& co,
1516  DiamondCodegen& diamond_codegen) {
1517  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1518  const auto window_func_context =
1520  if (window_func_context && window_function_is_aggregate(window_func->getKind())) {
1521  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1522  ? 0
1523  : query_mem_desc.getRowSize() / sizeof(int64_t);
1524  auto arg_it = ROW_FUNC->arg_begin();
1525  auto groups_buffer = arg_it++;
1526  CodeGenerator code_generator(executor_);
1527  auto window_pos_lv = code_generator.codegenWindowPosition(
1528  window_func_context, code_generator.posArg(nullptr));
1529  const auto pos_in_window =
1530  LL_BUILDER.CreateTrunc(window_pos_lv, get_int_type(32, LL_CONTEXT));
1531  llvm::Value* entry_count_lv =
1532  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount()));
1533  std::vector<llvm::Value*> args{
1534  &*groups_buffer, entry_count_lv, pos_in_window, code_generator.posArg(nullptr)};
1535  if (query_mem_desc.didOutputColumnar()) {
1536  const auto columnar_output_offset =
1537  emitCall("get_columnar_scan_output_offset", args);
1538  return LL_BUILDER.CreateSExt(columnar_output_offset, get_int_type(64, LL_CONTEXT));
1539  }
1540  args.push_back(LL_INT(row_size_quad));
1541  return emitCall("get_scan_output_slot", args);
1542  }
1543  auto arg_it = ROW_FUNC->arg_begin();
1544  auto groups_buffer = arg_it++;
1545  return codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen);
1546 }
1547 
1548 bool GroupByAndAggregate::codegenAggCalls(
1549  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx_in,
1550  const std::vector<llvm::Value*>& agg_out_vec,
1551  const QueryMemoryDescriptor& query_mem_desc,
1552  const CompilationOptions& co,
1553  const GpuSharedMemoryContext& gpu_smem_context,
1554  DiamondCodegen& diamond_codegen) {
1555  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1556  auto agg_out_ptr_w_idx = agg_out_ptr_w_idx_in;
1557  // TODO(alex): unify the two cases, the output for non-group by queries
1558  // should be a contiguous buffer
1559  const bool is_group_by = std::get<0>(agg_out_ptr_w_idx);
1560  bool can_return_error = false;
1561  if (is_group_by) {
1562  CHECK(agg_out_vec.empty());
1563  } else {
1564  CHECK(!agg_out_vec.empty());
1565  }
1566 
1567  // output buffer is casted into a byte stream to be able to handle data elements of
1568  // different sizes (only used when actual column width sizes are used)
1569  llvm::Value* output_buffer_byte_stream{nullptr};
1570  llvm::Value* out_row_idx{nullptr};
1571  if (query_mem_desc.didOutputColumnar() && !g_cluster &&
1572  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1573  output_buffer_byte_stream = LL_BUILDER.CreateBitCast(
1574  std::get<0>(agg_out_ptr_w_idx),
1575  llvm::PointerType::get(llvm::Type::getInt8Ty(LL_CONTEXT), 0));
1576  output_buffer_byte_stream->setName("out_buff_b_stream");
1577  CHECK(std::get<1>(agg_out_ptr_w_idx));
1578  out_row_idx = LL_BUILDER.CreateZExt(std::get<1>(agg_out_ptr_w_idx),
1579  llvm::Type::getInt64Ty(LL_CONTEXT));
1580  out_row_idx->setName("out_row_idx");
1581  }
1582 
1583  TargetExprCodegenBuilder target_builder(query_mem_desc, ra_exe_unit_, is_group_by);
1584  for (size_t target_idx = 0; target_idx < ra_exe_unit_.target_exprs.size();
1585  ++target_idx) {
1586  auto target_expr = ra_exe_unit_.target_exprs[target_idx];
1587  CHECK(target_expr);
1588 
1589  target_builder(target_expr, executor_, co);
1590  }
1591 
1592  target_builder.codegen(this,
1593  executor_,
1594  query_mem_desc,
1595  co,
1596  gpu_smem_context,
1597  agg_out_ptr_w_idx,
1598  agg_out_vec,
1599  output_buffer_byte_stream,
1600  out_row_idx,
1601  diamond_codegen);
1602 
1603  for (auto target_expr : ra_exe_unit_.target_exprs) {
1604  CHECK(target_expr);
1605  executor_->plan_state_->isLazyFetchColumn(target_expr);
1606  }
1607 
1608  return can_return_error;
1609 }
1610 
1611 /**
1612  * @brief: returns the pointer to where the aggregation should be stored.
1613  */
1614 llvm::Value* GroupByAndAggregate::codegenAggColumnPtr(
1615  llvm::Value* output_buffer_byte_stream,
1616  llvm::Value* out_row_idx,
1617  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx,
1618  const QueryMemoryDescriptor& query_mem_desc,
1619  const size_t chosen_bytes,
1620  const size_t agg_out_off,
1621  const size_t target_idx) {
1622  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1623  llvm::Value* agg_col_ptr{nullptr};
1624  if (query_mem_desc.didOutputColumnar()) {
1625  // TODO(Saman): remove the second columnar branch, and support all query description
1626  // types through the first branch. Then, input arguments should also be cleaned up
1627  if (!g_cluster &&
1628  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1629  CHECK(chosen_bytes == 1 || chosen_bytes == 2 || chosen_bytes == 4 ||
1630  chosen_bytes == 8);
1631  CHECK(output_buffer_byte_stream);
1632  CHECK(out_row_idx);
1633  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1634  // multiplying by chosen_bytes, i.e., << log2(chosen_bytes)
1635  auto out_per_col_byte_idx =
1636  LL_BUILDER.CreateShl(out_row_idx, __builtin_ffs(chosen_bytes) - 1);
1637  auto byte_offset = LL_BUILDER.CreateAdd(out_per_col_byte_idx,
1638  LL_INT(static_cast<int64_t>(col_off)));
1639  byte_offset->setName("out_byte_off_target_" + std::to_string(target_idx));
1640  auto output_ptr = LL_BUILDER.CreateGEP(output_buffer_byte_stream, byte_offset);
1641  agg_col_ptr = LL_BUILDER.CreateBitCast(
1642  output_ptr,
1643  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1644  agg_col_ptr->setName("out_ptr_target_" + std::to_string(target_idx));
1645  } else {
1646  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1647  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1648  col_off /= chosen_bytes;
1649  CHECK(std::get<1>(agg_out_ptr_w_idx));
1650  auto offset = LL_BUILDER.CreateAdd(std::get<1>(agg_out_ptr_w_idx), LL_INT(col_off));
1651  agg_col_ptr = LL_BUILDER.CreateGEP(
1652  LL_BUILDER.CreateBitCast(
1653  std::get<0>(agg_out_ptr_w_idx),
1654  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0)),
1655  offset);
1656  }
1657  } else {
1658  uint32_t col_off = query_mem_desc.getColOnlyOffInBytes(agg_out_off);
1659  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1660  col_off /= chosen_bytes;
1661  agg_col_ptr = LL_BUILDER.CreateGEP(
1662  LL_BUILDER.CreateBitCast(
1663  std::get<0>(agg_out_ptr_w_idx),
1664  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0)),
1665  LL_INT(col_off));
1666  }
1667  CHECK(agg_col_ptr);
1668  return agg_col_ptr;
1669 }
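// Illustrative sketch: in the columnar branch above, the shift by
// __builtin_ffs(chosen_bytes) - 1 multiplies the row index by the power-of-two slot
// width, and the column's starting byte offset is then added. The sample values in the
// static_assert are illustrative only.
#include <cstddef>

namespace {
constexpr size_t columnar_slot_byte_offset_sketch(const size_t out_row_idx,
                                                  const size_t chosen_bytes,
                                                  const size_t col_off_in_bytes) {
  // chosen_bytes is 1, 2, 4 or 8 (see the CHECK above), so this is equivalent to
  // (out_row_idx << log2(chosen_bytes)) + col_off_in_bytes.
  return out_row_idx * chosen_bytes + col_off_in_bytes;
}
// Example: row 10 of a 4-byte column whose data starts 256 bytes into the buffer.
static_assert(columnar_slot_byte_offset_sketch(10, 4, 256) == 296, "");
}  // namespace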
1670 
1671 void GroupByAndAggregate::codegenEstimator(
1672  std::stack<llvm::BasicBlock*>& array_loops,
1673  GroupByAndAggregate::DiamondCodegen& diamond_codegen,
1674  const QueryMemoryDescriptor& query_mem_desc,
1675  const CompilationOptions& co) {
1676  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1677  const auto& estimator_arg = ra_exe_unit_.estimator->getArgument();
1678  auto estimator_comp_count_lv = LL_INT(static_cast<int32_t>(estimator_arg.size()));
1679  auto estimator_key_lv = LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT),
1680  estimator_comp_count_lv);
1681  int32_t subkey_idx = 0;
1682  for (const auto& estimator_arg_comp : estimator_arg) {
1683  const auto estimator_arg_comp_lvs =
1684  executor_->groupByColumnCodegen(estimator_arg_comp.get(),
1685  query_mem_desc.getEffectiveKeyWidth(),
1686  co,
1687  false,
1688  0,
1689  diamond_codegen,
1690  array_loops,
1691  true);
1692  CHECK(!estimator_arg_comp_lvs.original_value);
1693  const auto estimator_arg_comp_lv = estimator_arg_comp_lvs.translated_value;
1694  // store the sub-key to the buffer
1695  LL_BUILDER.CreateStore(estimator_arg_comp_lv,
1696  LL_BUILDER.CreateGEP(estimator_key_lv, LL_INT(subkey_idx++)));
1697  }
1698  const auto int8_ptr_ty = llvm::PointerType::get(get_int_type(8, LL_CONTEXT), 0);
1699  const auto bitmap = LL_BUILDER.CreateBitCast(&*ROW_FUNC->arg_begin(), int8_ptr_ty);
1700  const auto key_bytes = LL_BUILDER.CreateBitCast(estimator_key_lv, int8_ptr_ty);
1701  const auto estimator_comp_bytes_lv =
1702  LL_INT(static_cast<int32_t>(estimator_arg.size() * sizeof(int64_t)));
1703  const auto bitmap_size_lv =
1704  LL_INT(static_cast<uint32_t>(ra_exe_unit_.estimator->getBufferSize()));
1705  emitCall(ra_exe_unit_.estimator->getRuntimeFunctionName(),
1706  {bitmap, &*bitmap_size_lv, key_bytes, &*estimator_comp_bytes_lv});
1707 }
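// Illustrative sketch: the IR emitted above packs each group-by component into a
// contiguous buffer of 64-bit sub-keys and passes that buffer, its byte size
// (component count * sizeof(int64_t)) and the estimator bitmap to the estimator's
// runtime function. The struct and function names below are illustrative, not engine
// APIs.
#include <cstddef>
#include <cstdint>
#include <vector>

namespace {
struct EstimatorKeySketch {
  std::vector<int64_t> sub_keys;
  size_t size_in_bytes() const { return sub_keys.size() * sizeof(int64_t); }
};

inline EstimatorKeySketch pack_estimator_key_sketch(
    const std::vector<int64_t>& group_by_components) {
  EstimatorKeySketch key;
  key.sub_keys.reserve(group_by_components.size());
  for (const auto component : group_by_components) {
    key.sub_keys.push_back(component);  // one 64-bit sub-key per estimator argument
  }
  return key;
}
}  // namespace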
1708 
1709 extern "C" RUNTIME_EXPORT void agg_count_distinct(int64_t* agg, const int64_t val) {
1710  reinterpret_cast<std::set<int64_t>*>(*agg)->insert(val);
1711 }
1712 
1713 extern "C" RUNTIME_EXPORT void agg_count_distinct_skip_val(int64_t* agg,
1714  const int64_t val,
1715  const int64_t skip_val) {
1716  if (val != skip_val) {
1717  agg_count_distinct(agg, val);
1718  }
1719 }
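// Illustrative usage sketch for the two runtime stubs above: *agg holds a pointer to
// a std::set<int64_t> owned by the result-set machinery; a local set stands in for it
// here, and the null sentinel is a made-up value.
#include <cassert>
#include <cstdint>
#include <set>

namespace {
inline void count_distinct_stub_usage_sketch() {
  std::set<int64_t> distinct_values;
  int64_t agg_slot = reinterpret_cast<int64_t>(&distinct_values);
  const int64_t null_sentinel = -9999;  // hypothetical skip value for NULL inputs
  const int64_t values[] = {1, 2, 2, null_sentinel};
  for (const int64_t val : values) {
    agg_count_distinct_skip_val(&agg_slot, val, null_sentinel);
  }
  assert(distinct_values.size() == 2);  // the skip value is not counted
}
}  // namespace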
1720 
1721 extern "C" RUNTIME_EXPORT void agg_approx_median(int64_t* agg, const double val) {
1722  auto* t_digest = reinterpret_cast<quantile::TDigest*>(*agg);
1723  t_digest->allocate();
1724  t_digest->add(val);
1725 }
1726 
1727 void GroupByAndAggregate::codegenCountDistinct(
1728  const size_t target_idx,
1729  const Analyzer::Expr* target_expr,
1730  std::vector<llvm::Value*>& agg_args,
1731  const QueryMemoryDescriptor& query_mem_desc,
1732  const ExecutorDeviceType device_type) {
1733  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1734  const auto agg_info = get_target_info(target_expr, g_bigint_count);
1735  const auto& arg_ti =
1736  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1737  if (arg_ti.is_fp()) {
1738  agg_args.back() = executor_->cgen_state_->ir_builder_.CreateBitCast(
1739  agg_args.back(), get_int_type(64, executor_->cgen_state_->context_));
1740  }
1741  const auto& count_distinct_descriptor =
1742  query_mem_desc.getCountDistinctDescriptor(target_idx);
1743  CHECK(count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid);
1744  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
1745  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1746  agg_args.push_back(LL_INT(int32_t(count_distinct_descriptor.bitmap_sz_bits)));
1747  if (device_type == ExecutorDeviceType::GPU) {
1748  const auto base_dev_addr = getAdditionalLiteral(-1);
1749  const auto base_host_addr = getAdditionalLiteral(-2);
1750  agg_args.push_back(base_dev_addr);
1751  agg_args.push_back(base_host_addr);
1752  emitCall("agg_approximate_count_distinct_gpu", agg_args);
1753  } else {
1754  emitCall("agg_approximate_count_distinct", agg_args);
1755  }
1756  return;
1757  }
1758  std::string agg_fname{"agg_count_distinct"};
1759  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1760  agg_fname += "_bitmap";
1761  agg_args.push_back(LL_INT(static_cast<int64_t>(count_distinct_descriptor.min_val)));
1762  }
1763  if (agg_info.skip_null_val) {
1764  auto null_lv = executor_->cgen_state_->castToTypeIn(
1765  (arg_ti.is_fp()
1766  ? static_cast<llvm::Value*>(executor_->cgen_state_->inlineFpNull(arg_ti))
1767  : static_cast<llvm::Value*>(executor_->cgen_state_->inlineIntNull(arg_ti))),
1768  64);
1769  null_lv = executor_->cgen_state_->ir_builder_.CreateBitCast(
1770  null_lv, get_int_type(64, executor_->cgen_state_->context_));
1771  agg_fname += "_skip_val";
1772  agg_args.push_back(null_lv);
1773  }
1774  if (device_type == ExecutorDeviceType::GPU) {
1775  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1776  agg_fname += "_gpu";
1777  const auto base_dev_addr = getAdditionalLiteral(-1);
1778  const auto base_host_addr = getAdditionalLiteral(-2);
1779  agg_args.push_back(base_dev_addr);
1780  agg_args.push_back(base_host_addr);
1781  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.sub_bitmap_count)));
1782  CHECK_EQ(size_t(0),
1783  count_distinct_descriptor.bitmapPaddedSizeBytes() %
1784  count_distinct_descriptor.sub_bitmap_count);
1785  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.bitmapPaddedSizeBytes() /
1786  count_distinct_descriptor.sub_bitmap_count)));
1787  }
1788  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1789  emitCall(agg_fname, agg_args);
1790  } else {
1791  executor_->cgen_state_->emitExternalCall(
1792  agg_fname, llvm::Type::getVoidTy(LL_CONTEXT), agg_args);
1793  }
1794 }
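// Illustrative sketch of the bitmap path targeted above (the "agg_count_distinct_bitmap*"
// family of runtime functions): a value is recorded by setting bit (val - min_val) in a
// byte-addressed, per-group bitmap. This models the scheme only; it is not the engine's
// runtime implementation.
#include <cstdint>
#include <vector>

namespace {
inline void bitmap_count_distinct_sketch(std::vector<int8_t>& bitmap,
                                         const int64_t val,
                                         const int64_t min_val) {
  const uint64_t bitmap_idx = static_cast<uint64_t>(val - min_val);
  bitmap[bitmap_idx >> 3] |= static_cast<int8_t>(1 << (bitmap_idx & 7));
}
}  // namespace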
1795 
1796 void GroupByAndAggregate::codegenApproxMedian(const size_t target_idx,
1797  const Analyzer::Expr* target_expr,
1798  std::vector<llvm::Value*>& agg_args,
1799  const QueryMemoryDescriptor& query_mem_desc,
1800  const ExecutorDeviceType device_type) {
1801  if (device_type == ExecutorDeviceType::GPU) {
1802  throw QueryMustRunOnCpu();
1803  }
1804  llvm::BasicBlock *calc, *skip;
1805  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1806  auto const arg_ti =
1807  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1808  bool const nullable = !arg_ti.get_notnull();
1809 
1810  auto* cs = executor_->cgen_state_.get();
1811  auto& irb = cs->ir_builder_;
1812  if (nullable) {
1813  auto* const null_value = cs->castToTypeIn(cs->inlineNull(arg_ti), 64);
1814  auto* const skip_cond = arg_ti.is_fp()
1815  ? irb.CreateFCmpOEQ(agg_args.back(), null_value)
1816  : irb.CreateICmpEQ(agg_args.back(), null_value);
1817  calc = llvm::BasicBlock::Create(cs->context_, "calc_approx_median");
1818  skip = llvm::BasicBlock::Create(cs->context_, "skip_approx_median");
1819  irb.CreateCondBr(skip_cond, skip, calc);
1820  cs->current_func_->getBasicBlockList().push_back(calc);
1821  irb.SetInsertPoint(calc);
1822  }
1823  if (!arg_ti.is_fp()) {
1824  auto const agg_info = get_target_info(target_expr, g_bigint_count);
1825  agg_args.back() = executor_->castToFP(agg_args.back(), arg_ti, agg_info.sql_type);
1826  }
1827  cs->emitExternalCall(
1828  "agg_approx_median", llvm::Type::getVoidTy(cs->context_), agg_args);
1829  if (nullable) {
1830  irb.CreateBr(skip);
1831  cs->current_func_->getBasicBlockList().push_back(skip);
1832  irb.SetInsertPoint(skip);
1833  }
1834 }
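// Illustrative sketch of the control flow generated above: NULL inputs branch to the
// skip block, everything else is converted to double and fed to the t-digest
// accumulator. The null sentinel and the accumulate callback are stand-ins for the
// engine's inline-null and agg_approx_median machinery.
#include <cstdint>
#include <functional>

namespace {
inline void approx_median_update_sketch(const int64_t raw_value,
                                        const int64_t null_sentinel,
                                        const std::function<void(double)>& accumulate) {
  if (raw_value == null_sentinel) {
    return;  // the "skip_approx_median" edge of the diamond
  }
  accumulate(static_cast<double>(raw_value));  // the "calc_approx_median" edge
}
}  // namespace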
1835 
1836 llvm::Value* GroupByAndAggregate::getAdditionalLiteral(const int32_t off) {
1837  CHECK_LT(off, 0);
1838  const auto lit_buff_lv = get_arg_by_name(ROW_FUNC, "literals");
1839  return LL_BUILDER.CreateLoad(LL_BUILDER.CreateGEP(
1840  LL_BUILDER.CreateBitCast(lit_buff_lv,
1841  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)),
1842  LL_INT(off)));
1843 }
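// Illustrative sketch of the load emitted above: the "literals" buffer is viewed as an
// array of 64-bit slots and additional literals live at negative offsets (-1, -2, ...)
// from it, e.g. the device/host base addresses pushed in codegenCountDistinct. The
// pointer handling below is for illustration only.
#include <cstdint>

namespace {
inline int64_t get_additional_literal_sketch(const int8_t* lit_buff, const int32_t off) {
  // off is negative, mirroring the CHECK_LT(off, 0) above.
  return reinterpret_cast<const int64_t*>(lit_buff)[off];
}
}  // namespace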
1844 
1845 std::vector<llvm::Value*> GroupByAndAggregate::codegenAggArg(
1846  const Analyzer::Expr* target_expr,
1847  const CompilationOptions& co) {
1848  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1849  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
1850  const auto func_expr = dynamic_cast<const Analyzer::FunctionOper*>(target_expr);
1851  const auto arr_expr = dynamic_cast<const Analyzer::ArrayExpr*>(target_expr);
1852 
1853  // TODO(alex): handle arrays uniformly?
1854  CodeGenerator code_generator(executor_);
1855  if (target_expr) {
1856  const auto& target_ti = target_expr->get_type_info();
1857  if (target_ti.is_buffer() &&
1858  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
1859  const auto target_lvs =
1860  agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
1861  : code_generator.codegen(
1862  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
1863  if (!func_expr && !arr_expr) {
1864  // Anything that goes through the chunk transport here was generated from a
1865  // source other than an ARRAY[] expression
1866  if (target_ti.is_bytes()) {
1867  CHECK_EQ(size_t(3), target_lvs.size());
1868  return {target_lvs[1], target_lvs[2]};
1869  }
1870  CHECK(target_ti.is_array());
1871  CHECK_EQ(size_t(1), target_lvs.size());
1872  CHECK(!agg_expr || agg_expr->get_aggtype() == kSAMPLE);
1873  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
1874  const auto i8p_ty =
1875  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
1876  const auto& elem_ti = target_ti.get_elem_type();
1877  return {
1878  executor_->cgen_state_->emitExternalCall(
1879  "array_buff",
1880  i8p_ty,
1881  {target_lvs.front(), code_generator.posArg(target_expr)}),
1882  executor_->cgen_state_->emitExternalCall(
1883  "array_size",
1884  i32_ty,
1885  {target_lvs.front(),
1886  code_generator.posArg(target_expr),
1887  executor_->cgen_state_->llInt(log2_bytes(elem_ti.get_logical_size()))})};
1888  } else {
1889  if (agg_expr) {
1890  throw std::runtime_error(
1891  "Using array[] operator as argument to an aggregate operator is not "
1892  "supported");
1893  }
1894  CHECK(func_expr || arr_expr);
1895  if (dynamic_cast<const Analyzer::FunctionOper*>(target_expr)) {
1896  CHECK_EQ(size_t(1), target_lvs.size());
1897  const auto prefix = target_ti.get_buffer_name();
1898  CHECK(target_ti.is_array() || target_ti.is_bytes());
1899  const auto target_lv = LL_BUILDER.CreateLoad(target_lvs[0]);
1900  // const auto target_lv_type = target_lvs[0]->getType();
1901  // CHECK(target_lv_type->isStructTy());
1902  // CHECK_EQ(target_lv_type->getNumContainedTypes(), 3u);
1903  const auto i8p_ty = llvm::PointerType::get(
1904  get_int_type(8, executor_->cgen_state_->context_), 0);
1905  const auto ptr = LL_BUILDER.CreatePointerCast(
1906  LL_BUILDER.CreateExtractValue(target_lv, 0), i8p_ty);
1907  const auto size = LL_BUILDER.CreateExtractValue(target_lv, 1);
1908  const auto null_flag = LL_BUILDER.CreateExtractValue(target_lv, 2);
1909  const auto nullcheck_ok_bb =
1910  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_nullcheck_ok_bb", CUR_FUNC);
1911  const auto nullcheck_fail_bb = llvm::BasicBlock::Create(
1912  LL_CONTEXT, prefix + "_nullcheck_fail_bb", CUR_FUNC);
1913 
1914  // TODO(adb): probably better to zext the bool
1915  const auto nullcheck = LL_BUILDER.CreateICmpEQ(
1916  null_flag, executor_->cgen_state_->llInt(static_cast<int8_t>(1)));
1917  LL_BUILDER.CreateCondBr(nullcheck, nullcheck_fail_bb, nullcheck_ok_bb);
1918 
1919  const auto ret_bb =
1920  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_return", CUR_FUNC);
1921  LL_BUILDER.SetInsertPoint(ret_bb);
1922  auto result_phi = LL_BUILDER.CreatePHI(i8p_ty, 2, prefix + "_ptr_return");
1923  result_phi->addIncoming(ptr, nullcheck_ok_bb);
1924  const auto null_arr_sentinel = LL_BUILDER.CreateIntToPtr(
1925  executor_->cgen_state_->llInt(static_cast<int8_t>(0)), i8p_ty);
1926  result_phi->addIncoming(null_arr_sentinel, nullcheck_fail_bb);
1927  LL_BUILDER.SetInsertPoint(nullcheck_ok_bb);
1928  executor_->cgen_state_->emitExternalCall(
1929  "register_buffer_with_executor_rsm",
1930  llvm::Type::getVoidTy(executor_->cgen_state_->context_),
1931  {executor_->cgen_state_->llInt(reinterpret_cast<int64_t>(executor_)), ptr});
1932  LL_BUILDER.CreateBr(ret_bb);
1933  LL_BUILDER.SetInsertPoint(nullcheck_fail_bb);
1934  LL_BUILDER.CreateBr(ret_bb);
1935 
1936  LL_BUILDER.SetInsertPoint(ret_bb);
1937  return {result_phi, size};
1938  }
1939  CHECK_EQ(size_t(2), target_lvs.size());
1940  return {target_lvs[0], target_lvs[1]};
1941  }
1942  }
1943  if (target_ti.is_geometry() &&
1944  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
1945  auto generate_coord_lvs =
1946  [&](auto* selected_target_expr,
1947  bool const fetch_columns) -> std::vector<llvm::Value*> {
1948  const auto target_lvs =
1949  code_generator.codegen(selected_target_expr, fetch_columns, co);
1950  const auto geo_uoper = dynamic_cast<const Analyzer::GeoUOper*>(target_expr);
1951  const auto geo_binoper = dynamic_cast<const Analyzer::GeoBinOper*>(target_expr);
1952  if (geo_uoper || geo_binoper) {
1953  CHECK(target_expr->get_type_info().is_geometry());
1954  CHECK_EQ(2 * static_cast<size_t>(target_ti.get_physical_coord_cols()),
1955  target_lvs.size());
1956  return target_lvs;
1957  }
1958  CHECK_EQ(static_cast<size_t>(target_ti.get_physical_coord_cols()),
1959  target_lvs.size());
1960 
1961  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
1962  const auto i8p_ty =
1963  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
1964  std::vector<llvm::Value*> coords;
1965  size_t ctr = 0;
1966  for (const auto& target_lv : target_lvs) {
1967  // TODO(adb): consider adding a utility to sqltypes so we can get the types of
1968  // the physical coords cols based on the sqltype (e.g. TINYINT for col 0, INT
1969  for col 1 for pols / mpolys, etc). Hardcoding for now. First array is the
1970  // coords array (TINYINT). Subsequent arrays are regular INT.
1971 
1972  const size_t elem_sz = ctr == 0 ? 1 : 4;
1973  ctr++;
1974  int32_t fixlen = -1;
1975  if (target_ti.get_type() == kPOINT) {
1976  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
1977  if (col_var) {
1978  const auto coords_cd = executor_->getPhysicalColumnDescriptor(col_var, 1);
1979  if (coords_cd && coords_cd->columnType.get_type() == kARRAY) {
1980  fixlen = coords_cd->columnType.get_size();
1981  }
1982  }
1983  }
1984  if (fixlen > 0) {
1985  coords.push_back(executor_->cgen_state_->emitExternalCall(
1986  "fast_fixlen_array_buff",
1987  i8p_ty,
1988  {target_lv, code_generator.posArg(selected_target_expr)}));
1989  coords.push_back(executor_->cgen_state_->llInt(int64_t(fixlen)));
1990  continue;
1991  }
1992  coords.push_back(executor_->cgen_state_->emitExternalCall(
1993  "array_buff",
1994  i8p_ty,
1995  {target_lv, code_generator.posArg(selected_target_expr)}));
1996  coords.push_back(executor_->cgen_state_->emitExternalCall(
1997  "array_size",
1998  i32_ty,
1999  {target_lv,
2000  code_generator.posArg(selected_target_expr),
2001  executor_->cgen_state_->llInt(log2_bytes(elem_sz))}));
2002  }
2003  return coords;
2004  };
2005 
2006  if (agg_expr) {
2007  return generate_coord_lvs(agg_expr->get_arg(), true);
2008  } else {
2009  return generate_coord_lvs(target_expr,
2010  !executor_->plan_state_->allow_lazy_fetch_);
2011  }
2012  }
2013  }
2014  return agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
2015  : code_generator.codegen(
2016  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
2017 }
2018 
2019 llvm::Value* GroupByAndAggregate::emitCall(const std::string& fname,
2020  const std::vector<llvm::Value*>& args) {
2021  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2022  return executor_->cgen_state_->emitCall(fname, args);
2023 }
2024 
2025 void GroupByAndAggregate::checkErrorCode(llvm::Value* retCode) {
2026  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2027  auto zero_const = llvm::ConstantInt::get(retCode->getType(), 0, true);
2028  auto rc_check_condition = executor_->cgen_state_->ir_builder_.CreateICmp(
2029  llvm::ICmpInst::ICMP_EQ, retCode, zero_const);
2030 
2031  executor_->cgen_state_->emitErrorCheck(rc_check_condition, retCode, "rc");
2032 }
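// Illustrative sketch of the guard emitted above: the runtime helper's return code is
// compared against zero and any non-zero value is routed to the error path. The
// reporting callback stands in for the generated error block.
#include <cstdint>
#include <functional>

namespace {
inline void check_error_code_sketch(const int32_t ret_code,
                                    const std::function<void(int32_t)>& report_error) {
  if (ret_code != 0) {
    report_error(ret_code);  // corresponds to the "rc" error check above
  }
}
}  // namespace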
2033 
2034 #undef CUR_FUNC
2035 #undef ROW_FUNC
2036 #undef LL_FP
2037 #undef LL_INT
2038 #undef LL_BOOL
2039 #undef LL_BUILDER
2040 #undef LL_CONTEXT
2041 
2042 size_t GroupByAndAggregate::shard_count_for_top_groups(
2043  const RelAlgExecutionUnit& ra_exe_unit,
2044  const Catalog_Namespace::Catalog& catalog) {
2045  if (ra_exe_unit.sort_info.order_entries.size() != 1 || !ra_exe_unit.sort_info.limit) {
2046  return 0;
2047  }
2048  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
2049  const auto grouped_col_expr =
2050  dynamic_cast<const Analyzer::ColumnVar*>(group_expr.get());
2051  if (!grouped_col_expr) {
2052  continue;
2053  }
2054  if (grouped_col_expr->get_table_id() <= 0) {
2055  return 0;
2056  }
2057  const auto td = catalog.getMetadataForTable(grouped_col_expr->get_table_id());
2058  if (td->shardedColumnId == grouped_col_expr->get_column_id()) {
2059  return td->nShards;
2060  }
2061  }
2062  return 0;
2063 }