OmniSciDB  bf83d84833
GroupByAndAggregate.cpp
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "GroupByAndAggregate.h"
18 #include "AggregateUtils.h"
19 
20 #include "CardinalityEstimator.h"
21 #include "CodeGenerator.h"
23 #include "ExpressionRange.h"
24 #include "ExpressionRewrite.h"
25 #include "GpuInitGroups.h"
26 #include "InPlaceSort.h"
28 #include "MaxwellCodegenPatch.h"
30 #include "TargetExprBuilder.h"
31 
32 #include "../CudaMgr/CudaMgr.h"
33 #include "../Shared/checked_alloc.h"
34 #include "../Utils/ChunkIter.h"
36 #include "Execute.h"
37 #include "QueryTemplateGenerator.h"
38 #include "RuntimeFunctions.h"
39 #include "StreamingTopN.h"
40 #include "TopKSort.h"
41 #include "WindowContext.h"
42 
43 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
44 
45 #include <cstring> // strcat()
46 #include <numeric>
47 #include <string_view>
48 #include <thread>
49 
50 bool g_cluster{false};
51 bool g_bigint_count{false};
53 extern size_t g_leaf_count;
54 
55 namespace {
56 
57 int32_t get_agg_count(const std::vector<Analyzer::Expr*>& target_exprs) {
58  int32_t agg_count{0};
59  for (auto target_expr : target_exprs) {
60  CHECK(target_expr);
61  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
62  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
63  const auto& ti = target_expr->get_type_info();
64  // TODO(pavan): or if is_geometry()
65  if (ti.is_buffer()) {
66  agg_count += 2;
67  } else if (ti.is_geometry()) {
68  agg_count += ti.get_physical_coord_cols() * 2;
69  } else {
70  ++agg_count;
71  }
72  continue;
73  }
74  if (agg_expr && agg_expr->get_aggtype() == kAVG) {
75  agg_count += 2;
76  } else {
77  ++agg_count;
78  }
79  }
80  return agg_count;
81 }
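// Illustrative example of the slot counting above: for the target list of
// SELECT dept, AVG(salary), COUNT(*) ... GROUP BY dept, the non-aggregate
// `dept` contributes 1 slot, AVG(salary) contributes 2 (sum and count), and
// COUNT(*) contributes 1, so get_agg_count() returns 4. An array (or
// none-encoded string) target likewise takes 2 slots, and a geometry target
// takes 2 per physical coordinate column.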
82 
83 bool expr_is_rowid(const Analyzer::Expr* expr, const Catalog_Namespace::Catalog& cat) {
 84  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
85  if (!col) {
86  return false;
87  }
88  const auto cd =
89  get_column_descriptor_maybe(col->get_column_id(), col->get_table_id(), cat);
90  if (!cd || !cd->isVirtualCol) {
91  return false;
92  }
93  CHECK_EQ("rowid", cd->columnName);
94  return true;
95 }
96 
97 bool has_count_distinct(const RelAlgExecutionUnit& ra_exe_unit) {
98  for (const auto& target_expr : ra_exe_unit.target_exprs) {
99  const auto agg_info = get_target_info(target_expr, g_bigint_count);
100  if (agg_info.is_agg && is_distinct_target(agg_info)) {
101  return true;
102  }
103  }
104  return false;
105 }
106 
107 bool is_column_range_too_big_for_perfect_hash(const ColRangeInfo& col_range_info,
 108  const int64_t max_entry_count) {
109  try {
110  return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
111  checked_int64_t(col_range_info.min)) >= max_entry_count;
112  } catch (...) {
113  return true;
114  }
115 }
116 
117 bool cardinality_estimate_less_than_column_range(const int64_t cardinality_estimate,
118  const ColRangeInfo& col_range_info) {
119  try {
120  // the cardinality estimate is the size of the baseline hash table. further penalize
121  // the baseline hash table by a factor of 2x due to overhead in computing baseline
122  // hash. This has the overall effect of penalizing baseline hash over perfect hash by
123  // 4x; i.e. if the cardinality of the filtered data is less than 25% of the entry
124  // count of the column, we use baseline hash on the filtered set
125  return checked_int64_t(cardinality_estimate) * 2 <
126  static_cast<int64_t>(checked_int64_t(col_range_info.max) -
127  checked_int64_t(col_range_info.min));
128  } catch (...) {
129  return false;
130  }
131 }
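// Worked example of the heuristic above (illustrative): for a column whose
// range spans 1,000,000 values, a filtered cardinality estimate of 200,000
// satisfies 200,000 * 2 < 1,000,000, so baseline hash is preferred for the
// filtered set; an estimate of 600,000 fails the test and perfect hash over
// the full range is kept.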
132 
133 } // namespace
134 
135 ColRangeInfo GroupByAndAggregate::getColRangeInfo() {
 136  // Use baseline layout more eagerly on the GPU if the query uses count distinct,
137  // because our HyperLogLog implementation is 4x less memory efficient on GPU.
138  // Technically, this only applies to APPROX_COUNT_DISTINCT, but in practice we
139  // can expect this to be true anyway for grouped queries since the precise version
140  // uses significantly more memory.
141  const int64_t baseline_threshold =
 142  has_count_distinct(ra_exe_unit_)
 143  ? (device_type_ == ExecutorDeviceType::GPU ? (Executor::baseline_threshold / 4)
 144  : Executor::baseline_threshold)
 145  : Executor::baseline_threshold;
 146  if (ra_exe_unit_.groupby_exprs.size() != 1) {
147  try {
148  checked_int64_t cardinality{1};
149  bool has_nulls{false};
150  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
151  auto col_range_info = getExprRangeInfo(groupby_expr.get());
152  if (col_range_info.hash_type_ != QueryDescriptionType::GroupByPerfectHash) {
153  // going through baseline hash if a non-integer type is encountered
154  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
155  }
156  auto crt_col_cardinality = getBucketedCardinality(col_range_info);
157  CHECK_GE(crt_col_cardinality, 0);
158  cardinality *= crt_col_cardinality;
159  if (col_range_info.has_nulls) {
160  has_nulls = true;
161  }
162  }
163  // For zero or high cardinalities, use baseline layout.
164  if (!cardinality || cardinality > baseline_threshold) {
165  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
166  }
 167  return {QueryDescriptionType::GroupByPerfectHash,
 168  0,
169  int64_t(cardinality),
170  0,
171  has_nulls};
172  } catch (...) { // overflow when computing cardinality
173  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
174  }
175  }
176  // For single column groupby on high timestamps, force baseline hash due to wide ranges
177  // we are likely to encounter when applying quals to the expression range
178  // TODO: consider allowing TIMESTAMP(9) (nanoseconds) with quals to use perfect hash if
179  // the range is small enough
180  if (ra_exe_unit_.groupby_exprs.front() &&
181  ra_exe_unit_.groupby_exprs.front()->get_type_info().is_high_precision_timestamp() &&
182  ra_exe_unit_.simple_quals.size() > 0) {
183  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
184  }
185  const auto col_range_info = getExprRangeInfo(ra_exe_unit_.groupby_exprs.front().get());
186  if (!ra_exe_unit_.groupby_exprs.front()) {
187  return col_range_info;
188  }
189  static const int64_t MAX_BUFFER_SIZE = 1 << 30;
190  const int64_t col_count =
 191  ra_exe_unit_.groupby_exprs.size() + ra_exe_unit_.target_exprs.size();
 192  int64_t max_entry_count = MAX_BUFFER_SIZE / (col_count * sizeof(int64_t));
 193  if (has_count_distinct(ra_exe_unit_)) {
194  max_entry_count = std::min(max_entry_count, baseline_threshold);
195  }
196  const auto& groupby_expr_ti = ra_exe_unit_.groupby_exprs.front()->get_type_info();
197  if (groupby_expr_ti.is_string() && !col_range_info.bucket) {
198  CHECK(groupby_expr_ti.get_compression() == kENCODING_DICT);
199 
200  const bool has_filters =
201  !ra_exe_unit_.quals.empty() || !ra_exe_unit_.simple_quals.empty();
202  if (has_filters &&
203  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
204  // if filters are present, we can use the filter to narrow the cardinality of the
205  // group by in the case of ranges too big for perfect hash. Otherwise, we are better
206  // off attempting perfect hash (since we know the range will be made of
207  // monotonically increasing numbers from min to max for dictionary encoded strings)
208  // and failing later due to excessive memory use.
209  // Check the conditions where baseline hash can provide a performance increase and
210  // return baseline hash (potentially forcing an estimator query) as the range type.
211  // Otherwise, return col_range_info which will likely be perfect hash, though could
212  // be baseline from a previous call of this function prior to the estimator query.
213  if (!ra_exe_unit_.sort_info.order_entries.empty()) {
214  // TODO(adb): allow some sorts to pass through this block by centralizing sort
215  // algorithm decision making
 216  if (has_count_distinct(ra_exe_unit_) &&
 217  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count)) {
218  // always use baseline hash for column range too big for perfect hash with count
219  // distinct descriptors. We will need 8GB of CPU memory minimum for the perfect
220  // hash group by in this case.
 221  return {QueryDescriptionType::GroupByBaselineHash,
 222  col_range_info.min,
223  col_range_info.max,
224  0,
225  col_range_info.has_nulls};
226  } else {
227  // use original col range for sort
228  return col_range_info;
229  }
230  }
231  // if filters are present and the filtered range is less than the cardinality of
232  // the column, consider baseline hash
 233  if (group_cardinality_estimation_ &&
 234  cardinality_estimate_less_than_column_range(*group_cardinality_estimation_,
 235  col_range_info)) {
 236  return {QueryDescriptionType::GroupByBaselineHash,
 237  col_range_info.min,
238  col_range_info.max,
239  0,
240  col_range_info.has_nulls};
241  }
242  }
243  } else if ((!expr_is_rowid(ra_exe_unit_.groupby_exprs.front().get(),
244  *executor_->catalog_)) &&
245  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count) &&
246  !col_range_info.bucket) {
 247  return {QueryDescriptionType::GroupByBaselineHash,
 248  col_range_info.min,
249  col_range_info.max,
250  0,
251  col_range_info.has_nulls};
252  }
253  return col_range_info;
254 }
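// Illustrative outcomes of the selection above: a single GROUP BY over a
// dictionary-encoded string column with no filters stays on perfect hash even
// for wide ranges (the ids run densely from min to max), a filtered GROUP BY
// over a BIGINT column whose range exceeds max_entry_count falls back to
// baseline hash, and a multi-column GROUP BY whose combined bucketed
// cardinality overflows or exceeds baseline_threshold is handled by
// GroupByBaselineHash as well.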
255 
256 ColRangeInfo GroupByAndAggregate::getExprRangeInfo(const Analyzer::Expr* expr) const {
 257  if (!expr) {
258  return {QueryDescriptionType::Projection, 0, 0, 0, false};
259  }
260 
261  const auto expr_range = getExpressionRange(
262  expr, query_infos_, executor_, boost::make_optional(ra_exe_unit_.simple_quals));
263  switch (expr_range.getType()) {
 264  case ExpressionRangeType::Integer: {
 265  if (expr_range.getIntMin() > expr_range.getIntMax()) {
266  return {
267  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
268  }
 269  return {QueryDescriptionType::GroupByPerfectHash,
 270  expr_range.getIntMin(),
271  expr_range.getIntMax(),
272  expr_range.getBucket(),
273  expr_range.hasNulls()};
274  }
 275  case ExpressionRangeType::Float:
 276  case ExpressionRangeType::Double: {
 277  if (expr_range.getFpMin() > expr_range.getFpMax()) {
278  return {
279  QueryDescriptionType::GroupByBaselineHash, 0, -1, 0, expr_range.hasNulls()};
280  }
281  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
282  }
 283  case ExpressionRangeType::Invalid:
 284  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
285  default:
286  CHECK(false);
287  }
288  CHECK(false);
289  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
290 }
291 
292 int64_t GroupByAndAggregate::getBucketedCardinality(const ColRangeInfo& col_range_info) {
 293  checked_int64_t crt_col_cardinality =
294  checked_int64_t(col_range_info.max) - checked_int64_t(col_range_info.min);
295  if (col_range_info.bucket) {
296  crt_col_cardinality /= col_range_info.bucket;
297  }
298  return static_cast<int64_t>(crt_col_cardinality +
299  (1 + (col_range_info.has_nulls ? 1 : 0)));
300 }
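// Worked example for the formula above (illustrative): min = 10,
// max = 1,000,000, bucket = 1,000 and has_nulls = true gives
// (1,000,000 - 10) / 1,000 + 1 + 1 = 1,001 entries -- the extra slot holds the
// translated null key.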
301 
302 #define LL_CONTEXT executor_->cgen_state_->context_
303 #define LL_BUILDER executor_->cgen_state_->ir_builder_
304 #define LL_BOOL(v) executor_->cgen_state_->llBool(v)
305 #define LL_INT(v) executor_->cgen_state_->llInt(v)
306 #define LL_FP(v) executor_->cgen_state_->llFp(v)
307 #define ROW_FUNC executor_->cgen_state_->row_func_
308 #define CUR_FUNC executor_->cgen_state_->current_func_
309 
310 GroupByAndAggregate::GroupByAndAggregate(
 311  Executor* executor,
312  const ExecutorDeviceType device_type,
313  const RelAlgExecutionUnit& ra_exe_unit,
314  const std::vector<InputTableInfo>& query_infos,
315  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
316  const std::optional<int64_t>& group_cardinality_estimation)
317  : executor_(executor)
318  , ra_exe_unit_(ra_exe_unit)
319  , query_infos_(query_infos)
320  , row_set_mem_owner_(row_set_mem_owner)
321  , device_type_(device_type)
322  , group_cardinality_estimation_(group_cardinality_estimation) {
323  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
324  if (!groupby_expr) {
325  continue;
326  }
327  const auto& groupby_ti = groupby_expr->get_type_info();
328  if (groupby_ti.is_bytes()) {
329  throw std::runtime_error(
330  "Cannot group by string columns which are not dictionary encoded.");
331  }
332  if (groupby_ti.is_buffer()) {
333  throw std::runtime_error("Group by buffer not supported");
334  }
335  if (groupby_ti.is_geometry()) {
336  throw std::runtime_error("Group by geometry not supported");
337  }
338  }
339 }
340 
341 int64_t GroupByAndAggregate::getShardedTopBucket(const ColRangeInfo& col_range_info,
 342  const size_t shard_count) const {
343  size_t device_count{0};
 344  if (device_type_ == ExecutorDeviceType::GPU) {
 345  auto cuda_mgr = executor_->getCatalog()->getDataMgr().getCudaMgr();
346  CHECK(cuda_mgr);
347  device_count = executor_->getCatalog()->getDataMgr().getCudaMgr()->getDeviceCount();
348  CHECK_GT(device_count, 0u);
349  }
350 
351  int64_t bucket{col_range_info.bucket};
352 
353  if (shard_count) {
354  CHECK(!col_range_info.bucket);
355  /*
356  when a node has fewer devices than shard count,
357  a) In a distributed setup, the minimum distance between two keys would be
358  device_count because shards are stored consecutively across the physical tables,
359  i.e if a shard column has values 0 to 9, and 3 shards on each leaf, then node 1
360  would have values: 0,1,2,6,7,8 and node 2 would have values: 3,4,5,9. If each leaf
361  node has only 1 device, in this case, all the keys from each node are loaded on
362  the device each.
363 
364  b) In a single node setup, the distance would be minimum of device_count or
365  difference of device_count - shard_count. For example: If a single node server
366  running on 3 devices a shard column has values 0 to 9 in a table with 4 shards,
367  device to fragment keys mapping would be: device 1 - 4,8,3,7 device 2 - 1,5,9
368  device 3 - 2, 6 The bucket value would be 4(shards) - 3(devices) = 1 i.e. minimum
369  of device_count or difference.
370 
371  When a node has device count equal to or more than shard count then the
372  minimum distance is always at least shard_count * no of leaf nodes.
373  */
374  if (device_count < shard_count) {
375  bucket = g_leaf_count ? std::max(device_count, static_cast<size_t>(1))
376  : std::min(device_count, shard_count - device_count);
377  } else {
378  bucket = shard_count * std::max(g_leaf_count, static_cast<size_t>(1));
379  }
380  }
381 
382  return bucket;
383 }
384 
385 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptor(
386  const bool allow_multifrag,
387  const size_t max_groups_buffer_entry_count,
388  const int8_t crt_min_byte_width,
389  RenderInfo* render_info,
390  const bool output_columnar_hint) {
391  const auto shard_count =
 392  device_type_ == ExecutorDeviceType::GPU
 393  ? shard_count_for_top_groups(ra_exe_unit_, *executor_->getCatalog())
 394  : 0;
395  bool sort_on_gpu_hint =
396  device_type_ == ExecutorDeviceType::GPU && allow_multifrag &&
 397  !ra_exe_unit_.sort_info.order_entries.empty() &&
 398  gpuCanHandleOrderEntries(ra_exe_unit_.sort_info.order_entries);
 399  // must_use_baseline_sort is true iff we'd sort on GPU with the old algorithm
400  // but the total output buffer size would be too big or it's a sharded top query.
401  // For the sake of managing risk, use the new result set way very selectively for
402  // this case only (alongside the baseline layout we've enabled for a while now).
403  bool must_use_baseline_sort = shard_count;
404  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
405  while (true) {
406  query_mem_desc = initQueryMemoryDescriptorImpl(allow_multifrag,
407  max_groups_buffer_entry_count,
408  crt_min_byte_width,
409  sort_on_gpu_hint,
410  render_info,
411  must_use_baseline_sort,
412  output_columnar_hint);
413  CHECK(query_mem_desc);
414  if (query_mem_desc->sortOnGpu() &&
415  (query_mem_desc->getBufferSizeBytes(device_type_) +
416  align_to_int64(query_mem_desc->getEntryCount() * sizeof(int32_t))) >
417  2 * 1024 * 1024 * 1024L) {
418  must_use_baseline_sort = true;
419  sort_on_gpu_hint = false;
420  } else {
421  break;
422  }
423  }
424  return query_mem_desc;
425 }
426 
427 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptorImpl(
428  const bool allow_multifrag,
429  const size_t max_groups_buffer_entry_count,
430  const int8_t crt_min_byte_width,
431  const bool sort_on_gpu_hint,
432  RenderInfo* render_info,
433  const bool must_use_baseline_sort,
434  const bool output_columnar_hint) {
436 
437  const auto count_distinct_descriptors = initCountDistinctDescriptors();
438 
439  auto group_col_widths = get_col_byte_widths(ra_exe_unit_.groupby_exprs);
440 
441  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
442 
443  auto col_range_info_nosharding = getColRangeInfo();
444 
445  const auto shard_count =
 446  device_type_ == ExecutorDeviceType::GPU
 447  ? shard_count_for_top_groups(ra_exe_unit_, *executor_->getCatalog())
 448  : 0;
449 
450  const auto col_range_info =
451  ColRangeInfo{col_range_info_nosharding.hash_type_,
452  col_range_info_nosharding.min,
453  col_range_info_nosharding.max,
454  getShardedTopBucket(col_range_info_nosharding, shard_count),
455  col_range_info_nosharding.has_nulls};
456 
457  // Non-grouped aggregates do not support accessing aggregated ranges
458  // Keyless hash is currently only supported with single-column perfect hash
459  const auto keyless_info = !(is_group_by && col_range_info.hash_type_ ==
 460  QueryDescriptionType::GroupByPerfectHash)
 461  ? KeylessInfo{false, -1}
462  : getKeylessInfo(ra_exe_unit_.target_exprs, is_group_by);
463 
464  if (g_enable_watchdog &&
465  ((col_range_info.hash_type_ == QueryDescriptionType::GroupByBaselineHash &&
466  max_groups_buffer_entry_count > 120000000) ||
467  (col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
468  ra_exe_unit_.groupby_exprs.size() == 1 &&
469  (col_range_info.max - col_range_info.min) /
470  std::max(col_range_info.bucket, int64_t(1)) >
471  130000000))) {
472  throw WatchdogException("Query would use too much memory");
473  }
474  try {
 475  return QueryMemoryDescriptor::init(executor_,
 476  ra_exe_unit_,
477  query_infos_,
478  col_range_info,
479  keyless_info,
480  allow_multifrag,
481  device_type_,
482  crt_min_byte_width,
483  sort_on_gpu_hint,
484  shard_count,
485  max_groups_buffer_entry_count,
486  render_info,
487  count_distinct_descriptors,
488  must_use_baseline_sort,
489  output_columnar_hint,
490  /*streaming_top_n_hint=*/true);
491  } catch (const StreamingTopNOOM& e) {
492  LOG(WARNING) << e.what() << " Disabling Streaming Top N.";
 493  return QueryMemoryDescriptor::init(executor_,
 494  ra_exe_unit_,
495  query_infos_,
496  col_range_info,
497  keyless_info,
498  allow_multifrag,
499  device_type_,
500  crt_min_byte_width,
501  sort_on_gpu_hint,
502  shard_count,
503  max_groups_buffer_entry_count,
504  render_info,
505  count_distinct_descriptors,
506  must_use_baseline_sort,
507  output_columnar_hint,
508  /*streaming_top_n_hint=*/false);
509  }
510 }
511 
512 void GroupByAndAggregate::addTransientStringLiterals() {
 513  addTransientStringLiterals(ra_exe_unit_, executor_, row_set_mem_owner_);
 514 }
515 
516 namespace {
517 
518 void add_transient_string_literals_for_expression(
 519  const Analyzer::Expr* expr,
520  Executor* executor,
521  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner) {
522  if (!expr) {
523  return;
524  }
525 
526  const auto array_expr = dynamic_cast<const Analyzer::ArrayExpr*>(expr);
527  if (array_expr) {
528  for (size_t i = 0; i < array_expr->getElementCount(); i++) {
 529  add_transient_string_literals_for_expression(
 530  array_expr->getElement(i), executor, row_set_mem_owner);
531  }
532  return;
533  }
534 
535  const auto cast_expr = dynamic_cast<const Analyzer::UOper*>(expr);
536  const auto& expr_ti = expr->get_type_info();
537  if (cast_expr && cast_expr->get_optype() == kCAST && expr_ti.is_string()) {
538  CHECK_EQ(kENCODING_DICT, expr_ti.get_compression());
539  auto sdp = executor->getStringDictionaryProxy(
540  expr_ti.get_comp_param(), row_set_mem_owner, true);
541  CHECK(sdp);
542  const auto str_lit_expr =
543  dynamic_cast<const Analyzer::Constant*>(cast_expr->get_operand());
544  if (str_lit_expr && str_lit_expr->get_constval().stringval) {
545  sdp->getOrAddTransient(*str_lit_expr->get_constval().stringval);
546  }
547  return;
548  }
549  const auto case_expr = dynamic_cast<const Analyzer::CaseExpr*>(expr);
550  if (!case_expr) {
551  return;
552  }
553  Analyzer::DomainSet domain_set;
554  case_expr->get_domain(domain_set);
555  if (domain_set.empty()) {
556  return;
557  }
558  if (expr_ti.is_string()) {
559  CHECK_EQ(kENCODING_DICT, expr_ti.get_compression());
560  auto sdp = executor->getStringDictionaryProxy(
561  expr_ti.get_comp_param(), row_set_mem_owner, true);
562  CHECK(sdp);
563  for (const auto domain_expr : domain_set) {
564  const auto cast_expr = dynamic_cast<const Analyzer::UOper*>(domain_expr);
565  const auto str_lit_expr =
566  cast_expr && cast_expr->get_optype() == kCAST
567  ? dynamic_cast<const Analyzer::Constant*>(cast_expr->get_operand())
568  : dynamic_cast<const Analyzer::Constant*>(domain_expr);
569  if (str_lit_expr && str_lit_expr->get_constval().stringval) {
570  sdp->getOrAddTransient(*str_lit_expr->get_constval().stringval);
571  }
572  }
573  }
574 }
575 
576 } // namespace
577 
578 void GroupByAndAggregate::addTransientStringLiterals(
 579  const RelAlgExecutionUnit& ra_exe_unit,
580  Executor* executor,
581  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner) {
582  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
 583  add_transient_string_literals_for_expression(
 584  group_expr.get(), executor, row_set_mem_owner);
585  }
586  for (const auto target_expr : ra_exe_unit.target_exprs) {
587  const auto& target_type = target_expr->get_type_info();
588  if (target_type.is_string() && target_type.get_compression() != kENCODING_DICT) {
589  continue;
590  }
591  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
592  if (agg_expr) {
593  if (agg_expr->get_aggtype() == kSINGLE_VALUE ||
594  agg_expr->get_aggtype() == kSAMPLE) {
 595  add_transient_string_literals_for_expression(
 596  agg_expr->get_arg(), executor, row_set_mem_owner);
597  }
598  } else {
 599  add_transient_string_literals_for_expression(
 600  target_expr, executor, row_set_mem_owner);
601  }
602  }
603 }
604 
605 CountDistinctDescriptors GroupByAndAggregate::initCountDistinctDescriptors() {
 606  CountDistinctDescriptors count_distinct_descriptors;
607  for (const auto target_expr : ra_exe_unit_.target_exprs) {
608  auto agg_info = get_target_info(target_expr, g_bigint_count);
609  if (is_distinct_target(agg_info)) {
610  CHECK(agg_info.is_agg);
611  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
612  const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
613  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
614  if (arg_ti.is_bytes()) {
615  throw std::runtime_error(
616  "Strings must be dictionary-encoded for COUNT(DISTINCT).");
617  }
618  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_buffer()) {
619  throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
620  }
621  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
622  throw std::runtime_error(
623  "APPROX_COUNT_DISTINCT on geometry columns not supported");
624  }
625  if (agg_info.is_distinct && arg_ti.is_geometry()) {
626  throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
627  }
628  ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
629  auto arg_range_info =
630  arg_ti.is_fp() ? no_range_info : getExprRangeInfo(agg_expr->get_arg());
631  CountDistinctImplType count_distinct_impl_type{CountDistinctImplType::StdSet};
632  int64_t bitmap_sz_bits{0};
633  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
634  const auto error_rate = agg_expr->get_error_rate();
635  if (error_rate) {
636  CHECK(error_rate->get_type_info().get_type() == kINT);
637  CHECK_GE(error_rate->get_constval().intval, 1);
638  bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
639  } else {
640  bitmap_sz_bits = g_hll_precision_bits;
641  }
642  }
643  if (arg_range_info.isEmpty()) {
644  count_distinct_descriptors.emplace_back(
 645  CountDistinctDescriptor{CountDistinctImplType::Bitmap,
 646  0,
647  64,
648  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
649  device_type_,
650  1});
651  continue;
652  }
653  if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
654  !(arg_ti.is_buffer() || arg_ti.is_geometry())) { // TODO(alex): allow bitmap
655  // implementation for arrays
656  count_distinct_impl_type = CountDistinctImplType::Bitmap;
657  if (agg_info.agg_kind == kCOUNT) {
658  bitmap_sz_bits = arg_range_info.max - arg_range_info.min + 1;
659  const int64_t MAX_BITMAP_BITS{8 * 1000 * 1000 * 1000L};
660  if (bitmap_sz_bits <= 0 || bitmap_sz_bits > MAX_BITMAP_BITS) {
661  count_distinct_impl_type = CountDistinctImplType::StdSet;
662  }
663  }
664  }
665  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
666  count_distinct_impl_type == CountDistinctImplType::StdSet &&
667  !(arg_ti.is_array() || arg_ti.is_geometry())) {
668  count_distinct_impl_type = CountDistinctImplType::Bitmap;
669  }
670 
671  if (g_enable_watchdog && !(arg_range_info.isEmpty()) &&
672  count_distinct_impl_type == CountDistinctImplType::StdSet) {
673  throw WatchdogException("Cannot use a fast path for COUNT distinct");
674  }
675  const auto sub_bitmap_count =
 676  get_count_distinct_sub_bitmap_count(bitmap_sz_bits, ra_exe_unit_, device_type_);
 677  count_distinct_descriptors.emplace_back(
678  CountDistinctDescriptor{count_distinct_impl_type,
679  arg_range_info.min,
680  bitmap_sz_bits,
681  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
682  device_type_,
683  sub_bitmap_count});
684  } else {
685  count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
686  CountDistinctImplType::Invalid, 0, 0, false, device_type_, 0});
687  }
688  }
689  return count_distinct_descriptors;
690 }
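// Illustrative sizing of the descriptors above: COUNT(DISTINCT x) on an
// integer column with range [0, 999] gets a Bitmap descriptor of 1,000 bits
// per group, while a range wider than MAX_BITMAP_BITS (8 billion) falls back
// to StdSet, which the watchdog rejects when enabled. APPROX_COUNT_DISTINCT
// sizes its bitmap from the requested error rate via hll_size_for_rate(), or
// from g_hll_precision_bits when no rate is given, instead of from the column
// range.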
691 
701 KeylessInfo GroupByAndAggregate::getKeylessInfo(
 702  const std::vector<Analyzer::Expr*>& target_expr_list,
703  const bool is_group_by) const {
704  bool keyless{true}, found{false};
705  int32_t num_agg_expr{0};
706  int32_t index{0};
707  for (const auto target_expr : target_expr_list) {
708  const auto agg_info = get_target_info(target_expr, g_bigint_count);
709  const auto chosen_type = get_compact_type(agg_info);
710  if (agg_info.is_agg) {
711  num_agg_expr++;
712  }
713  if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
714  auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
715  CHECK(agg_expr);
716  const auto arg_expr = agg_arg(target_expr);
717  const bool float_argument_input = takes_float_argument(agg_info);
718  switch (agg_info.agg_kind) {
719  case kAVG:
720  ++index;
721  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
722  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
723  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
724  expr_range_info.hasNulls()) {
725  break;
726  }
727  }
728  found = true;
729  break;
730  case kCOUNT:
731  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
732  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
733  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
734  expr_range_info.hasNulls()) {
735  break;
736  }
737  }
738  found = true;
739  break;
740  case kSUM: {
741  auto arg_ti = arg_expr->get_type_info();
742  if (constrained_not_null(arg_expr, ra_exe_unit_.quals)) {
743  arg_ti.set_notnull(true);
744  }
745  if (!arg_ti.get_notnull()) {
746  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
747  if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
748  !expr_range_info.hasNulls()) {
749  found = true;
750  }
751  } else {
752  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
753  switch (expr_range_info.getType()) {
 754  case ExpressionRangeType::Float:
 755  case ExpressionRangeType::Double:
 756  if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
757  found = true;
758  }
759  break;
 760  case ExpressionRangeType::Integer:
 761  if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
762  found = true;
763  }
764  break;
765  default:
766  break;
767  }
768  }
769  break;
770  }
771  case kMIN: {
772  CHECK(agg_expr && agg_expr->get_arg());
773  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
774  if (arg_ti.is_string() || arg_ti.is_buffer()) {
775  break;
776  }
777  auto expr_range_info =
778  getExpressionRange(agg_expr->get_arg(), query_infos_, executor_);
779  auto init_max = get_agg_initial_val(agg_info.agg_kind,
780  chosen_type,
781  is_group_by || float_argument_input,
782  float_argument_input ? sizeof(float) : 8);
783  switch (expr_range_info.getType()) {
 784  case ExpressionRangeType::Float:
 785  case ExpressionRangeType::Double: {
 786  auto double_max =
787  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
788  if (expr_range_info.getFpMax() < double_max) {
789  found = true;
790  }
791  break;
792  }
 793  case ExpressionRangeType::Integer:
 794  if (expr_range_info.getIntMax() < init_max) {
795  found = true;
796  }
797  break;
798  default:
799  break;
800  }
801  break;
802  }
803  case kMAX: {
804  CHECK(agg_expr && agg_expr->get_arg());
805  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
806  if (arg_ti.is_string() || arg_ti.is_buffer()) {
807  break;
808  }
809  auto expr_range_info =
810  getExpressionRange(agg_expr->get_arg(), query_infos_, executor_);
811  // NULL sentinel and init value for kMAX are identical, which results in
812  // ambiguity in detecting empty keys in presence of nulls.
813  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
814  expr_range_info.hasNulls()) {
815  break;
816  }
817  auto init_min = get_agg_initial_val(agg_info.agg_kind,
818  chosen_type,
819  is_group_by || float_argument_input,
820  float_argument_input ? sizeof(float) : 8);
821  switch (expr_range_info.getType()) {
 822  case ExpressionRangeType::Float:
 823  case ExpressionRangeType::Double: {
 824  auto double_min =
825  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
826  if (expr_range_info.getFpMin() > double_min) {
827  found = true;
828  }
829  break;
830  }
 831  case ExpressionRangeType::Integer:
 832  if (expr_range_info.getIntMin() > init_min) {
833  found = true;
834  }
835  break;
836  default:
837  break;
838  }
839  break;
840  }
841  default:
842  keyless = false;
843  break;
844  }
845  }
846  if (!keyless) {
847  break;
848  }
849  if (!found) {
850  ++index;
851  }
852  }
853 
854  // shouldn't use keyless for projection only
855  return {
856  keyless && found,
857  index,
858  };
859 }
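// Illustrative reading of the result above: for SELECT dept, COUNT(*) ...
// GROUP BY dept with a single-column perfect hash layout, the COUNT slot's
// initial value is distinguishable from any real count, so {true, index} is
// returned and that slot doubles as the empty-entry marker -- no group key has
// to be written per row. As soon as a target cannot provide such a marker
// (e.g. a distinct aggregate, or a MIN whose range overlaps the init value),
// the keyless layout is abandoned.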
860 
861 bool GroupByAndAggregate::gpuCanHandleOrderEntries(
 862  const std::list<Analyzer::OrderEntry>& order_entries) {
863  if (order_entries.size() > 1) { // TODO(alex): lift this restriction
864  return false;
865  }
866  for (const auto& order_entry : order_entries) {
867  CHECK_GE(order_entry.tle_no, 1);
868  CHECK_LE(static_cast<size_t>(order_entry.tle_no), ra_exe_unit_.target_exprs.size());
869  const auto target_expr = ra_exe_unit_.target_exprs[order_entry.tle_no - 1];
870  if (!dynamic_cast<Analyzer::AggExpr*>(target_expr)) {
871  return false;
872  }
873  // TODO(alex): relax the restrictions
874  auto agg_expr = static_cast<Analyzer::AggExpr*>(target_expr);
875  if (agg_expr->get_is_distinct() || agg_expr->get_aggtype() == kAVG ||
876  agg_expr->get_aggtype() == kMIN || agg_expr->get_aggtype() == kMAX ||
877  agg_expr->get_aggtype() == kAPPROX_COUNT_DISTINCT) {
878  return false;
879  }
880  if (agg_expr->get_arg()) {
881  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
882  if (arg_ti.is_fp()) {
883  return false;
884  }
885  auto expr_range_info = getExprRangeInfo(agg_expr->get_arg());
 886  // TODO(adb): QMD not actually initialized here?
887  if ((!(expr_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
888  /* query_mem_desc.getGroupbyColCount() == 1 */ false) ||
889  expr_range_info.has_nulls) &&
890  order_entry.is_desc == order_entry.nulls_first) {
891  return false;
892  }
893  }
894  const auto& target_ti = target_expr->get_type_info();
895  CHECK(!target_ti.is_buffer());
896  if (!target_ti.is_integer()) {
897  return false;
898  }
899  }
900  return true;
901 }
902 
903 GroupByAndAggregate::DiamondCodegen::DiamondCodegen(
 904  llvm::Value* cond,
905  Executor* executor,
906  const bool chain_to_next,
907  const std::string& label_prefix,
908  DiamondCodegen* parent,
909  const bool share_false_edge_with_parent)
910  : executor_(executor), chain_to_next_(chain_to_next), parent_(parent) {
911  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
912  if (parent_) {
914  }
915  cond_true_ = llvm::BasicBlock::Create(LL_CONTEXT, label_prefix + "_true", CUR_FUNC);
916  if (share_false_edge_with_parent) {
917  CHECK(parent);
 918  orig_cond_false_ = cond_false_ = parent_->cond_false_;
 919  } else {
 920  cond_false_ = orig_cond_false_ =
 921  llvm::BasicBlock::Create(LL_CONTEXT, label_prefix + "_false", CUR_FUNC);
922  }
923 
924  LL_BUILDER.CreateCondBr(cond, cond_true_, cond_false_);
925  LL_BUILDER.SetInsertPoint(cond_true_);
926 }
927 
928 void GroupByAndAggregate::DiamondCodegen::setChainToNext() {
 929  CHECK(!parent_);
930  chain_to_next_ = true;
931 }
932 
933 void GroupByAndAggregate::DiamondCodegen::setFalseTarget(llvm::BasicBlock* cond_false) {
934  CHECK(!parent_ || orig_cond_false_ != parent_->cond_false_);
935  cond_false_ = cond_false;
936 }
937 
938 GroupByAndAggregate::DiamondCodegen::~DiamondCodegen() {
 939  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
940  if (parent_ && orig_cond_false_ != parent_->cond_false_) {
941  LL_BUILDER.CreateBr(parent_->cond_false_);
942  } else if (chain_to_next_) {
943  LL_BUILDER.CreateBr(cond_false_);
944  }
945  if (!parent_ || (!chain_to_next_ && cond_false_ != parent_->cond_false_)) {
946  LL_BUILDER.SetInsertPoint(orig_cond_false_);
947  }
948 }
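// Sketch of the control flow emitted by DiamondCodegen (illustrative): the
// constructor splits the current block with
//   br %cond, label %<prefix>_true, label %<prefix>_false
// and positions the builder in the true block; the destructor then either
// branches to the parent's false block, falls through when chain_to_next_ is
// set (so consecutive checks share one false edge), or resumes emission at the
// original false block, producing the usual if/else "diamond" around each
// generated condition.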
949 
950 bool GroupByAndAggregate::codegen(llvm::Value* filter_result,
951  llvm::BasicBlock* sc_false,
 952  QueryMemoryDescriptor& query_mem_desc,
 953  const CompilationOptions& co,
954  const GpuSharedMemoryContext& gpu_smem_context) {
955  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
956  CHECK(filter_result);
957 
958  bool can_return_error = false;
959  llvm::BasicBlock* filter_false{nullptr};
960 
961  {
962  const bool is_group_by = !ra_exe_unit_.groupby_exprs.empty();
963 
964  if (executor_->isArchMaxwell(co.device_type)) {
 965  executor_->prependForceSync();
 966  }
967  DiamondCodegen filter_cfg(filter_result,
968  executor_,
969  !is_group_by || query_mem_desc.usesGetGroupValueFast(),
970  "filter", // filter_true and filter_false basic blocks
971  nullptr,
972  false);
973  filter_false = filter_cfg.cond_false_;
974 
975  if (is_group_by) {
 976  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection &&
 977  !query_mem_desc.useStreamingTopN()) {
978  const auto crt_matched = get_arg_by_name(ROW_FUNC, "crt_matched");
979  LL_BUILDER.CreateStore(LL_INT(int32_t(1)), crt_matched);
980  auto total_matched_ptr = get_arg_by_name(ROW_FUNC, "total_matched");
981  llvm::Value* old_total_matched_val{nullptr};
 982  if (query_mem_desc.threadsShareMemory()) {
 983  old_total_matched_val =
984  LL_BUILDER.CreateAtomicRMW(llvm::AtomicRMWInst::Add,
985  total_matched_ptr,
986  LL_INT(int32_t(1)),
987  llvm::AtomicOrdering::Monotonic);
988  } else {
989  old_total_matched_val = LL_BUILDER.CreateLoad(total_matched_ptr);
990  LL_BUILDER.CreateStore(
991  LL_BUILDER.CreateAdd(old_total_matched_val, LL_INT(int32_t(1))),
992  total_matched_ptr);
993  }
994  auto old_total_matched_ptr = get_arg_by_name(ROW_FUNC, "old_total_matched");
995  LL_BUILDER.CreateStore(old_total_matched_val, old_total_matched_ptr);
996  }
997 
998  auto agg_out_ptr_w_idx = codegenGroupBy(query_mem_desc, co, filter_cfg);
999  if (query_mem_desc.usesGetGroupValueFast() ||
1000  query_mem_desc.getQueryDescriptionType() ==
 1001  QueryDescriptionType::GroupByPerfectHash) {
 1002  if (query_mem_desc.getGroupbyColCount() > 1) {
1003  filter_cfg.setChainToNext();
1004  }
1005  // Don't generate null checks if the group slot is guaranteed to be non-null,
1006  // as it's the case for get_group_value_fast* family.
1007  can_return_error = codegenAggCalls(
1008  agg_out_ptr_w_idx, {}, query_mem_desc, co, gpu_smem_context, filter_cfg);
1009  } else {
1010  {
1011  llvm::Value* nullcheck_cond{nullptr};
1012  if (query_mem_desc.didOutputColumnar()) {
1013  nullcheck_cond = LL_BUILDER.CreateICmpSGE(std::get<1>(agg_out_ptr_w_idx),
1014  LL_INT(int32_t(0)));
1015  } else {
1016  nullcheck_cond = LL_BUILDER.CreateICmpNE(
1017  std::get<0>(agg_out_ptr_w_idx),
1018  llvm::ConstantPointerNull::get(
1019  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)));
1020  }
1021  DiamondCodegen nullcheck_cfg(
1022  nullcheck_cond, executor_, false, "groupby_nullcheck", &filter_cfg, false);
 1023  codegenAggCalls(
 1024  agg_out_ptr_w_idx, {}, query_mem_desc, co, gpu_smem_context, filter_cfg);
1025  }
1026  can_return_error = true;
1027  if (query_mem_desc.getQueryDescriptionType() ==
 1028  QueryDescriptionType::Projection &&
 1029  query_mem_desc.useStreamingTopN()) {
1030  // Ignore rejection on pushing current row to top-K heap.
1031  LL_BUILDER.CreateRet(LL_INT(int32_t(0)));
1032  } else {
1033  CodeGenerator code_generator(executor_);
1034  LL_BUILDER.CreateRet(LL_BUILDER.CreateNeg(LL_BUILDER.CreateTrunc(
1035  // TODO(alex): remove the trunc once pos is converted to 32 bits
1036  code_generator.posArg(nullptr),
1037  get_int_type(32, LL_CONTEXT))));
1038  }
1039  }
1040  } else {
1041  if (ra_exe_unit_.estimator) {
1042  std::stack<llvm::BasicBlock*> array_loops;
1043  codegenEstimator(array_loops, filter_cfg, query_mem_desc, co);
1044  } else {
1045  auto arg_it = ROW_FUNC->arg_begin();
1046  std::vector<llvm::Value*> agg_out_vec;
1047  for (int32_t i = 0; i < get_agg_count(ra_exe_unit_.target_exprs); ++i) {
1048  agg_out_vec.push_back(&*arg_it++);
1049  }
1050  can_return_error = codegenAggCalls(std::make_tuple(nullptr, nullptr),
1051  agg_out_vec,
1052  query_mem_desc,
1053  co,
1054  gpu_smem_context,
1055  filter_cfg);
1056  }
1057  }
1058  }
1059 
1060  if (ra_exe_unit_.join_quals.empty()) {
1061  executor_->cgen_state_->ir_builder_.CreateRet(LL_INT(int32_t(0)));
1062  } else if (sc_false) {
1063  const auto saved_insert_block = LL_BUILDER.GetInsertBlock();
1064  LL_BUILDER.SetInsertPoint(sc_false);
1065  LL_BUILDER.CreateBr(filter_false);
1066  LL_BUILDER.SetInsertPoint(saved_insert_block);
1067  }
1068 
1069  return can_return_error;
1070 }
1071 
1072 llvm::Value* GroupByAndAggregate::codegenOutputSlot(
 1073  llvm::Value* groups_buffer,
 1074  const QueryMemoryDescriptor& query_mem_desc,
 1075  const CompilationOptions& co,
1076  DiamondCodegen& diamond_codegen) {
1077  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1079  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1080  const auto group_expr = ra_exe_unit_.groupby_exprs.front();
1081  CHECK(!group_expr);
1082  if (!query_mem_desc.didOutputColumnar()) {
1083  CHECK_EQ(size_t(0), query_mem_desc.getRowSize() % sizeof(int64_t));
1084  }
1085  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1086  ? 0
1087  : query_mem_desc.getRowSize() / sizeof(int64_t);
1088  CodeGenerator code_generator(executor_);
1089  if (query_mem_desc.useStreamingTopN()) {
1090  const auto& only_order_entry = ra_exe_unit_.sort_info.order_entries.front();
1091  CHECK_GE(only_order_entry.tle_no, int(1));
1092  const size_t target_idx = only_order_entry.tle_no - 1;
1093  CHECK_LT(target_idx, ra_exe_unit_.target_exprs.size());
1094  const auto order_entry_expr = ra_exe_unit_.target_exprs[target_idx];
1095  const auto chosen_bytes =
1096  static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(target_idx));
1097  auto order_entry_lv = executor_->cgen_state_->castToTypeIn(
1098  code_generator.codegen(order_entry_expr, true, co).front(), chosen_bytes * 8);
 1099  const auto n = ra_exe_unit_.sort_info.offset + ra_exe_unit_.sort_info.limit;
 1100  std::string fname = "get_bin_from_k_heap";
1101  const auto& oe_ti = order_entry_expr->get_type_info();
1102  llvm::Value* null_key_lv = nullptr;
1103  if (oe_ti.is_integer() || oe_ti.is_decimal() || oe_ti.is_time()) {
1104  const size_t bit_width = order_entry_lv->getType()->getIntegerBitWidth();
1105  switch (bit_width) {
1106  case 32:
1107  null_key_lv = LL_INT(static_cast<int32_t>(inline_int_null_val(oe_ti)));
1108  break;
1109  case 64:
1110  null_key_lv = LL_INT(static_cast<int64_t>(inline_int_null_val(oe_ti)));
1111  break;
1112  default:
1113  CHECK(false);
1114  }
1115  fname += "_int" + std::to_string(bit_width) + "_t";
1116  } else {
1117  CHECK(oe_ti.is_fp());
1118  if (order_entry_lv->getType()->isDoubleTy()) {
1119  null_key_lv = LL_FP(static_cast<double>(inline_fp_null_val(oe_ti)));
1120  } else {
1121  null_key_lv = LL_FP(static_cast<float>(inline_fp_null_val(oe_ti)));
1122  }
1123  fname += order_entry_lv->getType()->isDoubleTy() ? "_double" : "_float";
1124  }
1125  const auto key_slot_idx =
 1126  get_heap_key_slot_index(ra_exe_unit_.target_exprs, target_idx);
 1127  return emitCall(
1128  fname,
1129  {groups_buffer,
1130  LL_INT(n),
1131  LL_INT(row_size_quad),
1132  LL_INT(static_cast<uint32_t>(query_mem_desc.getColOffInBytes(key_slot_idx))),
1133  LL_BOOL(only_order_entry.is_desc),
1134  LL_BOOL(!order_entry_expr->get_type_info().get_notnull()),
1135  LL_BOOL(only_order_entry.nulls_first),
1136  null_key_lv,
1137  order_entry_lv});
1138  } else {
1139  llvm::Value* output_buffer_entry_count_lv{nullptr};
1141  output_buffer_entry_count_lv =
1142  LL_BUILDER.CreateLoad(get_arg_by_name(ROW_FUNC, "max_matched"));
1143  CHECK(output_buffer_entry_count_lv);
1144  }
1145  const auto group_expr_lv =
1146  LL_BUILDER.CreateLoad(get_arg_by_name(ROW_FUNC, "old_total_matched"));
1147  std::vector<llvm::Value*> args{
1148  groups_buffer,
1149  output_buffer_entry_count_lv
1150  ? output_buffer_entry_count_lv
1151  : LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1152  group_expr_lv,
1153  code_generator.posArg(nullptr)};
1154  if (query_mem_desc.didOutputColumnar()) {
1155  const auto columnar_output_offset =
1156  emitCall("get_columnar_scan_output_offset", args);
1157  return columnar_output_offset;
1158  }
1159  args.push_back(LL_INT(row_size_quad));
1160  return emitCall("get_scan_output_slot", args);
1161  }
1162 }
1163 
1164 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenGroupBy(
 1165  const QueryMemoryDescriptor& query_mem_desc,
 1166  const CompilationOptions& co,
1167  DiamondCodegen& diamond_codegen) {
1168  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1169  auto arg_it = ROW_FUNC->arg_begin();
1170  auto groups_buffer = arg_it++;
1171 
1172  std::stack<llvm::BasicBlock*> array_loops;
1173 
1174  // TODO(Saman): move this logic outside of this function.
 1175  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
 1176  if (query_mem_desc.didOutputColumnar()) {
1177  return std::make_tuple(
1178  &*groups_buffer,
1179  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen));
1180  } else {
1181  return std::make_tuple(
1182  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen),
1183  nullptr);
1184  }
1185  }
1186 
1187  CHECK(query_mem_desc.getQueryDescriptionType() ==
 1188  QueryDescriptionType::GroupByBaselineHash ||
 1189  query_mem_desc.getQueryDescriptionType() ==
 1190  QueryDescriptionType::GroupByPerfectHash);
1191 
1192  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1193  ? 0
1194  : query_mem_desc.getRowSize() / sizeof(int64_t);
1195 
1196  const auto col_width_size = query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1197  ? sizeof(int64_t)
1198  : query_mem_desc.getEffectiveKeyWidth();
1199  // for multi-column group by
1200  llvm::Value* group_key = nullptr;
1201  llvm::Value* key_size_lv = nullptr;
1202 
1203  if (!query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1204  key_size_lv = LL_INT(static_cast<int32_t>(query_mem_desc.getGroupbyColCount()));
1205  if (query_mem_desc.getQueryDescriptionType() ==
 1206  QueryDescriptionType::GroupByPerfectHash) {
 1207  group_key =
1208  LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1209  } else if (query_mem_desc.getQueryDescriptionType() ==
 1210  QueryDescriptionType::GroupByBaselineHash) {
 1211  group_key =
1212  col_width_size == sizeof(int32_t)
1213  ? LL_BUILDER.CreateAlloca(llvm::Type::getInt32Ty(LL_CONTEXT), key_size_lv)
1214  : LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1215  }
1216  CHECK(group_key);
1217  CHECK(key_size_lv);
1218  }
1219 
1220  int32_t subkey_idx = 0;
1221  CHECK(query_mem_desc.getGroupbyColCount() == ra_exe_unit_.groupby_exprs.size());
1222  for (const auto& group_expr : ra_exe_unit_.groupby_exprs) {
1223  const auto col_range_info = getExprRangeInfo(group_expr.get());
1224  const auto translated_null_value = static_cast<int64_t>(
1225  query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1226  ? checked_int64_t(query_mem_desc.getMaxVal()) +
1227  (query_mem_desc.getBucket() ? query_mem_desc.getBucket() : 1)
1228  : checked_int64_t(col_range_info.max) +
1229  (col_range_info.bucket ? col_range_info.bucket : 1));
1230 
1231  const bool col_has_nulls =
1232  query_mem_desc.getQueryDescriptionType() ==
 1233  QueryDescriptionType::GroupByPerfectHash
 1234  ? (query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1235  ? query_mem_desc.hasNulls()
1236  : col_range_info.has_nulls)
1237  : false;
1238 
1239  const auto group_expr_lvs =
1240  executor_->groupByColumnCodegen(group_expr.get(),
1241  col_width_size,
1242  co,
1243  col_has_nulls,
1244  translated_null_value,
1245  diamond_codegen,
1246  array_loops,
1247  query_mem_desc.threadsShareMemory());
1248  const auto group_expr_lv = group_expr_lvs.translated_value;
1249  if (query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1250  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1251  return codegenSingleColumnPerfectHash(query_mem_desc,
1252  co,
1253  &*groups_buffer,
1254  group_expr_lv,
1255  group_expr_lvs.original_value,
1256  row_size_quad);
1257  } else {
1258  // store the sub-key to the buffer
1259  LL_BUILDER.CreateStore(group_expr_lv,
1260  LL_BUILDER.CreateGEP(group_key, LL_INT(subkey_idx++)));
1261  }
1262  }
1263  if (query_mem_desc.getQueryDescriptionType() ==
 1264  QueryDescriptionType::GroupByPerfectHash) {
 1265  CHECK(ra_exe_unit_.groupby_exprs.size() != 1);
 1266  return codegenMultiColumnPerfectHash(
 1267  &*groups_buffer, group_key, key_size_lv, query_mem_desc, row_size_quad);
1268  } else if (query_mem_desc.getQueryDescriptionType() ==
 1269  QueryDescriptionType::GroupByBaselineHash) {
 1270  return codegenMultiColumnBaselineHash(co,
 1271  &*groups_buffer,
1272  group_key,
1273  key_size_lv,
1274  query_mem_desc,
1275  col_width_size,
1276  row_size_quad);
1277  }
1278  CHECK(false);
1279  return std::make_tuple(nullptr, nullptr);
1280 }
1281 
1282 std::tuple<llvm::Value*, llvm::Value*>
 1283 GroupByAndAggregate::codegenSingleColumnPerfectHash(
 1284  const QueryMemoryDescriptor& query_mem_desc,
 1285  const CompilationOptions& co,
1286  llvm::Value* groups_buffer,
1287  llvm::Value* group_expr_lv_translated,
1288  llvm::Value* group_expr_lv_original,
1289  const int32_t row_size_quad) {
1290  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1291  CHECK(query_mem_desc.usesGetGroupValueFast());
1292  std::string get_group_fn_name{query_mem_desc.didOutputColumnar()
1293  ? "get_columnar_group_bin_offset"
1294  : "get_group_value_fast"};
1295  if (!query_mem_desc.didOutputColumnar() && query_mem_desc.hasKeylessHash()) {
1296  get_group_fn_name += "_keyless";
1297  }
1298  if (query_mem_desc.interleavedBins(co.device_type)) {
1299  CHECK(!query_mem_desc.didOutputColumnar());
1300  CHECK(query_mem_desc.hasKeylessHash());
1301  get_group_fn_name += "_semiprivate";
1302  }
1303  std::vector<llvm::Value*> get_group_fn_args{&*groups_buffer,
1304  &*group_expr_lv_translated};
1305  if (group_expr_lv_original && get_group_fn_name == "get_group_value_fast" &&
1306  query_mem_desc.mustUseBaselineSort()) {
1307  get_group_fn_name += "_with_original_key";
1308  get_group_fn_args.push_back(group_expr_lv_original);
1309  }
1310  get_group_fn_args.push_back(LL_INT(query_mem_desc.getMinVal()));
1311  get_group_fn_args.push_back(LL_INT(query_mem_desc.getBucket()));
1312  if (!query_mem_desc.hasKeylessHash()) {
1313  if (!query_mem_desc.didOutputColumnar()) {
1314  get_group_fn_args.push_back(LL_INT(row_size_quad));
1315  }
1316  } else {
1317  if (!query_mem_desc.didOutputColumnar()) {
1318  get_group_fn_args.push_back(LL_INT(row_size_quad));
1319  }
1320  if (query_mem_desc.interleavedBins(co.device_type)) {
1321  auto warp_idx = emitCall("thread_warp_idx", {LL_INT(executor_->warpSize())});
1322  get_group_fn_args.push_back(warp_idx);
1323  get_group_fn_args.push_back(LL_INT(executor_->warpSize()));
1324  }
1325  }
1326  if (get_group_fn_name == "get_columnar_group_bin_offset") {
1327  return std::make_tuple(&*groups_buffer,
1328  emitCall(get_group_fn_name, get_group_fn_args));
1329  }
1330  return std::make_tuple(emitCall(get_group_fn_name, get_group_fn_args), nullptr);
1331 }
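// Illustrative mapping of the runtime function chosen above: a row-wise
// keyless layout with interleaved GPU bins resolves to
// get_group_value_fast_keyless_semiprivate, a plain row-wise layout to
// get_group_value_fast (optionally *_with_original_key under baseline sort),
// and a columnar layout to get_columnar_group_bin_offset, which yields a bin
// offset rather than a row pointer.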
1332 
1333 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenMultiColumnPerfectHash(
1334  llvm::Value* groups_buffer,
1335  llvm::Value* group_key,
1336  llvm::Value* key_size_lv,
1337  const QueryMemoryDescriptor& query_mem_desc,
1338  const int32_t row_size_quad) {
1339  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1340  CHECK(query_mem_desc.getQueryDescriptionType() ==
 1341  QueryDescriptionType::GroupByPerfectHash);
 1342  // compute the index (perfect hash)
1343  auto perfect_hash_func = codegenPerfectHashFunction();
1344  auto hash_lv =
1345  LL_BUILDER.CreateCall(perfect_hash_func, std::vector<llvm::Value*>{group_key});
1346 
1347  if (query_mem_desc.didOutputColumnar()) {
1348  if (!query_mem_desc.hasKeylessHash()) {
1349  const std::string set_matching_func_name{
1350  "set_matching_group_value_perfect_hash_columnar"};
1351  const std::vector<llvm::Value*> set_matching_func_arg{
1352  groups_buffer,
1353  hash_lv,
1354  group_key,
1355  key_size_lv,
1356  llvm::ConstantInt::get(get_int_type(32, LL_CONTEXT),
1357  query_mem_desc.getEntryCount())};
1358  emitCall(set_matching_func_name, set_matching_func_arg);
1359  }
1360  return std::make_tuple(groups_buffer, hash_lv);
1361  } else {
1362  if (query_mem_desc.hasKeylessHash()) {
1363  return std::make_tuple(emitCall("get_matching_group_value_perfect_hash_keyless",
1364  {groups_buffer, hash_lv, LL_INT(row_size_quad)}),
1365  nullptr);
1366  } else {
1367  return std::make_tuple(
1368  emitCall(
1369  "get_matching_group_value_perfect_hash",
1370  {groups_buffer, hash_lv, group_key, key_size_lv, LL_INT(row_size_quad)}),
1371  nullptr);
1372  }
1373  }
1374 }
1375 
1376 std::tuple<llvm::Value*, llvm::Value*>
 1377 GroupByAndAggregate::codegenMultiColumnBaselineHash(
 1378  const CompilationOptions& co,
1379  llvm::Value* groups_buffer,
1380  llvm::Value* group_key,
1381  llvm::Value* key_size_lv,
1382  const QueryMemoryDescriptor& query_mem_desc,
1383  const size_t key_width,
1384  const int32_t row_size_quad) {
1385  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1386  if (group_key->getType() != llvm::Type::getInt64PtrTy(LL_CONTEXT)) {
1387  CHECK(key_width == sizeof(int32_t));
1388  group_key =
1389  LL_BUILDER.CreatePointerCast(group_key, llvm::Type::getInt64PtrTy(LL_CONTEXT));
1390  }
1391  std::vector<llvm::Value*> func_args{
1392  groups_buffer,
1393  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1394  &*group_key,
1395  &*key_size_lv,
1396  LL_INT(static_cast<int32_t>(key_width))};
1397  std::string func_name{"get_group_value"};
1398  if (query_mem_desc.didOutputColumnar()) {
1399  func_name += "_columnar_slot";
1400  } else {
1401  func_args.push_back(LL_INT(row_size_quad));
1402  }
1403  if (co.with_dynamic_watchdog) {
1404  func_name += "_with_watchdog";
1405  }
1406  if (query_mem_desc.didOutputColumnar()) {
1407  return std::make_tuple(groups_buffer, emitCall(func_name, func_args));
1408  } else {
1409  return std::make_tuple(emitCall(func_name, func_args), nullptr);
1410  }
1411 }
1412 
1413 llvm::Function* GroupByAndAggregate::codegenPerfectHashFunction() {
 1414  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1415  CHECK_GT(ra_exe_unit_.groupby_exprs.size(), size_t(1));
1416  auto ft = llvm::FunctionType::get(
1417  get_int_type(32, LL_CONTEXT),
1418  std::vector<llvm::Type*>{llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)},
1419  false);
1420  auto key_hash_func = llvm::Function::Create(ft,
1421  llvm::Function::ExternalLinkage,
1422  "perfect_key_hash",
1423  executor_->cgen_state_->module_);
1424  executor_->cgen_state_->helper_functions_.push_back(key_hash_func);
1425  mark_function_always_inline(key_hash_func);
1426  auto& key_buff_arg = *key_hash_func->args().begin();
1427  llvm::Value* key_buff_lv = &key_buff_arg;
1428  auto bb = llvm::BasicBlock::Create(LL_CONTEXT, "entry", key_hash_func);
1429  llvm::IRBuilder<> key_hash_func_builder(bb);
1430  llvm::Value* hash_lv{llvm::ConstantInt::get(get_int_type(64, LL_CONTEXT), 0)};
1431  std::vector<int64_t> cardinalities;
1432  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1433  auto col_range_info = getExprRangeInfo(groupby_expr.get());
1434  CHECK(col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash);
1435  cardinalities.push_back(getBucketedCardinality(col_range_info));
1436  }
1437  size_t dim_idx = 0;
1438  for (const auto& groupby_expr : ra_exe_unit_.groupby_exprs) {
1439  auto key_comp_lv = key_hash_func_builder.CreateLoad(
1440  key_hash_func_builder.CreateGEP(key_buff_lv, LL_INT(dim_idx)));
1441  auto col_range_info = getExprRangeInfo(groupby_expr.get());
1442  auto crt_term_lv =
1443  key_hash_func_builder.CreateSub(key_comp_lv, LL_INT(col_range_info.min));
1444  if (col_range_info.bucket) {
1445  crt_term_lv =
1446  key_hash_func_builder.CreateSDiv(crt_term_lv, LL_INT(col_range_info.bucket));
1447  }
1448  for (size_t prev_dim_idx = 0; prev_dim_idx < dim_idx; ++prev_dim_idx) {
1449  crt_term_lv = key_hash_func_builder.CreateMul(crt_term_lv,
1450  LL_INT(cardinalities[prev_dim_idx]));
1451  }
1452  hash_lv = key_hash_func_builder.CreateAdd(hash_lv, crt_term_lv);
1453  ++dim_idx;
1454  }
1455  key_hash_func_builder.CreateRet(
1456  key_hash_func_builder.CreateTrunc(hash_lv, get_int_type(32, LL_CONTEXT)));
1457  return key_hash_func;
1458 }
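// Worked example of the hash emitted above (illustrative): for GROUP BY a, b
// where a has range [10, 19] (bucketed cardinality 10) and b has range [0, 4]
// (cardinality 5), perfect_key_hash computes
//   hash = (a - 10) + (b - 0) * 10
// i.e. a mixed-radix index in [0, 50); each later key component is scaled by
// the product of the cardinalities of all earlier components before the terms
// are summed and truncated to 32 bits.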
1459 
1460 llvm::Value* GroupByAndAggregate::convertNullIfAny(const SQLTypeInfo& arg_type,
 1461  const TargetInfo& agg_info,
1462  llvm::Value* target) {
1463  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1464  const auto& agg_type = agg_info.sql_type;
1465  const size_t chosen_bytes = agg_type.get_size();
1466 
1467  bool need_conversion{false};
1468  llvm::Value* arg_null{nullptr};
1469  llvm::Value* agg_null{nullptr};
1470  llvm::Value* target_to_cast{target};
1471  if (arg_type.is_fp()) {
1472  arg_null = executor_->cgen_state_->inlineFpNull(arg_type);
1473  if (agg_type.is_fp()) {
1474  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1475  if (!static_cast<llvm::ConstantFP*>(arg_null)->isExactlyValue(
1476  static_cast<llvm::ConstantFP*>(agg_null)->getValueAPF())) {
1477  need_conversion = true;
1478  }
1479  } else {
1480  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
1481  return target;
1482  }
1483  } else {
1484  arg_null = executor_->cgen_state_->inlineIntNull(arg_type);
1485  if (agg_type.is_fp()) {
1486  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1487  need_conversion = true;
1488  target_to_cast = executor_->castToFP(target);
1489  } else {
1490  agg_null = executor_->cgen_state_->inlineIntNull(agg_type);
1491  if ((static_cast<llvm::ConstantInt*>(arg_null)->getBitWidth() !=
1492  static_cast<llvm::ConstantInt*>(agg_null)->getBitWidth()) ||
1493  (static_cast<llvm::ConstantInt*>(arg_null)->getValue() !=
1494  static_cast<llvm::ConstantInt*>(agg_null)->getValue())) {
1495  need_conversion = true;
1496  }
1497  }
1498  }
1499  if (need_conversion) {
1500  auto cmp = arg_type.is_fp() ? LL_BUILDER.CreateFCmpOEQ(target, arg_null)
1501  : LL_BUILDER.CreateICmpEQ(target, arg_null);
1502  return LL_BUILDER.CreateSelect(
1503  cmp,
1504  agg_null,
1505  executor_->cgen_state_->castToTypeIn(target_to_cast, chosen_bytes << 3));
1506  } else {
1507  return target;
1508  }
1509 }
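// Illustrative case for the conversion above: a FLOAT argument aggregated into
// a DOUBLE slot has different null sentinels for argument and aggregate, so
// the emitted select() swaps an incoming float-null for the double-null before
// widening the value; COUNT-style aggregates return the target unchanged since
// they never propagate the argument's null sentinel.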
1510 
1511 llvm::Value* GroupByAndAggregate::codegenWindowRowPointer(
 1512  const Analyzer::WindowFunction* window_func,
1513  const QueryMemoryDescriptor& query_mem_desc,
1514  const CompilationOptions& co,
1515  DiamondCodegen& diamond_codegen) {
1516  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1517  const auto window_func_context =
1519  if (window_func_context && window_function_is_aggregate(window_func->getKind())) {
1520  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1521  ? 0
1522  : query_mem_desc.getRowSize() / sizeof(int64_t);
1523  auto arg_it = ROW_FUNC->arg_begin();
1524  auto groups_buffer = arg_it++;
1525  CodeGenerator code_generator(executor_);
1526  if (!window_func_context->getRowNumber()) {
1527  CHECK(window_func->getKind() == SqlWindowFunctionKind::COUNT);
1528  window_func_context->setRowNumber(emitCall(
1529  "row_number_window_func",
1530  {LL_INT(reinterpret_cast<const int64_t>(window_func_context->output())),
1531  code_generator.posArg(nullptr)}));
1532  }
1533  const auto pos_in_window = LL_BUILDER.CreateTrunc(window_func_context->getRowNumber(),
1534  get_int_type(32, LL_CONTEXT));
1535  llvm::Value* entry_count_lv =
1536  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount()));
1537  std::vector<llvm::Value*> args{
1538  &*groups_buffer, entry_count_lv, pos_in_window, code_generator.posArg(nullptr)};
1539  if (query_mem_desc.didOutputColumnar()) {
1540  const auto columnar_output_offset =
1541  emitCall("get_columnar_scan_output_offset", args);
1542  return LL_BUILDER.CreateSExt(columnar_output_offset, get_int_type(64, LL_CONTEXT));
1543  }
1544  args.push_back(LL_INT(row_size_quad));
1545  return emitCall("get_scan_output_slot", args);
1546  }
1547  auto arg_it = ROW_FUNC->arg_begin();
1548  auto groups_buffer = arg_it++;
1549  return codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen);
1550 }
1551 
1552 bool GroupByAndAggregate::codegenAggCalls(
 1553  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx_in,
1554  const std::vector<llvm::Value*>& agg_out_vec,
1555  const QueryMemoryDescriptor& query_mem_desc,
1556  const CompilationOptions& co,
1557  const GpuSharedMemoryContext& gpu_smem_context,
1558  DiamondCodegen& diamond_codegen) {
1559  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1560  auto agg_out_ptr_w_idx = agg_out_ptr_w_idx_in;
1561  // TODO(alex): unify the two cases, the output for non-group by queries
1562  // should be a contiguous buffer
1563  const bool is_group_by = std::get<0>(agg_out_ptr_w_idx);
1564  bool can_return_error = false;
1565  if (is_group_by) {
1566  CHECK(agg_out_vec.empty());
1567  } else {
1568  CHECK(!agg_out_vec.empty());
1569  }
1570 
1571  // output buffer is casted into a byte stream to be able to handle data elements of
1572  // different sizes (only used when actual column width sizes are used)
1573  llvm::Value* output_buffer_byte_stream{nullptr};
1574  llvm::Value* out_row_idx{nullptr};
1575  if (query_mem_desc.didOutputColumnar() && !g_cluster &&
 1576  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
 1577  output_buffer_byte_stream = LL_BUILDER.CreateBitCast(
1578  std::get<0>(agg_out_ptr_w_idx),
1579  llvm::PointerType::get(llvm::Type::getInt8Ty(LL_CONTEXT), 0));
1580  output_buffer_byte_stream->setName("out_buff_b_stream");
1581  CHECK(std::get<1>(agg_out_ptr_w_idx));
1582  out_row_idx = LL_BUILDER.CreateZExt(std::get<1>(agg_out_ptr_w_idx),
1583  llvm::Type::getInt64Ty(LL_CONTEXT));
1584  out_row_idx->setName("out_row_idx");
1585  }
1586 
1587  TargetExprCodegenBuilder target_builder(query_mem_desc, ra_exe_unit_, is_group_by);
1588  for (size_t target_idx = 0; target_idx < ra_exe_unit_.target_exprs.size();
1589  ++target_idx) {
1590  auto target_expr = ra_exe_unit_.target_exprs[target_idx];
1591  CHECK(target_expr);
1592 
1593  target_builder(target_expr, executor_, co);
1594  }
1595 
1596  target_builder.codegen(this,
1597  executor_,
1598  query_mem_desc,
1599  co,
1600  gpu_smem_context,
1601  agg_out_ptr_w_idx,
1602  agg_out_vec,
1603  output_buffer_byte_stream,
1604  out_row_idx,
1605  diamond_codegen);
1606 
1607  for (auto target_expr : ra_exe_unit_.target_exprs) {
1608  CHECK(target_expr);
1609  executor_->plan_state_->isLazyFetchColumn(target_expr);
1610  }
1611 
1612  return can_return_error;
1613 }
1614 
1615 /**
1616  * @brief: returns the pointer to where the aggregation should be stored.
1617  */
1618 llvm::Value* GroupByAndAggregate::codegenAggColumnPtr(
1619  llvm::Value* output_buffer_byte_stream,
1620  llvm::Value* out_row_idx,
1621  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx,
1622  const QueryMemoryDescriptor& query_mem_desc,
1623  const size_t chosen_bytes,
1624  const size_t agg_out_off,
1625  const size_t target_idx) {
1626  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1627  llvm::Value* agg_col_ptr{nullptr};
1628  if (query_mem_desc.didOutputColumnar()) {
1629  // TODO(Saman): remove the second columnar branch, and support all query description
1630  // types through the first branch. Then, input arguments should also be cleaned up
1631  if (!g_cluster &&
1632        query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1633  CHECK(chosen_bytes == 1 || chosen_bytes == 2 || chosen_bytes == 4 ||
1634  chosen_bytes == 8);
1635  CHECK(output_buffer_byte_stream);
1636  CHECK(out_row_idx);
1637  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1638  // multiplying by chosen_bytes, i.e., << log2(chosen_bytes)
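  // (e.g. chosen_bytes == 8: __builtin_ffs(8) - 1 == 3, so out_row_idx << 3 == out_row_idx * 8)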
1639  auto out_per_col_byte_idx =
1640  LL_BUILDER.CreateShl(out_row_idx, __builtin_ffs(chosen_bytes) - 1);
1641  auto byte_offset = LL_BUILDER.CreateAdd(out_per_col_byte_idx,
1642  LL_INT(static_cast<int64_t>(col_off)));
1643  byte_offset->setName("out_byte_off_target_" + std::to_string(target_idx));
1644  auto output_ptr = LL_BUILDER.CreateGEP(output_buffer_byte_stream, byte_offset);
1645  agg_col_ptr = LL_BUILDER.CreateBitCast(
1646  output_ptr,
1647  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1648  agg_col_ptr->setName("out_ptr_target_" + std::to_string(target_idx));
1649  } else {
1650  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1651  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1652  col_off /= chosen_bytes;
1653  CHECK(std::get<1>(agg_out_ptr_w_idx));
1654  auto offset = LL_BUILDER.CreateAdd(std::get<1>(agg_out_ptr_w_idx), LL_INT(col_off));
1655  agg_col_ptr = LL_BUILDER.CreateGEP(
1656  LL_BUILDER.CreateBitCast(
1657  std::get<0>(agg_out_ptr_w_idx),
1658  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0)),
1659  offset);
1660  }
1661  } else {
1662  uint32_t col_off = query_mem_desc.getColOnlyOffInBytes(agg_out_off);
1663  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1664  col_off /= chosen_bytes;
1665  agg_col_ptr = LL_BUILDER.CreateGEP(
1666  LL_BUILDER.CreateBitCast(
1667  std::get<0>(agg_out_ptr_w_idx),
1668  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0)),
1669  LL_INT(col_off));
1670  }
1671  CHECK(agg_col_ptr);
1672  return agg_col_ptr;
1673 }
1674 
1675 void GroupByAndAggregate::codegenEstimator(
1676  std::stack<llvm::BasicBlock*>& array_loops,
1677  GroupByAndAggregate::DiamondCodegen& diamond_codegen,
1678  const QueryMemoryDescriptor& query_mem_desc,
1679  const CompilationOptions& co) {
1680  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1681  const auto& estimator_arg = ra_exe_unit_.estimator->getArgument();
1682  auto estimator_comp_count_lv = LL_INT(static_cast<int32_t>(estimator_arg.size()));
1683  auto estimator_key_lv = LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT),
1684  estimator_comp_count_lv);
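  // Each estimator argument is codegen'd as a 64-bit subkey and stored into the
  // stack buffer allocated above; the filled buffer is then handed to the
  // estimator's runtime function as a raw byte blob.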
1685  int32_t subkey_idx = 0;
1686  for (const auto& estimator_arg_comp : estimator_arg) {
1687  const auto estimator_arg_comp_lvs =
1688  executor_->groupByColumnCodegen(estimator_arg_comp.get(),
1689  query_mem_desc.getEffectiveKeyWidth(),
1690  co,
1691  false,
1692  0,
1693  diamond_codegen,
1694  array_loops,
1695  true);
1696  CHECK(!estimator_arg_comp_lvs.original_value);
1697  const auto estimator_arg_comp_lv = estimator_arg_comp_lvs.translated_value;
1698  // store the sub-key to the buffer
1699  LL_BUILDER.CreateStore(estimator_arg_comp_lv,
1700  LL_BUILDER.CreateGEP(estimator_key_lv, LL_INT(subkey_idx++)));
1701  }
1702  const auto int8_ptr_ty = llvm::PointerType::get(get_int_type(8, LL_CONTEXT), 0);
1703  const auto bitmap = LL_BUILDER.CreateBitCast(&*ROW_FUNC->arg_begin(), int8_ptr_ty);
1704  const auto key_bytes = LL_BUILDER.CreateBitCast(estimator_key_lv, int8_ptr_ty);
1705  const auto estimator_comp_bytes_lv =
1706  LL_INT(static_cast<int32_t>(estimator_arg.size() * sizeof(int64_t)));
1707  const auto bitmap_size_lv =
1708  LL_INT(static_cast<uint32_t>(ra_exe_unit_.estimator->getBufferSize()));
1709  emitCall(ra_exe_unit_.estimator->getRuntimeFunctionName(),
1710  {bitmap, &*bitmap_size_lv, key_bytes, &*estimator_comp_bytes_lv});
1711 }
1712 
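// CPU runtime handlers for the std::set-based COUNT(DISTINCT) implementation: the
// 64-bit aggregate slot holds a pointer to a std::set<int64_t> that collects the
// distinct values.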
1713 extern "C" void agg_count_distinct(int64_t* agg, const int64_t val) {
1714  reinterpret_cast<std::set<int64_t>*>(*agg)->insert(val);
1715 }
1716 
1717 extern "C" void agg_count_distinct_skip_val(int64_t* agg,
1718  const int64_t val,
1719  const int64_t skip_val) {
1720  if (val != skip_val) {
1721  agg_count_distinct(agg, val);
1722  }
1723 }
1724 
1725 void GroupByAndAggregate::codegenCountDistinct(
1726  const size_t target_idx,
1727  const Analyzer::Expr* target_expr,
1728  std::vector<llvm::Value*>& agg_args,
1729  const QueryMemoryDescriptor& query_mem_desc,
1730  const ExecutorDeviceType device_type) {
1731  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1732  const auto agg_info = get_target_info(target_expr, g_bigint_count);
1733  const auto& arg_ti =
1734  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1735  if (arg_ti.is_fp()) {
1736  agg_args.back() = executor_->cgen_state_->ir_builder_.CreateBitCast(
1737  agg_args.back(), get_int_type(64, executor_->cgen_state_->context_));
1738  }
1739  const auto& count_distinct_descriptor =
1740  query_mem_desc.getCountDistinctDescriptor(target_idx);
1741  CHECK(count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid);
1742  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
1743  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1744  agg_args.push_back(LL_INT(int32_t(count_distinct_descriptor.bitmap_sz_bits)));
1745  if (device_type == ExecutorDeviceType::GPU) {
1746  const auto base_dev_addr = getAdditionalLiteral(-1);
1747  const auto base_host_addr = getAdditionalLiteral(-2);
1748  agg_args.push_back(base_dev_addr);
1749  agg_args.push_back(base_host_addr);
1750  emitCall("agg_approximate_count_distinct_gpu", agg_args);
1751  } else {
1752  emitCall("agg_approximate_count_distinct", agg_args);
1753  }
1754  return;
1755  }
1756  std::string agg_fname{"agg_count_distinct"};
1757  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1758  agg_fname += "_bitmap";
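    // The bitmap variant biases each value by min_val to derive its bit position,
    // so min_val is passed as an extra argument.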
1759  agg_args.push_back(LL_INT(static_cast<int64_t>(count_distinct_descriptor.min_val)));
1760  }
1761  if (agg_info.skip_null_val) {
1762  auto null_lv = executor_->cgen_state_->castToTypeIn(
1763  (arg_ti.is_fp()
1764  ? static_cast<llvm::Value*>(executor_->cgen_state_->inlineFpNull(arg_ti))
1765  : static_cast<llvm::Value*>(executor_->cgen_state_->inlineIntNull(arg_ti))),
1766  64);
1767  null_lv = executor_->cgen_state_->ir_builder_.CreateBitCast(
1768  null_lv, get_int_type(64, executor_->cgen_state_->context_));
1769  agg_fname += "_skip_val";
1770  agg_args.push_back(null_lv);
1771  }
1772  if (device_type == ExecutorDeviceType::GPU) {
1773  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1774  agg_fname += "_gpu";
1775  const auto base_dev_addr = getAdditionalLiteral(-1);
1776  const auto base_host_addr = getAdditionalLiteral(-2);
1777  agg_args.push_back(base_dev_addr);
1778  agg_args.push_back(base_host_addr);
1779  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.sub_bitmap_count)));
1780  CHECK_EQ(size_t(0),
1781  count_distinct_descriptor.bitmapPaddedSizeBytes() %
1782  count_distinct_descriptor.sub_bitmap_count);
1783  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.bitmapPaddedSizeBytes() /
1784  count_distinct_descriptor.sub_bitmap_count)));
1785  }
1786  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1787  emitCall(agg_fname, agg_args);
1788  } else {
1789  executor_->cgen_state_->emitExternalCall(
1790  agg_fname, llvm::Type::getVoidTy(LL_CONTEXT), agg_args);
1791  }
1792 }
1793 
1794 void GroupByAndAggregate::codegenApproxMedian(const size_t target_idx,
1795  const Analyzer::Expr* target_expr,
1796  std::vector<llvm::Value*>& agg_args,
1797  const QueryMemoryDescriptor& query_mem_desc,
1798  const ExecutorDeviceType device_type) {
1799  if (device_type == ExecutorDeviceType::GPU) {
1800  throw QueryMustRunOnCpu();
1801  }
1802  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1803  auto const arg_ti =
1804  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1805  if (!arg_ti.is_fp()) {
1806  agg_args.back() = executor_->castToFP(agg_args.back());
1807  }
1808  constexpr size_t MAXLEN = std::string_view("agg_approx_median_skip_val_gpu").size();
1809  char agg_fname[MAXLEN + 1] = "agg_approx_median";
1810  auto const agg_info = get_target_info(target_expr, g_bigint_count);
1811  if (agg_info.skip_null_val) {
1812  strcat(agg_fname, "_skip_val");
1813  auto* skip_val = executor_->cgen_state_->intNullValue(SQLTypeInfo(kDOUBLE), 64);
1814  agg_args.push_back(skip_val);
1815  }
1816  emitCall(agg_fname, agg_args);
1817 }
1818 
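// Additional literals are read at negative 64-bit offsets from the "literals"
// argument; the callers above use -1 and -2 for the device and host base addresses
// of the count-distinct buffers on GPU.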
1819 llvm::Value* GroupByAndAggregate::getAdditionalLiteral(const int32_t off) {
1820  CHECK_LT(off, 0);
1821  const auto lit_buff_lv = get_arg_by_name(ROW_FUNC, "literals");
1822  return LL_BUILDER.CreateLoad(LL_BUILDER.CreateGEP(
1823  LL_BUILDER.CreateBitCast(lit_buff_lv,
1824  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)),
1825  LL_INT(off)));
1826 }
1827 
1828 std::vector<llvm::Value*> GroupByAndAggregate::codegenAggArg(
1829  const Analyzer::Expr* target_expr,
1830  const CompilationOptions& co) {
1831  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
1832  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
1833  const auto func_expr = dynamic_cast<const Analyzer::FunctionOper*>(target_expr);
1834  const auto arr_expr = dynamic_cast<const Analyzer::ArrayExpr*>(target_expr);
1835 
1836  // TODO(alex): handle arrays uniformly?
1837  CodeGenerator code_generator(executor_);
1838  if (target_expr) {
1839  const auto& target_ti = target_expr->get_type_info();
1840  if (target_ti.is_buffer() &&
1841  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
1842  const auto target_lvs =
1843  agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
1844  : code_generator.codegen(
1845  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
1846  if (!func_expr && !arr_expr) {
1847  // Anything using the chunk transport here was generated from a source
1848  // other than an ARRAY[] expression
1849  if (target_ti.is_bytes()) {
1850  CHECK_EQ(size_t(3), target_lvs.size());
1851  return {target_lvs[1], target_lvs[2]};
1852  }
1853  CHECK(target_ti.is_array());
1854  CHECK_EQ(size_t(1), target_lvs.size());
1855  CHECK(!agg_expr || agg_expr->get_aggtype() == kSAMPLE);
1856  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
1857  const auto i8p_ty =
1858  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
1859  const auto& elem_ti = target_ti.get_elem_type();
1860  return {
1861  executor_->cgen_state_->emitExternalCall(
1862  "array_buff",
1863  i8p_ty,
1864  {target_lvs.front(), code_generator.posArg(target_expr)}),
1865  executor_->cgen_state_->emitExternalCall(
1866  "array_size",
1867  i32_ty,
1868  {target_lvs.front(),
1869  code_generator.posArg(target_expr),
1870  executor_->cgen_state_->llInt(log2_bytes(elem_ti.get_logical_size()))})};
1871  } else {
1872  if (agg_expr) {
1873  throw std::runtime_error(
1874  "Using array[] operator as argument to an aggregate operator is not "
1875  "supported");
1876  }
1877  CHECK(func_expr || arr_expr);
1878  if (dynamic_cast<const Analyzer::FunctionOper*>(target_expr)) {
1879  CHECK_EQ(size_t(1), target_lvs.size());
1880  const auto prefix = target_ti.get_buffer_name();
1881  CHECK(target_ti.is_array() || target_ti.is_bytes());
1882  const auto target_lv = LL_BUILDER.CreateLoad(target_lvs[0]);
1883  // const auto target_lv_type = target_lvs[0]->getType();
1884  // CHECK(target_lv_type->isStructTy());
1885  // CHECK_EQ(target_lv_type->getNumContainedTypes(), 3u);
1886  const auto i8p_ty = llvm::PointerType::get(
1887  get_int_type(8, executor_->cgen_state_->context_), 0);
1888  const auto ptr = LL_BUILDER.CreatePointerCast(
1889  LL_BUILDER.CreateExtractValue(target_lv, 0), i8p_ty);
1890  const auto size = LL_BUILDER.CreateExtractValue(target_lv, 1);
1891  const auto null_flag = LL_BUILDER.CreateExtractValue(target_lv, 2);
1892  const auto nullcheck_ok_bb =
1893  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_nullcheck_ok_bb", CUR_FUNC);
1894  const auto nullcheck_fail_bb = llvm::BasicBlock::Create(
1895  LL_CONTEXT, prefix + "_nullcheck_fail_bb", CUR_FUNC);
1896 
1897  // TODO(adb): probably better to zext the bool
1898  const auto nullcheck = LL_BUILDER.CreateICmpEQ(
1899  null_flag, executor_->cgen_state_->llInt(static_cast<int8_t>(1)));
1900  LL_BUILDER.CreateCondBr(nullcheck, nullcheck_fail_bb, nullcheck_ok_bb);
1901 
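      // Create the merge block and its PHI up front; both the null-check ok and
      // fail branches filled in below jump to it.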
1902  const auto ret_bb =
1903  llvm::BasicBlock::Create(LL_CONTEXT, prefix + "_return", CUR_FUNC);
1904  LL_BUILDER.SetInsertPoint(ret_bb);
1905  auto result_phi = LL_BUILDER.CreatePHI(i8p_ty, 2, prefix + "_ptr_return");
1906  result_phi->addIncoming(ptr, nullcheck_ok_bb);
1907  const auto null_arr_sentinel = LL_BUILDER.CreateIntToPtr(
1908  executor_->cgen_state_->llInt(static_cast<int8_t>(0)), i8p_ty);
1909  result_phi->addIncoming(null_arr_sentinel, nullcheck_fail_bb);
1910  LL_BUILDER.SetInsertPoint(nullcheck_ok_bb);
1911  executor_->cgen_state_->emitExternalCall(
1912  "register_buffer_with_executor_rsm",
1913  llvm::Type::getVoidTy(executor_->cgen_state_->context_),
1914  {executor_->cgen_state_->llInt(reinterpret_cast<int64_t>(executor_)), ptr});
1915  LL_BUILDER.CreateBr(ret_bb);
1916  LL_BUILDER.SetInsertPoint(nullcheck_fail_bb);
1917  LL_BUILDER.CreateBr(ret_bb);
1918 
1919  LL_BUILDER.SetInsertPoint(ret_bb);
1920  return {result_phi, size};
1921  }
1922  CHECK_EQ(size_t(2), target_lvs.size());
1923  return {target_lvs[0], target_lvs[1]};
1924  }
1925  }
1926  if (target_ti.is_geometry() &&
1927  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
1928  auto generate_coord_lvs =
1929  [&](auto* selected_target_expr,
1930  bool const fetch_columns) -> std::vector<llvm::Value*> {
1931  const auto target_lvs =
1932  code_generator.codegen(selected_target_expr, fetch_columns, co);
1933  const auto geo_uoper = dynamic_cast<const Analyzer::GeoUOper*>(target_expr);
1934  const auto geo_binoper = dynamic_cast<const Analyzer::GeoBinOper*>(target_expr);
1935  if (geo_uoper || geo_binoper) {
1936  CHECK(target_expr->get_type_info().is_geometry());
1937  CHECK_EQ(2 * static_cast<size_t>(target_ti.get_physical_coord_cols()),
1938  target_lvs.size());
1939  return target_lvs;
1940  }
1941  CHECK_EQ(static_cast<size_t>(target_ti.get_physical_coord_cols()),
1942  target_lvs.size());
1943 
1944  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
1945  const auto i8p_ty =
1946  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
1947  std::vector<llvm::Value*> coords;
1948  size_t ctr = 0;
1949  for (const auto& target_lv : target_lvs) {
1950  // TODO(adb): consider adding a utility to sqltypes so we can get the types of
1951  // the physical coords cols based on the sqltype (e.g. TINYINT for col 0, INT
1952  // for col 1 for pols / mpolys, etc). Hardcoding for now. The first array is the
1953  // coords array (TINYINT); subsequent arrays are regular INT.
1954 
1955  const size_t elem_sz = ctr == 0 ? 1 : 4;
1956  ctr++;
1957  int32_t fixlen = -1;
1958  if (target_ti.get_type() == kPOINT) {
1959  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
1960  if (col_var) {
1961  const auto coords_cd = executor_->getPhysicalColumnDescriptor(col_var, 1);
1962  if (coords_cd && coords_cd->columnType.get_type() == kARRAY) {
1963  fixlen = coords_cd->columnType.get_size();
1964  }
1965  }
1966  }
1967  if (fixlen > 0) {
1968  coords.push_back(executor_->cgen_state_->emitExternalCall(
1969  "fast_fixlen_array_buff",
1970  i8p_ty,
1971  {target_lv, code_generator.posArg(selected_target_expr)}));
1972  coords.push_back(executor_->cgen_state_->llInt(int64_t(fixlen)));
1973  continue;
1974  }
1975  coords.push_back(executor_->cgen_state_->emitExternalCall(
1976  "array_buff",
1977  i8p_ty,
1978  {target_lv, code_generator.posArg(selected_target_expr)}));
1979  coords.push_back(executor_->cgen_state_->emitExternalCall(
1980  "array_size",
1981  i32_ty,
1982  {target_lv,
1983  code_generator.posArg(selected_target_expr),
1984  executor_->cgen_state_->llInt(log2_bytes(elem_sz))}));
1985  }
1986  return coords;
1987  };
1988 
1989  if (agg_expr) {
1990  return generate_coord_lvs(agg_expr->get_arg(), true);
1991  } else {
1992  return generate_coord_lvs(target_expr,
1993  !executor_->plan_state_->allow_lazy_fetch_);
1994  }
1995  }
1996  }
1997  return agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
1998  : code_generator.codegen(
1999  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
2000 }
2001 
2002 llvm::Value* GroupByAndAggregate::emitCall(const std::string& fname,
2003  const std::vector<llvm::Value*>& args) {
2004  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2005  return executor_->cgen_state_->emitCall(fname, args);
2006 }
2007 
2008 void GroupByAndAggregate::checkErrorCode(llvm::Value* retCode) {
2009  AUTOMATIC_IR_METADATA(executor_->cgen_state_.get());
2010  auto zero_const = llvm::ConstantInt::get(retCode->getType(), 0, true);
2011  auto rc_check_condition = executor_->cgen_state_->ir_builder_.CreateICmp(
2012  llvm::ICmpInst::ICMP_EQ, retCode, zero_const);
2013 
2014  executor_->cgen_state_->emitErrorCheck(rc_check_condition, retCode, "rc");
2015 }
2016 
2017 #undef CUR_FUNC
2018 #undef ROW_FUNC
2019 #undef LL_FP
2020 #undef LL_INT
2021 #undef LL_BOOL
2022 #undef LL_BUILDER
2023 #undef LL_CONTEXT
2024 
2025 size_t GroupByAndAggregate::shard_count_for_top_groups(
2026  const RelAlgExecutionUnit& ra_exe_unit,
2027  const Catalog_Namespace::Catalog& catalog) {
2028  if (ra_exe_unit.sort_info.order_entries.size() != 1 || !ra_exe_unit.sort_info.limit) {
2029  return 0;
2030  }
2031  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
2032  const auto grouped_col_expr =
2033  dynamic_cast<const Analyzer::ColumnVar*>(group_expr.get());
2034  if (!grouped_col_expr) {
2035  continue;
2036  }
2037  if (grouped_col_expr->get_table_id() <= 0) {
2038  return 0;
2039  }
2040  const auto td = catalog.getMetadataForTable(grouped_col_expr->get_table_id());
2041  if (td->shardedColumnId == grouped_col_expr->get_column_id()) {
2042  return td->nShards;
2043  }
2044  }
2045  return 0;
2046 }