OmniSciDB  c07336695a
GroupByAndAggregate.cpp
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "GroupByAndAggregate.h"
18 #include "AggregateUtils.h"
20 
21 #include "CardinalityEstimator.h"
22 #include "CodeGenerator.h"
24 #include "ExpressionRange.h"
25 #include "ExpressionRewrite.h"
26 #include "GpuInitGroups.h"
27 #include "InPlaceSort.h"
29 #include "MaxwellCodegenPatch.h"
31 #include "TargetExprBuilder.h"
32 
33 #include "../CudaMgr/CudaMgr.h"
34 #include "../Shared/checked_alloc.h"
35 #include "../Utils/ChunkIter.h"
37 #include "Execute.h"
38 #include "QueryTemplateGenerator.h"
39 #include "RuntimeFunctions.h"
40 #include "StreamingTopN.h"
41 #include "TopKSort.h"
42 #include "WindowContext.h"
43 
44 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
45 
46 #include <numeric>
47 #include <thread>
48 
49 bool g_cluster{false};
50 bool g_bigint_count{false};
52 extern size_t g_leaf_count;
53 
54 namespace {
55 
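// get_agg_count() returns the number of physical output slots the target
// expressions need: AVG takes two slots (sum and count), none-encoded strings
// and arrays take two (pointer and length), geometry takes two per physical
// coordinate column, and everything else takes one.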
56 int32_t get_agg_count(const std::vector<Analyzer::Expr*>& target_exprs) {
57  int32_t agg_count{0};
58  for (auto target_expr : target_exprs) {
59  CHECK(target_expr);
60  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
61  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
62  const auto& ti = target_expr->get_type_info();
63  // TODO(pavan): or if is_geometry()
64  if (ti.is_array() || (ti.is_string() && ti.get_compression() == kENCODING_NONE)) {
65  agg_count += 2;
66  } else if (ti.is_geometry()) {
67  agg_count += ti.get_physical_coord_cols() * 2;
68  } else {
69  ++agg_count;
70  }
71  continue;
72  }
73  if (agg_expr && agg_expr->get_aggtype() == kAVG) {
74  agg_count += 2;
75  } else {
76  ++agg_count;
77  }
78  }
79  return agg_count;
80 }
81 
82 bool expr_is_rowid(const Analyzer::Expr* expr, const Catalog_Namespace::Catalog& cat) {
83  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
84  if (!col) {
85  return false;
86  }
87  const auto cd =
88  get_column_descriptor_maybe(col->get_column_id(), col->get_table_id(), cat);
89  if (!cd || !cd->isVirtualCol) {
90  return false;
91  }
92  CHECK_EQ("rowid", cd->columnName);
93  return true;
94 }
95 
96 bool has_count_distinct(const RelAlgExecutionUnit& ra_exe_unit) {
97  for (const auto& target_expr : ra_exe_unit.target_exprs) {
98  const auto agg_info = get_target_info(target_expr, g_bigint_count);
99  if (agg_info.is_agg && is_distinct_target(agg_info)) {
100  return true;
101  }
102  }
103  return false;
104 }
105 
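// Perfect hash allocates one entry per possible key value, so a column range
// wider than the available entry budget (or one that overflows int64 when
// subtracting min from max) cannot use it; e.g. a BIGINT key spread over
// [0, 10^12] would require a trillion-entry buffer.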
106 bool is_column_range_too_big_for_perfect_hash(const ColRangeInfo& col_range_info,
107  const int64_t max_entry_count) {
108  try {
109  return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
110  checked_int64_t(col_range_info.min)) >= max_entry_count;
111  } catch (...) {
112  return true;
113  }
114 }
115 
116 } // namespace
117 
118 ColRangeInfo GroupByAndAggregate::getColRangeInfo() {
119  // Use baseline layout more eagerly on the GPU if the query uses count distinct,
120  // because our HyperLogLog implementation is 4x less memory efficient on GPU.
121  // Technically, this only applies to APPROX_COUNT_DISTINCT, but in practice we
122  // can expect this to be true anyway for grouped queries since the precise version
123  // uses significantly more memory.
124  const int64_t baseline_threshold =
125  has_count_distinct(ra_exe_unit_)
126  ? (device_type_ == ExecutorDeviceType::GPU ? (Executor::baseline_threshold / 4)
127  : Executor::baseline_threshold)
128  : Executor::baseline_threshold;
129  if (ra_exe_unit_.groupby_exprs.size() != 1) {
130  try {
131  checked_int64_t cardinality{1};
132  bool has_nulls{false};
133  for (const auto groupby_expr : ra_exe_unit_.groupby_exprs) {
134  auto col_range_info = getExprRangeInfo(groupby_expr.get());
135  if (col_range_info.hash_type_ != QueryDescriptionType::GroupByPerfectHash) {
136  // going through baseline hash if a non-integer type is encountered
137  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
138  }
139  auto crt_col_cardinality = getBucketedCardinality(col_range_info);
140  CHECK_GE(crt_col_cardinality, 0);
141  cardinality *= crt_col_cardinality;
142  if (col_range_info.has_nulls) {
143  has_nulls = true;
144  }
145  }
146  // For zero or high cardinalities, use baseline layout.
147  if (!cardinality || cardinality > baseline_threshold) {
148  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
149  }
150  return {QueryDescriptionType::GroupByPerfectHash,
151  0,
152  int64_t(cardinality),
153  0,
154  has_nulls};
155  } catch (...) { // overflow when computing cardinality
156  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
157  }
158  }
159  // For single-column group by on high-precision timestamps, force baseline hash due to the wide ranges
160  // we are likely to encounter when applying quals to the expression range
161  // TODO: consider allowing TIMESTAMP(9) (nanoseconds) with quals to use perfect hash if
162  // the range is small enough
163  if (ra_exe_unit_.groupby_exprs.front() &&
164  ra_exe_unit_.groupby_exprs.front()->get_type_info().is_high_precision_timestamp() &&
165  ra_exe_unit_.simple_quals.size() > 0) {
166  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
167  }
168  const auto col_range_info = getExprRangeInfo(ra_exe_unit_.groupby_exprs.front().get());
169  if (!ra_exe_unit_.groupby_exprs.front()) {
170  return col_range_info;
171  }
172  static const int64_t MAX_BUFFER_SIZE = 1 << 30;
173  const int64_t col_count =
174  ra_exe_unit_.groupby_exprs.size() + ra_exe_unit_.target_exprs.size();
175  int64_t max_entry_count = MAX_BUFFER_SIZE / (col_count * sizeof(int64_t));
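 // e.g. with the 1 GiB buffer cap and 4 total columns (group keys plus targets),
 // the budget is 2^30 / (4 * 8) = 33,554,432 perfect-hash entries.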
176  if (has_count_distinct(ra_exe_unit_)) {
177  max_entry_count = std::min(max_entry_count, baseline_threshold);
178  }
179  if ((!ra_exe_unit_.groupby_exprs.front()->get_type_info().is_string() &&
180  !expr_is_rowid(ra_exe_unit_.groupby_exprs.front().get(), *executor_->catalog_)) &&
181  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count) &&
182  !col_range_info.bucket) {
183  return {QueryDescriptionType::GroupByBaselineHash,
184  col_range_info.min,
185  col_range_info.max,
186  0,
187  col_range_info.has_nulls};
188  }
189  return col_range_info;
190 }
191 
192 ColRangeInfo GroupByAndAggregate::getExprRangeInfo(const Analyzer::Expr* expr) const {
193  if (!expr) {
194  return {QueryDescriptionType::Projection, 0, 0, 0, false};
195  }
196 
197  const auto expr_range = getExpressionRange(
198  expr, query_infos_, executor_, boost::make_optional(ra_exe_unit_.simple_quals));
199  switch (expr_range.getType()) {
200  case ExpressionRangeType::Integer:
201  return {QueryDescriptionType::GroupByPerfectHash,
202  expr_range.getIntMin(),
203  expr_range.getIntMax(),
204  expr_range.getBucket(),
205  expr_range.hasNulls()};
206  case ExpressionRangeType::Float:
207  case ExpressionRangeType::Double:
208  case ExpressionRangeType::Invalid:
209  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
210  default:
211  CHECK(false);
212  }
213  CHECK(false);
214  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
215 }
216 
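// Number of perfect-hash entries implied by a bucketed column range:
// (max - min) / bucket, plus one slot for the boundary value and one more when
// the column has nulls. For example, a range of [0, 99] with bucket size 10
// needs 99 / 10 + 1 = 10 entries, or 11 if nulls are present.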
217 int64_t GroupByAndAggregate::getBucketedCardinality(const ColRangeInfo& col_range_info) {
218  checked_int64_t crt_col_cardinality =
219  checked_int64_t(col_range_info.max) - checked_int64_t(col_range_info.min);
220  if (col_range_info.bucket) {
221  crt_col_cardinality /= col_range_info.bucket;
222  }
223  return static_cast<int64_t>(crt_col_cardinality +
224  (1 + (col_range_info.has_nulls ? 1 : 0)));
225 }
226 
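// Shorthands for the LLVM code generation state shared with the Executor: the
// LLVMContext, the IRBuilder, literal helpers and the row function being built.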
227 #define LL_CONTEXT executor_->cgen_state_->context_
228 #define LL_BUILDER executor_->cgen_state_->ir_builder_
229 #define LL_BOOL(v) executor_->cgen_state_->llBool(v)
230 #define LL_INT(v) executor_->cgen_state_->llInt(v)
231 #define LL_FP(v) executor_->cgen_state_->llFp(v)
232 #define ROW_FUNC executor_->cgen_state_->row_func_
233 
234 GroupByAndAggregate::GroupByAndAggregate(
235  Executor* executor,
236  const ExecutorDeviceType device_type,
237  const RelAlgExecutionUnit& ra_exe_unit,
238  const std::vector<InputTableInfo>& query_infos,
239  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner)
240  : executor_(executor)
241  , ra_exe_unit_(ra_exe_unit)
242  , query_infos_(query_infos)
243  , row_set_mem_owner_(row_set_mem_owner)
244  , device_type_(device_type) {
245  for (const auto groupby_expr : ra_exe_unit_.groupby_exprs) {
246  if (!groupby_expr) {
247  continue;
248  }
249  const auto& groupby_ti = groupby_expr->get_type_info();
250  if (groupby_ti.is_string() && groupby_ti.get_compression() != kENCODING_DICT) {
251  throw std::runtime_error(
252  "Cannot group by string columns which are not dictionary encoded.");
253  }
254  if (groupby_ti.is_array()) {
255  throw std::runtime_error("Group by array not supported");
256  }
257  if (groupby_ti.is_geometry()) {
258  throw std::runtime_error("Group by geometry not supported");
259  }
260  }
261 }
262 
263 int64_t GroupByAndAggregate::getShardedTopBucket(const ColRangeInfo& col_range_info,
264  const size_t shard_count) const {
265  size_t device_count{0};
266  if (device_type_ == ExecutorDeviceType::GPU) {
267  device_count = executor_->getCatalog()->getDataMgr().getCudaMgr()->getDeviceCount();
268  CHECK_GT(device_count, 0u);
269  }
270 
271  int64_t bucket{col_range_info.bucket};
272 
273  if (shard_count) {
274  CHECK(!col_range_info.bucket);
275  /*
276  when a node has fewer devices than shard count,
277  a) In a distributed setup, the minimum distance between two keys would be
278  device_count because shards are stored consecutively across the physical tables, i.e
279  if a shard column has values 0 to 9, and 3 shards on each leaf, then node 1 would
280  have values: 0,1,2,6,7,8 and node 2 would have values: 3,4,5,9. If each leaf node
281  has only 1 device, in this case, all the keys from each node are loaded on the
282  device each.
283 
284  b) In a single node setup, the distance would be minimum of device_count or
285  difference of device_count - shard_count. For example: If a single node server
286  running on 3 devices a shard column has values 0 to 9 in a table with 4 shards,
287  device to fragment keys mapping would be: device 1 - 4,8,3,7 device 2 - 1,5,9 device
288  3 - 2, 6 The bucket value would be 4(shards) - 3(devices) = 1 i.e. minimum of
289  device_count or difference.
290 
291  When a node has device count equal to or more than shard count then the
292  minimum distance is always at least shard_count * no of leaf nodes.
293  */
294  if (device_count < shard_count) {
295  bucket = g_leaf_count ? std::max(device_count, static_cast<size_t>(1))
296  : std::min(device_count, shard_count - device_count);
297  } else {
298  bucket = shard_count * std::max(g_leaf_count, static_cast<size_t>(1));
299  }
300  }
301 
302  return bucket;
303 }
304 
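// Top-level entry for building the QueryMemoryDescriptor: starts from the
// GPU-sort hint and retries with baseline sort whenever the resulting output
// buffer plus its int32 index buffer would exceed 2 GB.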
305 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptor(
306  const bool allow_multifrag,
307  const size_t max_groups_buffer_entry_count,
308  const int8_t crt_min_byte_width,
309  RenderInfo* render_info,
310  const bool output_columnar_hint) {
311  const auto shard_count =
312  device_type_ == ExecutorDeviceType::GPU
313  ? shard_count_for_top_groups(ra_exe_unit_, *executor_->getCatalog())
314  : 0;
315  bool sort_on_gpu_hint =
316  device_type_ == ExecutorDeviceType::GPU && allow_multifrag &&
317  !ra_exe_unit_.sort_info.order_entries.empty() &&
318  gpuCanHandleOrderEntries(ra_exe_unit_.sort_info.order_entries);
319  // must_use_baseline_sort is true iff we'd sort on GPU with the old algorithm
320  // but the total output buffer size would be too big or it's a sharded top query.
321  // For the sake of managing risk, use the new result set way very selectively for
322  // this case only (alongside the baseline layout we've enabled for a while now).
323  bool must_use_baseline_sort = shard_count;
324  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
325  while (true) {
326  query_mem_desc = initQueryMemoryDescriptorImpl(allow_multifrag,
327  max_groups_buffer_entry_count,
328  crt_min_byte_width,
329  sort_on_gpu_hint,
330  render_info,
331  must_use_baseline_sort,
332  output_columnar_hint);
333  CHECK(query_mem_desc);
334  if (query_mem_desc->sortOnGpu() &&
335  (query_mem_desc->getBufferSizeBytes(device_type_) +
336  align_to_int64(query_mem_desc->getEntryCount() * sizeof(int32_t))) >
337  2 * 1024 * 1024 * 1024L) {
338  must_use_baseline_sort = true;
339  sort_on_gpu_hint = false;
340  } else {
341  break;
342  }
343  }
344  return query_mem_desc;
345 }
346 
347 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptorImpl(
348  const bool allow_multifrag,
349  const size_t max_groups_buffer_entry_count,
350  const int8_t crt_min_byte_width,
351  const bool sort_on_gpu_hint,
352  RenderInfo* render_info,
353  const bool must_use_baseline_sort,
354  const bool output_columnar_hint) {
356 
357  const auto count_distinct_descriptors = initCountDistinctDescriptors();
358 
359  auto group_col_widths = get_col_byte_widths(ra_exe_unit_.groupby_exprs, {});
360 
361  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
362 
363  auto col_range_info_nosharding = getColRangeInfo();
364 
365  const auto shard_count =
366  device_type_ == ExecutorDeviceType::GPU
367  ? shard_count_for_top_groups(ra_exe_unit_, *executor_->getCatalog())
368  : 0;
369 
370  const auto col_range_info =
371  ColRangeInfo{col_range_info_nosharding.hash_type_,
372  col_range_info_nosharding.min,
373  col_range_info_nosharding.max,
374  getShardedTopBucket(col_range_info_nosharding, shard_count),
375  col_range_info_nosharding.has_nulls};
376 
377  // Non-grouped aggregates do not support accessing aggregated ranges
378  const auto keyless_info = !is_group_by
379  ? KeylessInfo{false, -1, false}
380  : getKeylessInfo(ra_exe_unit_.target_exprs, is_group_by);
381 
382  if (g_enable_watchdog &&
383  ((col_range_info.hash_type_ == QueryDescriptionType::GroupByBaselineHash &&
384  max_groups_buffer_entry_count > 120000000) ||
385  (col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
386  ra_exe_unit_.groupby_exprs.size() == 1 &&
387  (col_range_info.max - col_range_info.min) /
388  std::max(col_range_info.bucket, int64_t(1)) >
389  130000000))) {
390  throw WatchdogException("Query would use too much memory");
391  }
392  return QueryMemoryDescriptor::init(executor_,
393  ra_exe_unit_,
394  query_infos_,
395  col_range_info,
396  keyless_info,
397  allow_multifrag,
398  device_type_,
399  crt_min_byte_width,
400  sort_on_gpu_hint,
401  shard_count,
402  max_groups_buffer_entry_count,
403  render_info,
404  count_distinct_descriptors,
405  must_use_baseline_sort,
406  output_columnar_hint);
407 }
408 
409 void GroupByAndAggregate::addTransientStringLiterals() {
410  addTransientStringLiterals(ra_exe_unit_, executor_, row_set_mem_owner_);
411 }
412 
413 namespace {
414 
415 void add_transient_string_literals_for_expression(
416  const Analyzer::Expr* expr,
417  Executor* executor,
418  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner) {
419  if (!expr) {
420  return;
421  }
422 
423  const auto array_expr = dynamic_cast<const Analyzer::ArrayExpr*>(expr);
424  if (array_expr) {
425  for (size_t i = 0; i < array_expr->getElementCount(); i++) {
426  add_transient_string_literals_for_expression(
427  array_expr->getElement(i), executor, row_set_mem_owner);
428  }
429  return;
430  }
431 
432  const auto cast_expr = dynamic_cast<const Analyzer::UOper*>(expr);
433  const auto& expr_ti = expr->get_type_info();
434  if (cast_expr && cast_expr->get_optype() == kCAST && expr_ti.is_string()) {
435  CHECK_EQ(kENCODING_DICT, expr_ti.get_compression());
436  auto sdp = executor->getStringDictionaryProxy(
437  expr_ti.get_comp_param(), row_set_mem_owner, true);
438  CHECK(sdp);
439  const auto str_lit_expr =
440  dynamic_cast<const Analyzer::Constant*>(cast_expr->get_operand());
441  if (str_lit_expr && str_lit_expr->get_constval().stringval) {
442  sdp->getOrAddTransient(*str_lit_expr->get_constval().stringval);
443  }
444  return;
445  }
446  const auto case_expr = dynamic_cast<const Analyzer::CaseExpr*>(expr);
447  if (!case_expr) {
448  return;
449  }
450  Analyzer::DomainSet domain_set;
451  case_expr->get_domain(domain_set);
452  if (domain_set.empty()) {
453  return;
454  }
455  if (expr_ti.is_string()) {
456  CHECK_EQ(kENCODING_DICT, expr_ti.get_compression());
457  auto sdp = executor->getStringDictionaryProxy(
458  expr_ti.get_comp_param(), row_set_mem_owner, true);
459  CHECK(sdp);
460  for (const auto domain_expr : domain_set) {
461  const auto cast_expr = dynamic_cast<const Analyzer::UOper*>(domain_expr);
462  const auto str_lit_expr =
463  cast_expr && cast_expr->get_optype() == kCAST
464  ? dynamic_cast<const Analyzer::Constant*>(cast_expr->get_operand())
465  : dynamic_cast<const Analyzer::Constant*>(domain_expr);
466  if (str_lit_expr && str_lit_expr->get_constval().stringval) {
467  sdp->getOrAddTransient(*str_lit_expr->get_constval().stringval);
468  }
469  }
470  }
471 }
472 
473 } // namespace
474 
475 void GroupByAndAggregate::addTransientStringLiterals(
476  const RelAlgExecutionUnit& ra_exe_unit,
477  Executor* executor,
478  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner) {
479  for (const auto group_expr : ra_exe_unit.groupby_exprs) {
480  add_transient_string_literals_for_expression(
481  group_expr.get(), executor, row_set_mem_owner);
482  }
483  for (const auto target_expr : ra_exe_unit.target_exprs) {
484  const auto& target_type = target_expr->get_type_info();
485  if (target_type.is_string() && target_type.get_compression() != kENCODING_DICT) {
486  continue;
487  }
488  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
489  if (agg_expr) {
490  if (agg_expr->get_aggtype() == kSAMPLE) {
491  add_transient_string_literals_for_expression(
492  agg_expr->get_arg(), executor, row_set_mem_owner);
493  }
494  } else {
495  add_transient_string_literals_for_expression(
496  target_expr, executor, row_set_mem_owner);
497  }
498  }
499  row_set_mem_owner->addLiteralStringDictProxy(executor->lit_str_dict_proxy_);
500 }
501 
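// Chooses a COUNT(DISTINCT) implementation per target expression: a fixed-size
// bitmap when the argument has a usable integer range (for APPROX_COUNT_DISTINCT
// the bitmap holds the HyperLogLog registers instead), and an ordered std::set
// fallback otherwise; when the watchdog is enabled that fallback is rejected.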
502 CountDistinctDescriptors GroupByAndAggregate::initCountDistinctDescriptors() {
503  CountDistinctDescriptors count_distinct_descriptors;
504  for (const auto target_expr : ra_exe_unit_.target_exprs) {
505  auto agg_info = get_target_info(target_expr, g_bigint_count);
506  if (is_distinct_target(agg_info)) {
507  CHECK(agg_info.is_agg);
508  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
509  const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
510  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
511  if (arg_ti.is_string() && arg_ti.get_compression() != kENCODING_DICT) {
512  throw std::runtime_error(
513  "Strings must be dictionary-encoded for COUNT(DISTINCT).");
514  }
515  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_array()) {
516  throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
517  }
518  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
519  throw std::runtime_error(
520  "APPROX_COUNT_DISTINCT on geometry columns not supported");
521  }
522  if (agg_info.is_distinct && arg_ti.is_geometry()) {
523  throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
524  }
525  ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
526  auto arg_range_info =
527  arg_ti.is_fp() ? no_range_info : getExprRangeInfo(agg_expr->get_arg());
528  CountDistinctImplType count_distinct_impl_type{CountDistinctImplType::StdSet};
529  int64_t bitmap_sz_bits{0};
530  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
531  const auto error_rate = agg_expr->get_error_rate();
532  if (error_rate) {
533  CHECK(error_rate->get_type_info().get_type() == kINT);
534  CHECK_GE(error_rate->get_constval().intval, 1);
535  bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
536  } else {
537  bitmap_sz_bits = g_hll_precision_bits;
538  }
539  }
540  if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
541  !(arg_ti.is_array() || arg_ti.is_geometry())) { // TODO(alex): allow bitmap
542  // implementation for arrays
543  if (arg_range_info.isEmpty()) {
544  count_distinct_descriptors.emplace_back(
545  CountDistinctDescriptor{CountDistinctImplType::Bitmap,
546  0,
547  64,
548  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
549  device_type_,
550  1});
551  continue;
552  }
553  count_distinct_impl_type = CountDistinctImplType::Bitmap;
554  if (agg_info.agg_kind == kCOUNT) {
555  bitmap_sz_bits = arg_range_info.max - arg_range_info.min + 1;
556  const int64_t MAX_BITMAP_BITS{8 * 1000 * 1000 * 1000L};
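 // e.g. a column range spanning 1,000,000 values gets a 1,000,000-bit
 // (125 KB) bitmap per group; ranges beyond MAX_BITMAP_BITS fall back to
 // the std::set implementation.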
557  if (bitmap_sz_bits <= 0 || bitmap_sz_bits > MAX_BITMAP_BITS) {
558  count_distinct_impl_type = CountDistinctImplType::StdSet;
559  }
560  }
561  }
562  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
563  count_distinct_impl_type == CountDistinctImplType::StdSet &&
564  !(arg_ti.is_array() || arg_ti.is_geometry())) {
565  count_distinct_impl_type = CountDistinctImplType::Bitmap;
566  }
567  if (g_enable_watchdog &&
568  count_distinct_impl_type == CountDistinctImplType::StdSet) {
569  throw WatchdogException("Cannot use a fast path for COUNT distinct");
570  }
571  const auto sub_bitmap_count =
572  get_count_distinct_sub_bitmap_count(bitmap_sz_bits, ra_exe_unit_, device_type_);
573  count_distinct_descriptors.emplace_back(
574  CountDistinctDescriptor{count_distinct_impl_type,
575  arg_range_info.min,
576  bitmap_sz_bits,
577  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
578  device_type_,
579  sub_bitmap_count});
580  } else {
581  count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
582  CountDistinctImplType::Invalid, 0, 0, false, device_type_, 0});
583  }
584  }
585  return count_distinct_descriptors;
586 }
587 
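// Determines whether the "keyless" fast path can be used, where no group key is
// stored and one aggregate column whose initial value can never be produced by
// real input doubles as the empty-bin marker; `index` is the slot of that
// aggregate and the last member reports GPU shared-memory eligibility.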
588 KeylessInfo GroupByAndAggregate::getKeylessInfo(
589  const std::vector<Analyzer::Expr*>& target_expr_list,
590  const bool is_group_by) const {
591  bool keyless{true}, found{false}, shared_mem_support{false},
592  shared_mem_valid_data_type{true};
593  /* Shared memory usage is currently supported only for a limited subset of aggregate
594  * operations; shared_mem_support and shared_mem_valid_data_type track whether
595  * that support applies to the current query. */
596  int32_t num_agg_expr{0}; // used for shared memory support on the GPU
597  int32_t index{0};
598  for (const auto target_expr : target_expr_list) {
599  const auto agg_info = get_target_info(target_expr, g_bigint_count);
600  const auto chosen_type = get_compact_type(agg_info);
601  // TODO(Saman): should be eventually removed, once I make sure what data types can
602  // be used in this shared memory setting.
603 
604  shared_mem_valid_data_type =
605  shared_mem_valid_data_type && supportedTypeForGpuSharedMemUsage(chosen_type);
606 
607  if (agg_info.is_agg) {
608  num_agg_expr++;
609  }
610  if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
611  auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
612  CHECK(agg_expr);
613  const auto arg_expr = agg_arg(target_expr);
614  const bool float_argument_input = takes_float_argument(agg_info);
615  switch (agg_info.agg_kind) {
616  case kAVG:
617  ++index;
618  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
619  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
620  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
621  expr_range_info.hasNulls()) {
622  break;
623  }
624  }
625  found = true;
626  break;
627  case kCOUNT:
628  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
629  const auto& arg_ti = arg_expr->get_type_info();
630  if (arg_ti.is_string() && arg_ti.get_compression() == kENCODING_NONE) {
631  break;
632  }
633  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
634  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
635  expr_range_info.hasNulls()) {
636  break;
637  }
638  }
639  found = true;
640  if (!agg_info.skip_null_val) {
641  shared_mem_support = true; // currently just support 8 bytes per group
642  }
643  break;
644  case kSUM: {
645  auto arg_ti = arg_expr->get_type_info();
646  if (constrained_not_null(arg_expr, ra_exe_unit_.quals)) {
647  arg_ti.set_notnull(true);
648  }
649  if (!arg_ti.get_notnull()) {
650  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
651  if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
652  !expr_range_info.hasNulls()) {
653  found = true;
654  }
655  } else {
656  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
657  switch (expr_range_info.getType()) {
658  case ExpressionRangeType::Float:
659  case ExpressionRangeType::Double:
660  if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
661  found = true;
662  }
663  break;
664  case ExpressionRangeType::Integer:
665  if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
666  found = true;
667  }
668  break;
669  default:
670  break;
671  }
672  }
673  break;
674  }
675  case kMIN: {
676  CHECK(agg_expr && agg_expr->get_arg());
677  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
678  if (arg_ti.is_string() || arg_ti.is_array()) {
679  break;
680  }
681  auto expr_range_info =
682  getExpressionRange(agg_expr->get_arg(), query_infos_, executor_);
683  auto init_max = get_agg_initial_val(agg_info.agg_kind,
684  chosen_type,
685  is_group_by || float_argument_input,
686  float_argument_input ? sizeof(float) : 8);
687  switch (expr_range_info.getType()) {
688  case ExpressionRangeType::Float:
689  case ExpressionRangeType::Double: {
690  auto double_max =
691  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
692  if (expr_range_info.getFpMax() < double_max) {
693  found = true;
694  }
695  break;
696  }
697  case ExpressionRangeType::Integer:
698  if (expr_range_info.getIntMax() < init_max) {
699  found = true;
700  }
701  break;
702  default:
703  break;
704  }
705  break;
706  }
707  case kMAX: {
708  CHECK(agg_expr && agg_expr->get_arg());
709  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
710  if (arg_ti.is_string() || arg_ti.is_array()) {
711  break;
712  }
713  auto expr_range_info =
714  getExpressionRange(agg_expr->get_arg(), query_infos_, executor_);
715  auto init_min = get_agg_initial_val(agg_info.agg_kind,
716  chosen_type,
717  is_group_by || float_argument_input,
718  float_argument_input ? sizeof(float) : 8);
719  switch (expr_range_info.getType()) {
720  case ExpressionRangeType::Float:
721  case ExpressionRangeType::Double: {
722  auto double_min =
723  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
724  if (expr_range_info.getFpMin() > double_min) {
725  found = true;
726  }
727  break;
728  }
729  case ExpressionRangeType::Integer:
730  if (expr_range_info.getIntMin() > init_min) {
731  found = true;
732  }
733  break;
734  default:
735  break;
736  }
737  break;
738  }
739  default:
740  keyless = false;
741  break;
742  }
743  }
744  if (!keyless) {
745  break;
746  }
747  if (!found) {
748  ++index;
749  }
750  }
751 
752  // shouldn't use keyless for projection only
758  return {keyless && found,
759  index,
760  ((num_agg_expr == 1) && (target_expr_list.size() <= 2))
761  ? shared_mem_support && shared_mem_valid_data_type
762  : false};
763 }
764 
769 bool GroupByAndAggregate::supportedTypeForGpuSharedMemUsage(
770  const SQLTypeInfo& target_type_info) const {
771  bool result = false;
772  switch (target_type_info.get_type()) {
773  case SQLTypes::kTINYINT:
774  case SQLTypes::kSMALLINT:
775  case SQLTypes::kINT:
776  result = true;
777  break;
778  case SQLTypes::kTEXT:
779  if (target_type_info.get_compression() == EncodingType::kENCODING_DICT) {
780  result = true;
781  }
782  break;
783  default:
784  break;
785  }
786  return result;
787 }
788 
789 // TODO(Saman): this function is temporary and all these limitations should eventually
790 // be removed.
792  /*
793  UNNEST operations follow a slightly different internal memory layout compared to other
794  keyless aggregates Currently, we opt out of using shared memory if there is any UNNEST
795  operation involved.
796  */
797  if (dynamic_cast<Analyzer::UOper*>(expr) &&
798  static_cast<Analyzer::UOper*>(expr)->get_optype() == kUNNEST) {
799  return false;
800  }
801  return true;
802 }
803 
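// Whether the ORDER BY clause can be evaluated on the GPU: only a single order
// entry over an integer-typed aggregate target is allowed, and DISTINCT, AVG,
// MIN, MAX and APPROX_COUNT_DISTINCT targets, floating-point arguments, and
// certain NULLS FIRST/LAST placements (when the range can contain nulls) are
// rejected.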
804 bool GroupByAndAggregate::gpuCanHandleOrderEntries(
805  const std::list<Analyzer::OrderEntry>& order_entries) {
806  if (order_entries.size() > 1) { // TODO(alex): lift this restriction
807  return false;
808  }
809  for (const auto order_entry : order_entries) {
810  CHECK_GE(order_entry.tle_no, 1);
811  CHECK_LE(static_cast<size_t>(order_entry.tle_no), ra_exe_unit_.target_exprs.size());
812  const auto target_expr = ra_exe_unit_.target_exprs[order_entry.tle_no - 1];
813  if (!dynamic_cast<Analyzer::AggExpr*>(target_expr)) {
814  return false;
815  }
816  // TODO(alex): relax the restrictions
817  auto agg_expr = static_cast<Analyzer::AggExpr*>(target_expr);
818  if (agg_expr->get_is_distinct() || agg_expr->get_aggtype() == kAVG ||
819  agg_expr->get_aggtype() == kMIN || agg_expr->get_aggtype() == kMAX ||
820  agg_expr->get_aggtype() == kAPPROX_COUNT_DISTINCT) {
821  return false;
822  }
823  if (agg_expr->get_arg()) {
824  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
825  if (arg_ti.is_fp()) {
826  return false;
827  }
828  auto expr_range_info = getExprRangeInfo(agg_expr->get_arg());
829  // TODO(adb): QMD not actually initialized here?
830  if ((!(expr_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
831  /* query_mem_desc.getGroupbyColCount() == 1 */ false) ||
832  expr_range_info.has_nulls) &&
833  order_entry.is_desc == order_entry.nulls_first) {
834  return false;
835  }
836  }
837  const auto& target_ti = target_expr->get_type_info();
838  CHECK(!target_ti.is_array());
839  if (!target_ti.is_integer()) {
840  return false;
841  }
842  }
843  return true;
844 }
845 
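// DiamondCodegen emits the standard if/else "diamond" control flow: a
// conditional branch into a freshly created <label>_true block (left as the
// current insert point for the caller) and a <label>_false block that is either
// newly created or shared with a parent diamond; the destructor closes the
// diamond by branching to the false edge and restoring the insert point.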
846 GroupByAndAggregate::DiamondCodegen::DiamondCodegen(
847  llvm::Value* cond,
848  Executor* executor,
849  const bool chain_to_next,
850  const std::string& label_prefix,
851  DiamondCodegen* parent,
852  const bool share_false_edge_with_parent)
853  : executor_(executor), chain_to_next_(chain_to_next), parent_(parent) {
854  if (parent_) {
856  }
857  cond_true_ = llvm::BasicBlock::Create(LL_CONTEXT, label_prefix + "_true", ROW_FUNC);
858  if (share_false_edge_with_parent) {
859  CHECK(parent);
860  orig_cond_false_ = cond_false_ = parent_->cond_false_;
861  } else {
862  cond_false_ = orig_cond_false_ =
863  llvm::BasicBlock::Create(LL_CONTEXT, label_prefix + "_false", ROW_FUNC);
864  }
865 
866  LL_BUILDER.CreateCondBr(cond, cond_true_, cond_false_);
867  LL_BUILDER.SetInsertPoint(cond_true_);
868 }
869 
870 void GroupByAndAggregate::DiamondCodegen::setChainToNext() {
871  CHECK(!parent_);
872  chain_to_next_ = true;
873 }
874 
875 void GroupByAndAggregate::DiamondCodegen::setFalseTarget(llvm::BasicBlock* cond_false) {
877  cond_false_ = cond_false;
878 }
879 
880 GroupByAndAggregate::DiamondCodegen::~DiamondCodegen() {
881  if (parent_) {
882  LL_BUILDER.CreateBr(parent_->cond_false_);
883  } else if (chain_to_next_) {
884  LL_BUILDER.CreateBr(cond_false_);
885  }
887  LL_BUILDER.SetInsertPoint(orig_cond_false_);
888  }
889 }
890 
891 bool GroupByAndAggregate::codegen(llvm::Value* filter_result,
892  llvm::BasicBlock* sc_false,
893  const QueryMemoryDescriptor& query_mem_desc,
894  const CompilationOptions& co) {
895  CHECK(filter_result);
896 
897  bool can_return_error = false;
898  llvm::BasicBlock* filter_false{nullptr};
899 
900  {
901  const bool is_group_by = !ra_exe_unit_.groupby_exprs.empty();
902 
903  if (executor_->isArchMaxwell(co.device_type_)) {
905  }
906  DiamondCodegen filter_cfg(filter_result,
907  executor_,
908  !is_group_by || query_mem_desc.usesGetGroupValueFast(),
909  "filter",
910  nullptr,
911  false);
912  filter_false = filter_cfg.cond_false_;
913 
914  if (is_group_by) {
915  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection &&
916  !use_streaming_top_n(ra_exe_unit_, query_mem_desc.didOutputColumnar())) {
917  const auto crt_matched = get_arg_by_name(ROW_FUNC, "crt_matched");
918  LL_BUILDER.CreateStore(LL_INT(int32_t(1)), crt_matched);
919  auto total_matched_ptr = get_arg_by_name(ROW_FUNC, "total_matched");
920  llvm::Value* old_total_matched_val{nullptr};
922  old_total_matched_val =
923  LL_BUILDER.CreateAtomicRMW(llvm::AtomicRMWInst::Add,
924  total_matched_ptr,
925  LL_INT(int32_t(1)),
926  llvm::AtomicOrdering::Monotonic);
927  } else {
928  old_total_matched_val = LL_BUILDER.CreateLoad(total_matched_ptr);
929  LL_BUILDER.CreateStore(
930  LL_BUILDER.CreateAdd(old_total_matched_val, LL_INT(int32_t(1))),
931  total_matched_ptr);
932  }
933  auto old_total_matched_ptr = get_arg_by_name(ROW_FUNC, "old_total_matched");
934  LL_BUILDER.CreateStore(old_total_matched_val, old_total_matched_ptr);
935  }
936 
937  auto agg_out_ptr_w_idx = codegenGroupBy(query_mem_desc, co, filter_cfg);
938  if (query_mem_desc.usesGetGroupValueFast() ||
939  query_mem_desc.getQueryDescriptionType() ==
940  QueryDescriptionType::GroupByPerfectHash) {
941  if (query_mem_desc.getGroupbyColCount() > 1) {
942  filter_cfg.setChainToNext();
943  }
944  // Don't generate null checks if the group slot is guaranteed to be non-null,
945  // as is the case for the get_group_value_fast* family.
946  can_return_error =
947  codegenAggCalls(agg_out_ptr_w_idx, {}, query_mem_desc, co, filter_cfg);
948  } else {
949  {
950  llvm::Value* nullcheck_cond{nullptr};
951  if (query_mem_desc.didOutputColumnar()) {
952  nullcheck_cond = LL_BUILDER.CreateICmpSGE(std::get<1>(agg_out_ptr_w_idx),
953  LL_INT(int32_t(0)));
954  } else {
955  nullcheck_cond = LL_BUILDER.CreateICmpNE(
956  std::get<0>(agg_out_ptr_w_idx),
957  llvm::ConstantPointerNull::get(
958  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)));
959  }
960  DiamondCodegen nullcheck_cfg(
961  nullcheck_cond, executor_, false, "groupby_nullcheck", &filter_cfg, false);
962  codegenAggCalls(agg_out_ptr_w_idx, {}, query_mem_desc, co, filter_cfg);
963  }
964  can_return_error = true;
965  if (query_mem_desc.getQueryDescriptionType() ==
968  // Ignore rejection on pushing current row to top-K heap.
969  LL_BUILDER.CreateRet(LL_INT(int32_t(0)));
970  } else {
971  CodeGenerator code_generator(executor_);
972  LL_BUILDER.CreateRet(LL_BUILDER.CreateNeg(LL_BUILDER.CreateTrunc(
973  // TODO(alex): remove the trunc once pos is converted to 32 bits
974  code_generator.posArg(nullptr),
975  get_int_type(32, LL_CONTEXT))));
976  }
977  }
978  } else {
979  if (ra_exe_unit_.estimator) {
980  std::stack<llvm::BasicBlock*> array_loops;
981  codegenEstimator(array_loops, filter_cfg, query_mem_desc, co);
982  } else {
983  auto arg_it = ROW_FUNC->arg_begin();
984  std::vector<llvm::Value*> agg_out_vec;
985  for (int32_t i = 0; i < get_agg_count(ra_exe_unit_.target_exprs); ++i) {
986  agg_out_vec.push_back(&*arg_it++);
987  }
988  can_return_error = codegenAggCalls(std::make_tuple(nullptr, nullptr),
989  agg_out_vec,
990  query_mem_desc,
991  co,
992  filter_cfg);
993  }
994  }
995  }
996 
997  if (ra_exe_unit_.join_quals.empty()) {
998  executor_->cgen_state_->ir_builder_.CreateRet(LL_INT(int32_t(0)));
999  } else if (sc_false) {
1000  const auto saved_insert_block = LL_BUILDER.GetInsertBlock();
1001  LL_BUILDER.SetInsertPoint(sc_false);
1002  LL_BUILDER.CreateBr(filter_false);
1003  LL_BUILDER.SetInsertPoint(saved_insert_block);
1004  }
1005 
1006  return can_return_error;
1007 }
1008 
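// Computes the output slot for projection queries: either a bin in the
// streaming top-n heap (get_bin_from_k_heap) or the next free output row
// derived from the running total-matched counter (get_scan_output_slot /
// get_columnar_scan_output_offset).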
1009 llvm::Value* GroupByAndAggregate::codegenOutputSlot(
1010  llvm::Value* groups_buffer,
1011  const QueryMemoryDescriptor& query_mem_desc,
1012  const CompilationOptions& co,
1013  DiamondCodegen& diamond_codegen) {
1015  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1016  const auto group_expr = ra_exe_unit_.groupby_exprs.front();
1017  CHECK(!group_expr);
1018  if (!query_mem_desc.didOutputColumnar()) {
1019  CHECK_EQ(size_t(0), query_mem_desc.getRowSize() % sizeof(int64_t));
1020  }
1021  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1022  ? 0
1023  : query_mem_desc.getRowSize() / sizeof(int64_t);
1024  CodeGenerator code_generator(executor_);
1025  if (use_streaming_top_n(ra_exe_unit_, query_mem_desc.didOutputColumnar())) {
1026  const auto& only_order_entry = ra_exe_unit_.sort_info.order_entries.front();
1027  CHECK_GE(only_order_entry.tle_no, int(1));
1028  const size_t target_idx = only_order_entry.tle_no - 1;
1029  CHECK_LT(target_idx, ra_exe_unit_.target_exprs.size());
1030  const auto order_entry_expr = ra_exe_unit_.target_exprs[target_idx];
1031  const auto chosen_bytes =
1032  static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(target_idx));
1033  auto order_entry_lv = executor_->cgen_state_->castToTypeIn(
1034  code_generator.codegen(order_entry_expr, true, co).front(), chosen_bytes * 8);
1036  std::string fname = "get_bin_from_k_heap";
1037  const auto& oe_ti = order_entry_expr->get_type_info();
1038  llvm::Value* null_key_lv = nullptr;
1039  if (oe_ti.is_integer() || oe_ti.is_decimal() || oe_ti.is_time()) {
1040  const size_t bit_width = order_entry_lv->getType()->getIntegerBitWidth();
1041  switch (bit_width) {
1042  case 32:
1043  null_key_lv = LL_INT(static_cast<int32_t>(inline_int_null_val(oe_ti)));
1044  break;
1045  case 64:
1046  null_key_lv = LL_INT(static_cast<int64_t>(inline_int_null_val(oe_ti)));
1047  break;
1048  default:
1049  CHECK(false);
1050  }
1051  fname += "_int" + std::to_string(bit_width) + "_t";
1052  } else {
1053  CHECK(oe_ti.is_fp());
1054  if (order_entry_lv->getType()->isDoubleTy()) {
1055  null_key_lv = LL_FP(static_cast<double>(inline_fp_null_val(oe_ti)));
1056  } else {
1057  null_key_lv = LL_FP(static_cast<float>(inline_fp_null_val(oe_ti)));
1058  }
1059  fname += order_entry_lv->getType()->isDoubleTy() ? "_double" : "_float";
1060  }
1061  const auto key_slot_idx =
1063  return emitCall(
1064  fname,
1065  {groups_buffer,
1066  LL_INT(n),
1067  LL_INT(row_size_quad),
1068  LL_INT(static_cast<uint32_t>(query_mem_desc.getColOffInBytes(key_slot_idx))),
1069  LL_BOOL(only_order_entry.is_desc),
1070  LL_BOOL(!order_entry_expr->get_type_info().get_notnull()),
1071  LL_BOOL(only_order_entry.nulls_first),
1072  null_key_lv,
1073  order_entry_lv});
1074  } else {
1075  llvm::Value* output_buffer_entry_count_lv{nullptr};
1077  output_buffer_entry_count_lv =
1078  LL_BUILDER.CreateLoad(get_arg_by_name(ROW_FUNC, "max_matched"));
1079  CHECK(output_buffer_entry_count_lv);
1080  }
1081  const auto group_expr_lv =
1082  LL_BUILDER.CreateLoad(get_arg_by_name(ROW_FUNC, "old_total_matched"));
1083  std::vector<llvm::Value*> args{
1084  groups_buffer,
1085  output_buffer_entry_count_lv
1086  ? output_buffer_entry_count_lv
1087  : LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1088  group_expr_lv,
1089  code_generator.posArg(nullptr)};
1090  if (query_mem_desc.didOutputColumnar()) {
1091  const auto columnar_output_offset =
1092  emitCall("get_columnar_scan_output_offset", args);
1093  return columnar_output_offset;
1094  }
1095  args.push_back(LL_INT(row_size_quad));
1096  return emitCall("get_scan_output_slot", args);
1097  }
1098 }
1099 
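// Generates the group-key lookup: projection queries go through
// codegenOutputSlot(), single-column perfect hash uses the get_group_value_fast
// family, multi-column perfect hash indexes with a generated perfect_key_hash
// function, and baseline hash probes the table via get_group_value.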
1100 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenGroupBy(
1101  const QueryMemoryDescriptor& query_mem_desc,
1102  const CompilationOptions& co,
1103  DiamondCodegen& diamond_codegen) {
1104  auto arg_it = ROW_FUNC->arg_begin();
1105  auto groups_buffer = arg_it++;
1106 
1107  std::stack<llvm::BasicBlock*> array_loops;
1108 
1109  // TODO(Saman): move this logic outside of this function.
1110  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1111  if (query_mem_desc.didOutputColumnar()) {
1112  return std::make_tuple(
1113  &*groups_buffer,
1114  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen));
1115  } else {
1116  return std::make_tuple(
1117  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen),
1118  nullptr);
1119  }
1120  }
1121 
1122  CHECK(query_mem_desc.getQueryDescriptionType() ==
1123  QueryDescriptionType::GroupByBaselineHash ||
1124  query_mem_desc.getQueryDescriptionType() ==
1125  QueryDescriptionType::GroupByPerfectHash);
1126 
1127  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1128  ? 0
1129  : query_mem_desc.getRowSize() / sizeof(int64_t);
1130 
1131  const auto col_width_size = query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1132  ? sizeof(int64_t)
1133  : query_mem_desc.getEffectiveKeyWidth();
1134  // for multi-column group by
1135  llvm::Value* group_key = nullptr;
1136  llvm::Value* key_size_lv = nullptr;
1137 
1138  if (!query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1139  key_size_lv = LL_INT(static_cast<int32_t>(query_mem_desc.groupColWidthsSize()));
1140  if (query_mem_desc.getQueryDescriptionType() ==
1141  QueryDescriptionType::GroupByPerfectHash) {
1142  group_key =
1143  LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1144  } else if (query_mem_desc.getQueryDescriptionType() ==
1145  QueryDescriptionType::GroupByBaselineHash) {
1146  group_key =
1147  col_width_size == sizeof(int32_t)
1148  ? LL_BUILDER.CreateAlloca(llvm::Type::getInt32Ty(LL_CONTEXT), key_size_lv)
1149  : LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1150  }
1151  CHECK(group_key);
1152  CHECK(key_size_lv);
1153  }
1154 
1155  int32_t subkey_idx = 0;
1156  CHECK(query_mem_desc.getGroupbyColCount() == ra_exe_unit_.groupby_exprs.size());
1157  for (const auto group_expr : ra_exe_unit_.groupby_exprs) {
1158  const auto col_range_info = getExprRangeInfo(group_expr.get());
1159  const auto translated_null_value = static_cast<int64_t>(
1160  query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1161  ? checked_int64_t(query_mem_desc.getMaxVal()) +
1162  (query_mem_desc.getBucket() ? query_mem_desc.getBucket() : 1)
1163  : checked_int64_t(col_range_info.max) +
1164  (col_range_info.bucket ? col_range_info.bucket : 1));
1165 
1166  const bool col_has_nulls =
1167  query_mem_desc.getQueryDescriptionType() ==
1168  QueryDescriptionType::GroupByPerfectHash
1169  ? (query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1170  ? query_mem_desc.hasNulls()
1171  : col_range_info.has_nulls)
1172  : false;
1173 
1174  const auto group_expr_lvs =
1175  executor_->groupByColumnCodegen(group_expr.get(),
1176  col_width_size,
1177  co,
1178  col_has_nulls,
1179  translated_null_value,
1180  diamond_codegen,
1181  array_loops,
1182  query_mem_desc.threadsShareMemory());
1183  const auto group_expr_lv = group_expr_lvs.translated_value;
1184  if (query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1185  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1186  return codegenSingleColumnPerfectHash(query_mem_desc,
1187  co,
1188  &*groups_buffer,
1189  group_expr_lv,
1190  group_expr_lvs.original_value,
1191  row_size_quad);
1192  } else {
1193  // store the sub-key to the buffer
1194  LL_BUILDER.CreateStore(group_expr_lv,
1195  LL_BUILDER.CreateGEP(group_key, LL_INT(subkey_idx++)));
1196  }
1197  }
1198  if (query_mem_desc.getQueryDescriptionType() ==
1199  QueryDescriptionType::GroupByPerfectHash) {
1200  CHECK(ra_exe_unit_.groupby_exprs.size() != 1);
1201  return codegenMultiColumnPerfectHash(
1202  &*groups_buffer, group_key, key_size_lv, query_mem_desc, row_size_quad);
1203  } else if (query_mem_desc.getQueryDescriptionType() ==
1204  QueryDescriptionType::GroupByBaselineHash) {
1205  return codegenMultiColumnBaselineHash(co,
1206  &*groups_buffer,
1207  group_key,
1208  key_size_lv,
1209  query_mem_desc,
1210  col_width_size,
1211  row_size_quad);
1212  }
1213  CHECK(false);
1214  return std::make_tuple(nullptr, nullptr);
1215 }
1216 
1217 std::tuple<llvm::Value*, llvm::Value*>
1218 GroupByAndAggregate::codegenSingleColumnPerfectHash(
1219  const QueryMemoryDescriptor& query_mem_desc,
1220  const CompilationOptions& co,
1221  llvm::Value* groups_buffer,
1222  llvm::Value* group_expr_lv_translated,
1223  llvm::Value* group_expr_lv_original,
1224  const int32_t row_size_quad) {
1225  CHECK(query_mem_desc.usesGetGroupValueFast());
1226  std::string get_group_fn_name{query_mem_desc.didOutputColumnar()
1227  ? "get_columnar_group_bin_offset"
1228  : "get_group_value_fast"};
1229  if (!query_mem_desc.didOutputColumnar() && query_mem_desc.hasKeylessHash()) {
1230  get_group_fn_name += "_keyless";
1231  }
1232  if (query_mem_desc.interleavedBins(co.device_type_)) {
1233  CHECK(!query_mem_desc.didOutputColumnar());
1234  CHECK(query_mem_desc.hasKeylessHash());
1235  get_group_fn_name += "_semiprivate";
1236  }
1237  std::vector<llvm::Value*> get_group_fn_args{&*groups_buffer,
1238  &*group_expr_lv_translated};
1239  if (group_expr_lv_original && get_group_fn_name == "get_group_value_fast" &&
1240  query_mem_desc.mustUseBaselineSort()) {
1241  get_group_fn_name += "_with_original_key";
1242  get_group_fn_args.push_back(group_expr_lv_original);
1243  }
1244  get_group_fn_args.push_back(LL_INT(query_mem_desc.getMinVal()));
1245  get_group_fn_args.push_back(LL_INT(query_mem_desc.getBucket()));
1246  if (!query_mem_desc.hasKeylessHash()) {
1247  if (!query_mem_desc.didOutputColumnar()) {
1248  get_group_fn_args.push_back(LL_INT(row_size_quad));
1249  }
1250  } else {
1251  if (!query_mem_desc.didOutputColumnar()) {
1252  get_group_fn_args.push_back(LL_INT(row_size_quad));
1253  }
1254  if (query_mem_desc.interleavedBins(co.device_type_)) {
1255  auto warp_idx = emitCall("thread_warp_idx", {LL_INT(executor_->warpSize())});
1256  get_group_fn_args.push_back(warp_idx);
1257  get_group_fn_args.push_back(LL_INT(executor_->warpSize()));
1258  }
1259  }
1260  if (get_group_fn_name == "get_columnar_group_bin_offset") {
1261  return std::make_tuple(&*groups_buffer,
1262  emitCall(get_group_fn_name, get_group_fn_args));
1263  }
1264  return std::make_tuple(emitCall(get_group_fn_name, get_group_fn_args), nullptr);
1265 }
1266 
1267 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenMultiColumnPerfectHash(
1268  llvm::Value* groups_buffer,
1269  llvm::Value* group_key,
1270  llvm::Value* key_size_lv,
1271  const QueryMemoryDescriptor& query_mem_desc,
1272  const int32_t row_size_quad) {
1273  CHECK(query_mem_desc.getQueryDescriptionType() ==
1274  QueryDescriptionType::GroupByPerfectHash);
1275  // compute the index (perfect hash)
1276  auto perfect_hash_func = codegenPerfectHashFunction();
1277  auto hash_lv =
1278  LL_BUILDER.CreateCall(perfect_hash_func, std::vector<llvm::Value*>{group_key});
1279 
1280  if (query_mem_desc.didOutputColumnar()) {
1281  const std::string set_matching_func_name{
1282  "set_matching_group_value_perfect_hash_columnar"};
1283  const std::vector<llvm::Value*> set_matching_func_arg{
1284  groups_buffer,
1285  hash_lv,
1286  group_key,
1287  key_size_lv,
1288  llvm::ConstantInt::get(get_int_type(32, LL_CONTEXT),
1289  query_mem_desc.getEntryCount())};
1290  emitCall(set_matching_func_name, set_matching_func_arg);
1291  return std::make_tuple(groups_buffer, hash_lv);
1292  } else {
1293  return std::make_tuple(
1294  emitCall("get_matching_group_value_perfect_hash",
1295  {groups_buffer, hash_lv, group_key, key_size_lv, LL_INT(row_size_quad)}),
1296  nullptr);
1297  }
1298 }
1299 
1300 std::tuple<llvm::Value*, llvm::Value*>
1301 GroupByAndAggregate::codegenMultiColumnBaselineHash(
1302  const CompilationOptions& co,
1303  llvm::Value* groups_buffer,
1304  llvm::Value* group_key,
1305  llvm::Value* key_size_lv,
1306  const QueryMemoryDescriptor& query_mem_desc,
1307  const size_t key_width,
1308  const int32_t row_size_quad) {
1309  auto arg_it = ROW_FUNC->arg_begin(); // groups_buffer
1310  ++arg_it; // current match count
1311  ++arg_it; // total match count
1312  ++arg_it; // old match count
1313  ++arg_it; // output buffer slots count
1314  ++arg_it; // aggregate init values
1315  CHECK(arg_it->getName() == "agg_init_val");
1316  if (group_key->getType() != llvm::Type::getInt64PtrTy(LL_CONTEXT)) {
1317  CHECK(key_width == sizeof(int32_t));
1318  group_key =
1319  LL_BUILDER.CreatePointerCast(group_key, llvm::Type::getInt64PtrTy(LL_CONTEXT));
1320  }
1321  std::vector<llvm::Value*> func_args{
1322  groups_buffer,
1323  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1324  &*group_key,
1325  &*key_size_lv,
1326  LL_INT(static_cast<int32_t>(key_width))};
1327  std::string func_name{"get_group_value"};
1328  if (query_mem_desc.didOutputColumnar()) {
1329  func_name += "_columnar_slot";
1330  } else {
1331  func_args.push_back(LL_INT(row_size_quad));
1332  func_args.push_back(&*arg_it);
1333  }
1334  if (co.with_dynamic_watchdog_) {
1335  func_name += "_with_watchdog";
1336  }
1337  if (query_mem_desc.didOutputColumnar()) {
1338  return std::make_tuple(groups_buffer, emitCall(func_name, func_args));
1339  } else {
1340  return std::make_tuple(emitCall(func_name, func_args), nullptr);
1341  }
1342 }
1343 
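// Emits a helper function ("perfect_key_hash") that maps a multi-column group
// key to its bin: each component is shifted by its minimum, divided by its
// bucket size, and scaled by the product of the cardinalities of the preceding
// dimensions, i.e. hash = sum_i ((key_i - min_i) / bucket_i) * prod_{j<i} card_j.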
1344 llvm::Function* GroupByAndAggregate::codegenPerfectHashFunction() {
1345  CHECK_GT(ra_exe_unit_.groupby_exprs.size(), size_t(1));
1346  auto ft = llvm::FunctionType::get(
1347  get_int_type(32, LL_CONTEXT),
1348  std::vector<llvm::Type*>{llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)},
1349  false);
1350  auto key_hash_func = llvm::Function::Create(ft,
1351  llvm::Function::ExternalLinkage,
1352  "perfect_key_hash",
1353  executor_->cgen_state_->module_);
1354  executor_->cgen_state_->helper_functions_.push_back(key_hash_func);
1355  mark_function_always_inline(key_hash_func);
1356  auto& key_buff_arg = *key_hash_func->args().begin();
1357  llvm::Value* key_buff_lv = &key_buff_arg;
1358  auto bb = llvm::BasicBlock::Create(LL_CONTEXT, "entry", key_hash_func);
1359  llvm::IRBuilder<> key_hash_func_builder(bb);
1360  llvm::Value* hash_lv{llvm::ConstantInt::get(get_int_type(64, LL_CONTEXT), 0)};
1361  std::vector<int64_t> cardinalities;
1362  for (const auto groupby_expr : ra_exe_unit_.groupby_exprs) {
1363  auto col_range_info = getExprRangeInfo(groupby_expr.get());
1364  CHECK(col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash);
1365  cardinalities.push_back(getBucketedCardinality(col_range_info));
1366  }
1367  size_t dim_idx = 0;
1368  for (const auto groupby_expr : ra_exe_unit_.groupby_exprs) {
1369  auto key_comp_lv = key_hash_func_builder.CreateLoad(
1370  key_hash_func_builder.CreateGEP(key_buff_lv, LL_INT(dim_idx)));
1371  auto col_range_info = getExprRangeInfo(groupby_expr.get());
1372  auto crt_term_lv =
1373  key_hash_func_builder.CreateSub(key_comp_lv, LL_INT(col_range_info.min));
1374  if (col_range_info.bucket) {
1375  crt_term_lv =
1376  key_hash_func_builder.CreateSDiv(crt_term_lv, LL_INT(col_range_info.bucket));
1377  }
1378  for (size_t prev_dim_idx = 0; prev_dim_idx < dim_idx; ++prev_dim_idx) {
1379  crt_term_lv = key_hash_func_builder.CreateMul(crt_term_lv,
1380  LL_INT(cardinalities[prev_dim_idx]));
1381  }
1382  hash_lv = key_hash_func_builder.CreateAdd(hash_lv, crt_term_lv);
1383  ++dim_idx;
1384  }
1385  key_hash_func_builder.CreateRet(
1386  key_hash_func_builder.CreateTrunc(hash_lv, get_int_type(32, LL_CONTEXT)));
1387  return key_hash_func;
1388 }
1389 
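// When the argument's NULL sentinel differs from the aggregate slot's (e.g. a
// 32-bit integer argument aggregated into a 64-bit or floating-point slot),
// emit a compare-and-select that maps the argument NULL to the aggregate NULL
// and casts the value to the slot width; otherwise the value passes through.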
1390 llvm::Value* GroupByAndAggregate::convertNullIfAny(const SQLTypeInfo& arg_type,
1391  const TargetInfo& agg_info,
1392  llvm::Value* target) {
1393  const auto& agg_type = agg_info.sql_type;
1394  const size_t chosen_bytes = agg_type.get_size();
1395 
1396  bool need_conversion{false};
1397  llvm::Value* arg_null{nullptr};
1398  llvm::Value* agg_null{nullptr};
1399  llvm::Value* target_to_cast{target};
1400  if (arg_type.is_fp()) {
1401  arg_null = executor_->cgen_state_->inlineFpNull(arg_type);
1402  if (agg_type.is_fp()) {
1403  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1404  if (!static_cast<llvm::ConstantFP*>(arg_null)->isExactlyValue(
1405  static_cast<llvm::ConstantFP*>(agg_null)->getValueAPF())) {
1406  need_conversion = true;
1407  }
1408  } else {
1409  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
1410  return target;
1411  }
1412  } else {
1413  arg_null = executor_->cgen_state_->inlineIntNull(arg_type);
1414  if (agg_type.is_fp()) {
1415  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1416  need_conversion = true;
1417  target_to_cast = executor_->castToFP(target);
1418  } else {
1419  agg_null = executor_->cgen_state_->inlineIntNull(agg_type);
1420  if ((static_cast<llvm::ConstantInt*>(arg_null)->getBitWidth() !=
1421  static_cast<llvm::ConstantInt*>(agg_null)->getBitWidth()) ||
1422  (static_cast<llvm::ConstantInt*>(arg_null)->getValue() !=
1423  static_cast<llvm::ConstantInt*>(agg_null)->getValue())) {
1424  need_conversion = true;
1425  }
1426  }
1427  }
1428  if (need_conversion) {
1429  auto cmp = arg_type.is_fp() ? LL_BUILDER.CreateFCmpOEQ(target, arg_null)
1430  : LL_BUILDER.CreateICmpEQ(target, arg_null);
1431  return LL_BUILDER.CreateSelect(
1432  cmp,
1433  agg_null,
1434  executor_->cgen_state_->castToTypeIn(target_to_cast, chosen_bytes << 3));
1435  } else {
1436  return target;
1437  }
1438 }
1439 
1440 llvm::Value* GroupByAndAggregate::codegenWindowRowPointer(
1441  const Analyzer::WindowFunction* window_func,
1442  const QueryMemoryDescriptor& query_mem_desc,
1443  const CompilationOptions& co,
1444  DiamondCodegen& diamond_codegen) {
1445  const auto window_func_context =
1446  WindowProjectNodeContext::getActiveWindowFunctionContext();
1447  if (window_func_context && window_function_is_aggregate(window_func->getKind())) {
1448  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1449  ? 0
1450  : query_mem_desc.getRowSize() / sizeof(int64_t);
1451  auto arg_it = ROW_FUNC->arg_begin();
1452  auto groups_buffer = arg_it++;
1453  CodeGenerator code_generator(executor_);
1454  if (!window_func_context->getRowNumber()) {
1455  CHECK(window_func->getKind() == SqlWindowFunctionKind::COUNT);
1456  window_func_context->setRowNumber(emitCall(
1457  "row_number_window_func",
1458  {LL_INT(reinterpret_cast<const int64_t>(window_func_context->output())),
1459  code_generator.posArg(nullptr)}));
1460  }
1461  const auto pos_in_window = LL_BUILDER.CreateTrunc(window_func_context->getRowNumber(),
1462  get_int_type(32, LL_CONTEXT));
1463  llvm::Value* entry_count_lv =
1464  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount()));
1465  std::vector<llvm::Value*> args{
1466  &*groups_buffer, entry_count_lv, pos_in_window, code_generator.posArg(nullptr)};
1467  if (query_mem_desc.didOutputColumnar()) {
1468  const auto columnar_output_offset =
1469  emitCall("get_columnar_scan_output_offset", args);
1470  return LL_BUILDER.CreateSExt(columnar_output_offset, get_int_type(64, LL_CONTEXT));
1471  }
1472  args.push_back(LL_INT(row_size_quad));
1473  return emitCall("get_scan_output_slot", args);
1474  }
1475  auto arg_it = ROW_FUNC->arg_begin();
1476  auto groups_buffer = arg_it++;
1477  return codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen);
1478 }
1479 
1480 bool GroupByAndAggregate::codegenAggCalls(
1481  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx_in,
1482  const std::vector<llvm::Value*>& agg_out_vec,
1483  const QueryMemoryDescriptor& query_mem_desc,
1484  const CompilationOptions& co,
1485  DiamondCodegen& diamond_codegen) {
1486  auto agg_out_ptr_w_idx = agg_out_ptr_w_idx_in;
1487  // TODO(alex): unify the two cases, the output for non-group by queries
1488  // should be a contiguous buffer
1489  const bool is_group_by{std::get<0>(agg_out_ptr_w_idx)};
1490  bool can_return_error = false;
1491  if (is_group_by) {
1492  CHECK(agg_out_vec.empty());
1493  } else {
1494  CHECK(!agg_out_vec.empty());
1495  }
1496 
1497  // the output buffer is cast into a byte stream to be able to handle data elements of
1498  // different sizes (only used when actual column width sizes are used)
1499  llvm::Value* output_buffer_byte_stream{nullptr};
1500  llvm::Value* out_row_idx{nullptr};
1501  if (query_mem_desc.didOutputColumnar() && !g_cluster &&
1502  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1503  output_buffer_byte_stream = LL_BUILDER.CreateBitCast(
1504  std::get<0>(agg_out_ptr_w_idx),
1505  llvm::PointerType::get(llvm::Type::getInt8Ty(LL_CONTEXT), 0));
1506  output_buffer_byte_stream->setName("out_buff_b_stream");
1507  CHECK(std::get<1>(agg_out_ptr_w_idx));
1508  out_row_idx = LL_BUILDER.CreateZExt(std::get<1>(agg_out_ptr_w_idx),
1509  llvm::Type::getInt64Ty(LL_CONTEXT));
1510  out_row_idx->setName("out_row_idx");
1511  }
1512 
1513  TargetExprCodegenBuilder target_builder(query_mem_desc, ra_exe_unit_, is_group_by);
1514  for (size_t target_idx = 0; target_idx < ra_exe_unit_.target_exprs.size();
1515  ++target_idx) {
1516  auto target_expr = ra_exe_unit_.target_exprs[target_idx];
1517  CHECK(target_expr);
1518 
1519  target_builder(target_expr, executor_, co);
1520  }
1521 
1522  target_builder.codegen(this,
1523  executor_,
1524  query_mem_desc,
1525  co,
1526  agg_out_ptr_w_idx,
1527  agg_out_vec,
1528  output_buffer_byte_stream,
1529  out_row_idx,
1530  diamond_codegen);
1531 
1532  for (auto target_expr : ra_exe_unit_.target_exprs) {
1533  CHECK(target_expr);
1534  executor_->plan_state_->isLazyFetchColumn(target_expr);
1535  }
1536 
1537  return can_return_error;
1538 }
1539 
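// Computes the pointer to one aggregate's output slot: the columnar projection
// path indexes a raw byte stream (row index shifted by log2(chosen_bytes) plus
// the column's byte offset), while the row-wise path adds the column offset, in
// chosen_bytes units, to the row's base pointer.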
1543 llvm::Value* GroupByAndAggregate::codegenAggColumnPtr(
1544  llvm::Value* output_buffer_byte_stream,
1545  llvm::Value* out_row_idx,
1546  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx,
1547  const QueryMemoryDescriptor& query_mem_desc,
1548  const size_t chosen_bytes,
1549  const size_t agg_out_off,
1550  const size_t target_idx) {
1551  llvm::Value* agg_col_ptr{nullptr};
1552  if (query_mem_desc.didOutputColumnar()) {
1553  // TODO(Saman): remove the second columnar branch, and support all query description
1554  // types through the first branch. Then, input arguments should also be cleaned up
1555  if (!g_cluster &&
1556  query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1557  CHECK(chosen_bytes == 1 || chosen_bytes == 2 || chosen_bytes == 4 ||
1558  chosen_bytes == 8);
1559  CHECK(output_buffer_byte_stream);
1560  CHECK(out_row_idx);
1561  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1562  // multiplying by chosen_bytes, i.e., << log2(chosen_bytes)
1563  auto out_per_col_byte_idx =
1564  LL_BUILDER.CreateShl(out_row_idx, __builtin_ffs(chosen_bytes) - 1);
1565  auto byte_offset = LL_BUILDER.CreateAdd(out_per_col_byte_idx,
1566  LL_INT(static_cast<int64_t>(col_off)));
1567  byte_offset->setName("out_byte_off_target_" + std::to_string(target_idx));
1568  auto output_ptr = LL_BUILDER.CreateGEP(output_buffer_byte_stream, byte_offset);
1569  agg_col_ptr = LL_BUILDER.CreateBitCast(
1570  output_ptr,
1571  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1572  agg_col_ptr->setName("out_ptr_target_" + std::to_string(target_idx));
1573  } else {
1574  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1575  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1576  col_off /= chosen_bytes;
1577  CHECK(std::get<1>(agg_out_ptr_w_idx));
1578  auto offset = LL_BUILDER.CreateAdd(std::get<1>(agg_out_ptr_w_idx), LL_INT(col_off));
1579  agg_col_ptr = LL_BUILDER.CreateGEP(
1580  LL_BUILDER.CreateBitCast(
1581  std::get<0>(agg_out_ptr_w_idx),
1582  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0)),
1583  offset);
1584  }
1585  } else {
1586  uint32_t col_off = query_mem_desc.getColOnlyOffInBytes(agg_out_off);
1587  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1588  col_off /= chosen_bytes;
1589  agg_col_ptr = LL_BUILDER.CreateGEP(
1590  LL_BUILDER.CreateBitCast(
1591  std::get<0>(agg_out_ptr_w_idx),
1592  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0)),
1593  LL_INT(col_off));
1594  }
1595  CHECK(agg_col_ptr);
1596  return agg_col_ptr;
1597 }
1598 
1599 void GroupByAndAggregate::codegenEstimator(
1600  std::stack<llvm::BasicBlock*>& array_loops,
1601  GroupByAndAggregate::DiamondCodegen& diamond_codegen,
1602  const QueryMemoryDescriptor& query_mem_desc,
1603  const CompilationOptions& co) {
1604  const auto& estimator_arg = ra_exe_unit_.estimator->getArgument();
1605  auto estimator_comp_count_lv = LL_INT(static_cast<int32_t>(estimator_arg.size()));
1606  auto estimator_key_lv = LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT),
1607  estimator_comp_count_lv);
1608  int32_t subkey_idx = 0;
1609  for (const auto estimator_arg_comp : estimator_arg) {
1610  const auto estimator_arg_comp_lvs =
1611  executor_->groupByColumnCodegen(estimator_arg_comp.get(),
1612  query_mem_desc.getEffectiveKeyWidth(),
1613  co,
1614  false,
1615  0,
1616  diamond_codegen,
1617  array_loops,
1618  true);
1619  CHECK(!estimator_arg_comp_lvs.original_value);
1620  const auto estimator_arg_comp_lv = estimator_arg_comp_lvs.translated_value;
1621  // store the sub-key to the buffer
1622  LL_BUILDER.CreateStore(estimator_arg_comp_lv,
1623  LL_BUILDER.CreateGEP(estimator_key_lv, LL_INT(subkey_idx++)));
1624  }
1625  const auto int8_ptr_ty = llvm::PointerType::get(get_int_type(8, LL_CONTEXT), 0);
1626  const auto bitmap = LL_BUILDER.CreateBitCast(&*ROW_FUNC->arg_begin(), int8_ptr_ty);
1627  const auto key_bytes = LL_BUILDER.CreateBitCast(estimator_key_lv, int8_ptr_ty);
1628  const auto estimator_comp_bytes_lv =
1629  LL_INT(static_cast<int32_t>(estimator_arg.size() * sizeof(int64_t)));
1630  const auto bitmap_size_lv =
1631  LL_INT(static_cast<uint32_t>(ra_exe_unit_.estimator->getBufferSize()));
1632  emitCall(ra_exe_unit_.estimator->getRuntimeFunctionName(),
1633  {bitmap, &*bitmap_size_lv, key_bytes, &*estimator_comp_bytes_lv});
1634 }
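// A rough sketch (not from the original listing) of what the IR emitted by
// codegenEstimator() does at runtime, with estimator_fn standing in for
// ra_exe_unit_.estimator->getRuntimeFunctionName():
//
//   int64_t key[n];                              // n = estimator_arg.size()
//   for (size_t i = 0; i < n; ++i)
//     key[i] = <translated group-by sub-key i>;  // groupByColumnCodegen result
//   estimator_fn(bitmap,                         // first row-function argument
//                bitmap_size,                    // estimator->getBufferSize()
//                reinterpret_cast<int8_t*>(key),
//                n * sizeof(int64_t));           // key size in bytes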
1635 
1636 extern "C" void agg_count_distinct(int64_t* agg, const int64_t val) {
1637  reinterpret_cast<std::set<int64_t>*>(*agg)->insert(val);
1638 }
1639 
1640 extern "C" void agg_count_distinct_skip_val(int64_t* agg,
1641  const int64_t val,
1642  const int64_t skip_val) {
1643  if (val != skip_val) {
1644  agg_count_distinct(agg, val);
1645  }
1646 }
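// A minimal usage sketch (not from the original listing): in this fallback path
// the 64-bit aggregate slot holds a pointer to a std::set<int64_t>, so the two
// helpers above behave as follows:
//
//   std::set<int64_t> distinct_vals;
//   int64_t agg_slot = reinterpret_cast<int64_t>(&distinct_vals);
//   agg_count_distinct(&agg_slot, 7);                // distinct_vals == {7}
//   agg_count_distinct(&agg_slot, 7);                // still {7}, sets deduplicate
//   agg_count_distinct_skip_val(&agg_slot, -1, -1);  // val == skip_val, ignored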
1647 
1648 void GroupByAndAggregate::codegenCountDistinct(
1649  const size_t target_idx,
1650  const Analyzer::Expr* target_expr,
1651  std::vector<llvm::Value*>& agg_args,
1652  const QueryMemoryDescriptor& query_mem_desc,
1653  const ExecutorDeviceType device_type) {
1654  const auto agg_info = get_target_info(target_expr, g_bigint_count);
1655  const auto& arg_ti =
1656  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1657  if (arg_ti.is_fp()) {
1658  agg_args.back() = executor_->cgen_state_->ir_builder_.CreateBitCast(
1659  agg_args.back(), get_int_type(64, executor_->cgen_state_->context_));
1660  }
1661  const auto& count_distinct_descriptor =
1662  query_mem_desc.getCountDistinctDescriptor(target_idx);
1663  CHECK(count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid);
1664  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
1665  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1666  agg_args.push_back(LL_INT(int32_t(count_distinct_descriptor.bitmap_sz_bits)));
1667  if (device_type == ExecutorDeviceType::GPU) {
1668  const auto base_dev_addr = getAdditionalLiteral(-1);
1669  const auto base_host_addr = getAdditionalLiteral(-2);
1670  agg_args.push_back(base_dev_addr);
1671  agg_args.push_back(base_host_addr);
1672  emitCall("agg_approximate_count_distinct_gpu", agg_args);
1673  } else {
1674  emitCall("agg_approximate_count_distinct", agg_args);
1675  }
1676  return;
1677  }
1678  std::string agg_fname{"agg_count_distinct"};
1679  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1680  agg_fname += "_bitmap";
1681  agg_args.push_back(LL_INT(static_cast<int64_t>(count_distinct_descriptor.min_val)));
1682  }
1683  if (agg_info.skip_null_val) {
1684  auto null_lv = executor_->cgen_state_->castToTypeIn(
1685  (arg_ti.is_fp()
1686  ? static_cast<llvm::Value*>(executor_->cgen_state_->inlineFpNull(arg_ti))
1687  : static_cast<llvm::Value*>(executor_->cgen_state_->inlineIntNull(arg_ti))),
1688  64);
1689  null_lv = executor_->cgen_state_->ir_builder_.CreateBitCast(
1690  null_lv, get_int_type(64, executor_->cgen_state_->context_));
1691  agg_fname += "_skip_val";
1692  agg_args.push_back(null_lv);
1693  }
1694  if (device_type == ExecutorDeviceType::GPU) {
1695  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1696  agg_fname += "_gpu";
1697  const auto base_dev_addr = getAdditionalLiteral(-1);
1698  const auto base_host_addr = getAdditionalLiteral(-2);
1699  agg_args.push_back(base_dev_addr);
1700  agg_args.push_back(base_host_addr);
1701  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.sub_bitmap_count)));
1702  CHECK_EQ(size_t(0),
1703  count_distinct_descriptor.bitmapPaddedSizeBytes() %
1704  count_distinct_descriptor.sub_bitmap_count);
1705  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.bitmapPaddedSizeBytes() /
1706  count_distinct_descriptor.sub_bitmap_count)));
1707  }
1708  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1709  emitCall(agg_fname, agg_args);
1710  } else {
1711  executor_->cgen_state_->emitExternalCall(
1712  agg_fname, llvm::Type::getVoidTy(LL_CONTEXT), agg_args);
1713  }
1714 }
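// A minimal sketch (not from the original listing) of how the exact
// COUNT(DISTINCT) branch above assembles the runtime function name; the helper
// name is hypothetical, the suffix order matches the code:
#include <string>

inline std::string count_distinct_runtime_fname(const bool bitmap_impl,
                                                const bool skip_null_val,
                                                const bool is_gpu) {
  std::string fname{"agg_count_distinct"};
  if (bitmap_impl) {
    fname += "_bitmap";  // CountDistinctImplType::Bitmap, min_val argument added
  }
  if (skip_null_val) {
    fname += "_skip_val";  // a 64-bit null sentinel argument is appended
  }
  if (is_gpu) {
    fname += "_gpu";  // GPU adds base addresses and sub-bitmap size arguments
  }
  return fname;  // e.g. "agg_count_distinct_bitmap_skip_val_gpu"
}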
1715 
1716 llvm::Value* GroupByAndAggregate::getAdditionalLiteral(const int32_t off) {
1717  CHECK_LT(off, 0);
1718  const auto lit_buff_lv = get_arg_by_name(ROW_FUNC, "literals");
1719  return LL_BUILDER.CreateLoad(LL_BUILDER.CreateGEP(
1720  LL_BUILDER.CreateBitCast(lit_buff_lv,
1721  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)),
1722  LL_INT(off)));
1723 }
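// Not from the original listing: getAdditionalLiteral(off) loads the 64-bit
// value at a negative offset from the row function's "literals" argument; the
// GPU count-distinct path above uses off == -1 for the device base address and
// off == -2 for the host base address of the bitmap memory.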
1724 
1725 std::vector<llvm::Value*> GroupByAndAggregate::codegenAggArg(
1726  const Analyzer::Expr* target_expr,
1727  const CompilationOptions& co) {
1728  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
1729  // TODO(alex): handle arrays uniformly?
1730  CodeGenerator code_generator(executor_);
1731  if (target_expr) {
1732  const auto& target_ti = target_expr->get_type_info();
1733  if (target_ti.is_array() && !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
1734  const auto target_lvs =
1735  agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
1736  : code_generator.codegen(
1737  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
1738  if (target_ti.isChunkIteratorPackaging()) {
1739  // Chunk-iterator packaging means these values were codegen'd from a
1740  // physical array column (read through a ChunkIter), not from an ARRAY[] expression.
1741  CHECK_EQ(size_t(1), target_lvs.size());
1742  CHECK(!agg_expr || agg_expr->get_aggtype() == kSAMPLE);
1743  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
1744  const auto i8p_ty =
1745  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
1746  const auto& elem_ti = target_ti.get_elem_type();
1747  return {
1748  executor_->cgen_state_->emitExternalCall(
1749  "array_buff",
1750  i8p_ty,
1751  {target_lvs.front(), code_generator.posArg(target_expr)}),
1752  executor_->cgen_state_->emitExternalCall(
1753  "array_size",
1754  i32_ty,
1755  {target_lvs.front(),
1756  code_generator.posArg(target_expr),
1757  executor_->cgen_state_->llInt(log2_bytes(elem_ti.get_logical_size()))})};
1758  } else if (target_ti.isStandardBufferPackaging()) {
1759  if (agg_expr) {
1760  throw std::runtime_error(
1761  "Using array[] operator as argument to an aggregate operator is not "
1762  "supported");
1763  }
1764  return {target_lvs[0], target_lvs[1]};
1765  }
1766  }
1767  if (target_ti.is_geometry() &&
1768  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
1769  auto generate_coord_lvs =
1770  [&](auto* selected_target_expr,
1771  bool const fetch_columns) -> std::vector<llvm::Value*> {
1772  const auto target_lvs =
1773  code_generator.codegen(selected_target_expr, fetch_columns, co);
1774  CHECK_EQ(static_cast<size_t>(target_ti.get_physical_coord_cols()),
1775  target_lvs.size());
1776 
1777  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
1778  const auto i8p_ty =
1779  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
1780  std::vector<llvm::Value*> coords;
1781  size_t ctr = 0;
1782  for (const auto& target_lv : target_lvs) {
1783  // TODO(adb): consider adding a utility to sqltypes so we can get the types of
1784  // the physical coords cols based on the sqltype (e.g. TINYINT for col 0, INT
1785  // for col 1 for pols / mpolys, etc). Hardcoding for now: the first array is the
1786  // coords array (TINYINT); subsequent arrays are regular INT.
1787 
1788  const size_t elem_sz = ctr == 0 ? 1 : 4;
1789  ctr++;
1790  int32_t fixlen = -1;
1791  if (target_ti.get_type() == kPOINT) {
1792  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
1793  if (col_var) {
1794  const auto coords_cd = executor_->getPhysicalColumnDescriptor(col_var, 1);
1795  if (coords_cd && coords_cd->columnType.get_type() == kARRAY) {
1796  fixlen = coords_cd->columnType.get_size();
1797  }
1798  }
1799  }
1800  if (fixlen > 0) {
1801  coords.push_back(executor_->cgen_state_->emitExternalCall(
1802  "fast_fixlen_array_buff",
1803  i8p_ty,
1804  {target_lv, code_generator.posArg(selected_target_expr)}));
1805  coords.push_back(executor_->cgen_state_->llInt(int64_t(fixlen)));
1806  continue;
1807  }
1808  coords.push_back(executor_->cgen_state_->emitExternalCall(
1809  "array_buff",
1810  i8p_ty,
1811  {target_lv, code_generator.posArg(selected_target_expr)}));
1812  coords.push_back(executor_->cgen_state_->emitExternalCall(
1813  "array_size",
1814  i32_ty,
1815  {target_lv,
1816  code_generator.posArg(selected_target_expr),
1817  executor_->cgen_state_->llInt(log2_bytes(elem_sz))}));
1818  }
1819  return coords;
1820  };
1821 
1822  if (agg_expr) {
1823  return generate_coord_lvs(agg_expr->get_arg(), true);
1824  } else {
1825  return generate_coord_lvs(target_expr,
1826  !executor_->plan_state_->allow_lazy_fetch_);
1827  }
1828  }
1829  }
1830  return agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
1831  : code_generator.codegen(
1832  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
1833 }
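// A rough summary (not from the original listing) of the value shapes
// codegenAggArg() returns for the cases handled above:
//
//   scalar target           -> { value }
//   physical array column   -> { array_buff(ptr, pos), array_size(ptr, pos, log2(elem_sz)) }
//   ARRAY[] expression      -> { buffer pointer, size }, taken from the codegen'd pair
//   geometry column         -> one { buffer, size-or-fixlen } pair per physical coord column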
1834 
1835 llvm::Value* GroupByAndAggregate::emitCall(const std::string& fname,
1836  const std::vector<llvm::Value*>& args) {
1837  return executor_->cgen_state_->emitCall(fname, args);
1838 }
1839 
1840 #undef ROW_FUNC
1841 #undef LL_FP
1842 #undef LL_INT
1843 #undef LL_BOOL
1844 #undef LL_BUILDER
1845 #undef LL_CONTEXT
1846 
1847 size_t GroupByAndAggregate::shard_count_for_top_groups(
1848  const RelAlgExecutionUnit& ra_exe_unit,
1849  const Catalog_Namespace::Catalog& catalog) {
1850  if (ra_exe_unit.sort_info.order_entries.size() != 1 || !ra_exe_unit.sort_info.limit) {
1851  return 0;
1852  }
1853  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
1854  const auto grouped_col_expr =
1855  dynamic_cast<const Analyzer::ColumnVar*>(group_expr.get());
1856  if (!grouped_col_expr) {
1857  continue;
1858  }
1859  if (grouped_col_expr->get_table_id() <= 0) {
1860  return 0;
1861  }
1862  const auto td = catalog.getMetadataForTable(grouped_col_expr->get_table_id());
1863  if (td->shardedColumnId == grouped_col_expr->get_column_id()) {
1864  return td->nShards;
1865  }
1866  }
1867  return 0;
1868 }
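// A sketch (not from the original listing) of the plan shape this helper
// recognizes, on a hypothetical table `events` sharded on `user_id`:
//
//   SELECT user_id, COUNT(*) AS n
//   FROM events
//   GROUP BY user_id
//   ORDER BY n DESC
//   LIMIT 10;
//
// With exactly one ORDER BY entry, a LIMIT, and a group-by column matching the
// table's shard column, each shard's top groups can be computed independently,
// so the function reports td->nShards instead of 0.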