OmniSciDB  29e35f4d58
GroupByAndAggregate.cpp
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "GroupByAndAggregate.h"
18 #include "AggregateUtils.h"
20 
21 #include "CardinalityEstimator.h"
22 #include "CodeGenerator.h"
24 #include "ExpressionRange.h"
25 #include "ExpressionRewrite.h"
26 #include "GpuInitGroups.h"
27 #include "InPlaceSort.h"
29 #include "MaxwellCodegenPatch.h"
31 #include "TargetExprBuilder.h"
32 
33 #include "../CudaMgr/CudaMgr.h"
34 #include "../Shared/checked_alloc.h"
35 #include "../Utils/ChunkIter.h"
37 #include "Execute.h"
38 #include "QueryTemplateGenerator.h"
39 #include "RuntimeFunctions.h"
40 #include "StreamingTopN.h"
41 #include "TopKSort.h"
42 #include "WindowContext.h"
43 
44 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
45 
46 #include <numeric>
47 #include <thread>
48 
49 bool g_cluster{false};
50 bool g_bigint_count{false};
52 extern size_t g_leaf_count;
53 
54 namespace {
55 
56 int32_t get_agg_count(const std::vector<Analyzer::Expr*>& target_exprs) {
57  int32_t agg_count{0};
58  for (auto target_expr : target_exprs) {
59  CHECK(target_expr);
60  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
61  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
62  const auto& ti = target_expr->get_type_info();
63  // TODO(pavan): or if is_geometry()
64  if (ti.is_array() || (ti.is_string() && ti.get_compression() == kENCODING_NONE)) {
65  agg_count += 2;
66  } else if (ti.is_geometry()) {
67  agg_count += ti.get_physical_coord_cols() * 2;
68  } else {
69  ++agg_count;
70  }
71  continue;
72  }
73  if (agg_expr && agg_expr->get_aggtype() == kAVG) {
74  agg_count += 2;
75  } else {
76  ++agg_count;
77  }
78  }
79  return agg_count;
80 }
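
// Illustrative sketch (not part of the original file): a toy model of the slot
// arithmetic get_agg_count() performs above. The enum and helper are invented for the
// example; only the per-target slot counts mirror the real function.
enum class TargetKindExample { SimpleAgg, Avg, VarlenSample, GeoPointSample };

constexpr int slots_for_example(const TargetKindExample k) {
  return k == TargetKindExample::Avg            ? 2  // running sum + running count
         : k == TargetKindExample::VarlenSample ? 2  // pointer + length
         : k == TargetKindExample::GeoPointSample
             ? 1 * 2  // two slots per physical coordinate column (POINT has one)
             : 1;
}

// COUNT(*), AVG(x), SAMPLE(none_encoded_str) -> 1 + 2 + 2 = 5 output slots.
static_assert(slots_for_example(TargetKindExample::SimpleAgg) +
                      slots_for_example(TargetKindExample::Avg) +
                      slots_for_example(TargetKindExample::VarlenSample) ==
                  5,
              "");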
81 
82 bool expr_is_rowid(const Analyzer::Expr* expr, const Catalog_Namespace::Catalog& cat) {
83  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
84  if (!col) {
85  return false;
86  }
87  const auto cd =
88  get_column_descriptor_maybe(col->get_column_id(), col->get_table_id(), cat);
89  if (!cd || !cd->isVirtualCol) {
90  return false;
91  }
92  CHECK_EQ("rowid", cd->columnName);
93  return true;
94 }
95 
96 bool has_count_distinct(const RelAlgExecutionUnit& ra_exe_unit) {
97  for (const auto& target_expr : ra_exe_unit.target_exprs) {
98  const auto agg_info = get_target_info(target_expr, g_bigint_count);
99  if (agg_info.is_agg && is_distinct_target(agg_info)) {
100  return true;
101  }
102  }
103  return false;
104 }
105 
106 bool is_column_range_too_big_for_perfect_hash(const ColRangeInfo& col_range_info,
107                                               const int64_t max_entry_count) {
108  try {
109  return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
110  checked_int64_t(col_range_info.min)) >= max_entry_count;
111  } catch (...) {
112  return true;
113  }
114 }
115 
116 } // namespace
117 
118 ColRangeInfo GroupByAndAggregate::getColRangeInfo() {
119  // Use baseline layout more eagerly on the GPU if the query uses count distinct,
120  // because our HyperLogLog implementation is 4x less memory efficient on GPU.
121  // Technically, this only applies to APPROX_COUNT_DISTINCT, but in practice we
122  // can expect this to be true anyway for grouped queries since the precise version
123  // uses significantly more memory.
124  const int64_t baseline_threshold =
125      has_count_distinct(ra_exe_unit_)
126          ? (device_type_ == ExecutorDeviceType::GPU ? (Executor::baseline_threshold / 4)
127                                                     : Executor::baseline_threshold)
128          : Executor::baseline_threshold;
129  if (ra_exe_unit_.groupby_exprs.size() != 1) {
130  try {
131  checked_int64_t cardinality{1};
132  bool has_nulls{false};
133  for (const auto groupby_expr : ra_exe_unit_.groupby_exprs) {
134  auto col_range_info = getExprRangeInfo(groupby_expr.get());
135  if (col_range_info.hash_type_ != QueryDescriptionType::GroupByPerfectHash) {
136  // going through baseline hash if a non-integer type is encountered
137  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
138  }
139  auto crt_col_cardinality = getBucketedCardinality(col_range_info);
140  CHECK_GE(crt_col_cardinality, 0);
141  cardinality *= crt_col_cardinality;
142  if (col_range_info.has_nulls) {
143  has_nulls = true;
144  }
145  }
146  // For zero or high cardinalities, use baseline layout.
147  if (!cardinality || cardinality > baseline_threshold) {
148  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
149  }
150    return {QueryDescriptionType::GroupByPerfectHash,
151            0,
152            int64_t(cardinality),
153            0,
154            has_nulls};
155  } catch (...) { // overflow when computing cardinality
156  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
157  }
158  }
159  // For a single-column group by on high precision timestamps, force baseline hash due to
160  // the wide ranges we are likely to encounter when applying quals to the expression range.
161  // TODO: consider allowing TIMESTAMP(9) (nanoseconds) with quals to use perfect hash if
162  // the range is small enough
163  if (ra_exe_unit_.groupby_exprs.front() &&
164  ra_exe_unit_.groupby_exprs.front()->get_type_info().is_high_precision_timestamp() &&
165  ra_exe_unit_.simple_quals.size() > 0) {
166  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
167  }
168  const auto col_range_info = getExprRangeInfo(ra_exe_unit_.groupby_exprs.front().get());
169  if (!ra_exe_unit_.groupby_exprs.front()) {
170  return col_range_info;
171  }
172  static const int64_t MAX_BUFFER_SIZE = 1 << 30;
173  const int64_t col_count =
174  ra_exe_unit_.groupby_exprs.size() + ra_exe_unit_.target_exprs.size();
175  int64_t max_entry_count = MAX_BUFFER_SIZE / (col_count * sizeof(int64_t));
176  if (has_count_distinct(ra_exe_unit_)) {
177  max_entry_count = std::min(max_entry_count, baseline_threshold);
178  }
179  if ((!ra_exe_unit_.groupby_exprs.front()->get_type_info().is_string() &&
180  !expr_is_rowid(ra_exe_unit_.groupby_exprs.front().get(), *executor_->catalog_)) &&
181  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count) &&
182      !col_range_info.bucket) {
183    return {QueryDescriptionType::GroupByBaselineHash,
184            col_range_info.min,
185            col_range_info.max,
186            0,
187            col_range_info.has_nulls};
188  }
189  return col_range_info;
190 }
191 
192 ColRangeInfo GroupByAndAggregate::getExprRangeInfo(const Analyzer::Expr* expr) const {
193  if (!expr) {
194  return {QueryDescriptionType::Projection, 0, 0, 0, false};
195  }
196 
197  const auto expr_range = getExpressionRange(
198  expr, query_infos_, executor_, boost::make_optional(ra_exe_unit_.simple_quals));
199  switch (expr_range.getType()) {
200    case ExpressionRangeType::Integer:
201      return {QueryDescriptionType::GroupByPerfectHash,
202              expr_range.getIntMin(),
203              expr_range.getIntMax(),
204              expr_range.getBucket(),
205              expr_range.hasNulls()};
206    case ExpressionRangeType::Float:
207    case ExpressionRangeType::Double:
208    case ExpressionRangeType::Invalid:
209      return ColRangeInfo{QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
210    default:
211  CHECK(false);
212  }
213  CHECK(false);
214  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
215 }
216 
217 int64_t GroupByAndAggregate::getBucketedCardinality(const ColRangeInfo& col_range_info) {
218  checked_int64_t crt_col_cardinality =
219  checked_int64_t(col_range_info.max) - checked_int64_t(col_range_info.min);
220  if (col_range_info.bucket) {
221  crt_col_cardinality /= col_range_info.bucket;
222  }
223  return static_cast<int64_t>(crt_col_cardinality +
224  (1 + (col_range_info.has_nulls ? 1 : 0)));
225 }
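
// Illustrative sketch (not part of the original file): the bucketed-cardinality
// arithmetic used by getBucketedCardinality() above, on plain integers. One extra
// entry covers the upper bound and one more is reserved when the column has nulls.
#include <cstdint>

constexpr int64_t bucketed_cardinality_example(int64_t min_val,
                                               int64_t max_val,
                                               int64_t bucket,
                                               bool has_nulls) {
  return (bucket ? (max_val - min_val) / bucket : max_val - min_val) + 1 +
         (has_nulls ? 1 : 0);
}

// A column range [10, 50] with bucket 10 and nulls needs (50 - 10) / 10 + 1 + 1 = 6
// perfect-hash entries.
static_assert(bucketed_cardinality_example(10, 50, 10, true) == 6, "");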
226 
227 #define LL_CONTEXT executor_->cgen_state_->context_
228 #define LL_BUILDER executor_->cgen_state_->ir_builder_
229 #define LL_BOOL(v) executor_->cgen_state_->llBool(v)
230 #define LL_INT(v) executor_->cgen_state_->llInt(v)
231 #define LL_FP(v) executor_->cgen_state_->llFp(v)
232 #define ROW_FUNC executor_->cgen_state_->row_func_
233 
234 GroupByAndAggregate::GroupByAndAggregate(
235     Executor* executor,
236  const ExecutorDeviceType device_type,
237  const RelAlgExecutionUnit& ra_exe_unit,
238  const std::vector<InputTableInfo>& query_infos,
239  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner)
240  : executor_(executor)
241  , ra_exe_unit_(ra_exe_unit)
242  , query_infos_(query_infos)
243  , row_set_mem_owner_(row_set_mem_owner)
244  , device_type_(device_type) {
245  for (const auto groupby_expr : ra_exe_unit_.groupby_exprs) {
246  if (!groupby_expr) {
247  continue;
248  }
249  const auto& groupby_ti = groupby_expr->get_type_info();
250  if (groupby_ti.is_string() && groupby_ti.get_compression() != kENCODING_DICT) {
251  throw std::runtime_error(
252  "Cannot group by string columns which are not dictionary encoded.");
253  }
254  if (groupby_ti.is_array()) {
255  throw std::runtime_error("Group by array not supported");
256  }
257  if (groupby_ti.is_geometry()) {
258  throw std::runtime_error("Group by geometry not supported");
259  }
260  }
261 }
262 
263 int64_t GroupByAndAggregate::getShardedTopBucket(const ColRangeInfo& col_range_info,
264                                                  const size_t shard_count) const {
265  size_t device_count{0};
266  if (device_type_ == ExecutorDeviceType::GPU) {
267    device_count = executor_->getCatalog()->getDataMgr().getCudaMgr()->getDeviceCount();
268  CHECK_GT(device_count, 0u);
269  }
270 
271  int64_t bucket{col_range_info.bucket};
272 
273  if (shard_count) {
274  CHECK(!col_range_info.bucket);
275  /*
276  when a node has fewer devices than shard count,
277  a) In a distributed setup, the minimum distance between two keys would be
278  device_count, because shards are stored consecutively across the physical tables, i.e.
279  if a shard column has values 0 to 9, and 3 shards on each leaf, then node 1 would
280  have values 0,1,2,6,7,8 and node 2 would have values 3,4,5,9. If each leaf node
281  has only 1 device, then in this case all the keys from a node are loaded onto that
282  node's single device.
283 
284  b) In a single node setup, the distance would be the minimum of device_count and
285  (shard_count - device_count). For example: if a single node server running on 3
286  devices has a shard column with values 0 to 9 in a table with 4 shards, the device
287  to fragment keys mapping would be: device 1 - 4,8,3,7; device 2 - 1,5,9; device
288  3 - 2,6. The bucket value would be 4 (shards) - 3 (devices) = 1, i.e. the minimum of
289  device_count and the difference.
290 
291  When a node has device count equal to or more than shard count then the
292  minimum distance is always at least shard_count * no of leaf nodes.
293  */
294  if (device_count < shard_count) {
295  bucket = g_leaf_count ? std::max(device_count, static_cast<size_t>(1))
296  : std::min(device_count, shard_count - device_count);
297  } else {
298  bucket = shard_count * std::max(g_leaf_count, static_cast<size_t>(1));
299  }
300  }
301 
302  return bucket;
303 }
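
// Illustrative sketch (not part of the original file): the bucket choice made by
// getShardedTopBucket() above, reduced to plain integers. device_count, shard_count and
// leaf_count stand in for the member/global state the real code consults.
#include <algorithm>
#include <cstddef>

constexpr size_t sharded_top_bucket_example(size_t device_count,
                                            size_t shard_count,
                                            size_t leaf_count) {
  return device_count < shard_count
             // Fewer devices than shards: distributed runs use device_count (at least
             // 1); single-node runs use min(device_count, shard_count - device_count).
             ? (leaf_count ? std::max(device_count, static_cast<size_t>(1))
                           : std::min(device_count, shard_count - device_count))
             // Enough devices: keys are at least shard_count * (number of leaves) apart.
             : shard_count * std::max(leaf_count, static_cast<size_t>(1));
}

// The single-node example from the comment above: 4 shards on 3 devices -> bucket 1.
static_assert(sharded_top_bucket_example(3, 4, 0) == 1, "");
// A distributed setup with 2 leaves and enough devices: 4 shards -> bucket 4 * 2 = 8.
static_assert(sharded_top_bucket_example(4, 4, 2) == 8, "");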
304 
305 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptor(
306  const bool allow_multifrag,
307  const size_t max_groups_buffer_entry_count,
308  const int8_t crt_min_byte_width,
309  RenderInfo* render_info,
310  const bool output_columnar_hint) {
311  const auto shard_count =
312      device_type_ == ExecutorDeviceType::GPU
313          ? shard_count_for_top_groups(ra_exe_unit_, *executor_->getCatalog())
314          : 0;
315  bool sort_on_gpu_hint =
316      device_type_ == ExecutorDeviceType::GPU && allow_multifrag &&
317      !ra_exe_unit_.sort_info.order_entries.empty() &&
318      gpuCanHandleOrderEntries(ra_exe_unit_.sort_info.order_entries);
319  // must_use_baseline_sort is true iff we'd sort on GPU with the old algorithm
320  // but the total output buffer size would be too big or it's a sharded top query.
321  // For the sake of managing risk, use the new result set way very selectively for
322  // this case only (alongside the baseline layout we've enabled for a while now).
323  bool must_use_baseline_sort = shard_count;
324  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
325  while (true) {
326  query_mem_desc = initQueryMemoryDescriptorImpl(allow_multifrag,
327  max_groups_buffer_entry_count,
328  crt_min_byte_width,
329  sort_on_gpu_hint,
330  render_info,
331  must_use_baseline_sort,
332  output_columnar_hint);
333  CHECK(query_mem_desc);
334  if (query_mem_desc->sortOnGpu() &&
335  (query_mem_desc->getBufferSizeBytes(device_type_) +
336  align_to_int64(query_mem_desc->getEntryCount() * sizeof(int32_t))) >
337  2 * 1024 * 1024 * 1024L) {
338  must_use_baseline_sort = true;
339  sort_on_gpu_hint = false;
340  } else {
341  break;
342  }
343  }
344  return query_mem_desc;
345 }
346 
347 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptorImpl(
348  const bool allow_multifrag,
349  const size_t max_groups_buffer_entry_count,
350  const int8_t crt_min_byte_width,
351  const bool sort_on_gpu_hint,
352  RenderInfo* render_info,
353  const bool must_use_baseline_sort,
354  const bool output_columnar_hint) {
355  addTransientStringLiterals();
356 
357  const auto count_distinct_descriptors = initCountDistinctDescriptors();
358 
359  auto group_col_widths = get_col_byte_widths(ra_exe_unit_.groupby_exprs, {});
360 
361  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
362 
363  auto col_range_info_nosharding = getColRangeInfo();
364 
365  const auto shard_count =
366      device_type_ == ExecutorDeviceType::GPU
367          ? shard_count_for_top_groups(ra_exe_unit_, *executor_->getCatalog())
368          : 0;
369 
370  const auto col_range_info =
371  ColRangeInfo{col_range_info_nosharding.hash_type_,
372  col_range_info_nosharding.min,
373  col_range_info_nosharding.max,
374  getShardedTopBucket(col_range_info_nosharding, shard_count),
375  col_range_info_nosharding.has_nulls};
376 
377  // Non-grouped aggregates do not support accessing aggregated ranges
378  // Keyless hash is currently only supported with single-column perfect hash
379  const auto keyless_info =
380  !(is_group_by &&
381  col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
382  ra_exe_unit_.groupby_exprs.size() == 1)
383  ? KeylessInfo{false, -1, false}
384  : getKeylessInfo(ra_exe_unit_.target_exprs, is_group_by);
385 
386  if (g_enable_watchdog &&
387  ((col_range_info.hash_type_ == QueryDescriptionType::GroupByBaselineHash &&
388  max_groups_buffer_entry_count > 120000000) ||
389  (col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
390  ra_exe_unit_.groupby_exprs.size() == 1 &&
391  (col_range_info.max - col_range_info.min) /
392  std::max(col_range_info.bucket, int64_t(1)) >
393  130000000))) {
394  throw WatchdogException("Query would use too much memory");
395  }
396  return QueryMemoryDescriptor::init(executor_,
397                                     ra_exe_unit_,
398  query_infos_,
399  col_range_info,
400  keyless_info,
401  allow_multifrag,
402  device_type_,
403  crt_min_byte_width,
404  sort_on_gpu_hint,
405  shard_count,
406  max_groups_buffer_entry_count,
407  render_info,
408  count_distinct_descriptors,
409  must_use_baseline_sort,
410  output_columnar_hint);
411 }
412 
413 void GroupByAndAggregate::addTransientStringLiterals() {
414   addTransientStringLiterals(ra_exe_unit_, executor_, row_set_mem_owner_);
415 }
416 
417 namespace {
418 
419 void add_transient_string_literals_for_expression(
420     const Analyzer::Expr* expr,
421  Executor* executor,
422  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner) {
423  if (!expr) {
424  return;
425  }
426 
427  const auto array_expr = dynamic_cast<const Analyzer::ArrayExpr*>(expr);
428  if (array_expr) {
429  for (size_t i = 0; i < array_expr->getElementCount(); i++) {
430    add_transient_string_literals_for_expression(
431        array_expr->getElement(i), executor, row_set_mem_owner);
432  }
433  return;
434  }
435 
436  const auto cast_expr = dynamic_cast<const Analyzer::UOper*>(expr);
437  const auto& expr_ti = expr->get_type_info();
438  if (cast_expr && cast_expr->get_optype() == kCAST && expr_ti.is_string()) {
439  CHECK_EQ(kENCODING_DICT, expr_ti.get_compression());
440  auto sdp = executor->getStringDictionaryProxy(
441  expr_ti.get_comp_param(), row_set_mem_owner, true);
442  CHECK(sdp);
443  const auto str_lit_expr =
444  dynamic_cast<const Analyzer::Constant*>(cast_expr->get_operand());
445  if (str_lit_expr && str_lit_expr->get_constval().stringval) {
446  sdp->getOrAddTransient(*str_lit_expr->get_constval().stringval);
447  }
448  return;
449  }
450  const auto case_expr = dynamic_cast<const Analyzer::CaseExpr*>(expr);
451  if (!case_expr) {
452  return;
453  }
454  Analyzer::DomainSet domain_set;
455  case_expr->get_domain(domain_set);
456  if (domain_set.empty()) {
457  return;
458  }
459  if (expr_ti.is_string()) {
460  CHECK_EQ(kENCODING_DICT, expr_ti.get_compression());
461  auto sdp = executor->getStringDictionaryProxy(
462  expr_ti.get_comp_param(), row_set_mem_owner, true);
463  CHECK(sdp);
464  for (const auto domain_expr : domain_set) {
465  const auto cast_expr = dynamic_cast<const Analyzer::UOper*>(domain_expr);
466  const auto str_lit_expr =
467  cast_expr && cast_expr->get_optype() == kCAST
468  ? dynamic_cast<const Analyzer::Constant*>(cast_expr->get_operand())
469  : dynamic_cast<const Analyzer::Constant*>(domain_expr);
470  if (str_lit_expr && str_lit_expr->get_constval().stringval) {
471  sdp->getOrAddTransient(*str_lit_expr->get_constval().stringval);
472  }
473  }
474  }
475 }
476 
477 } // namespace
478 
479 void GroupByAndAggregate::addTransientStringLiterals(
480     const RelAlgExecutionUnit& ra_exe_unit,
481  Executor* executor,
482  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner) {
483  for (const auto group_expr : ra_exe_unit.groupby_exprs) {
484    add_transient_string_literals_for_expression(
485        group_expr.get(), executor, row_set_mem_owner);
486  }
487  for (const auto target_expr : ra_exe_unit.target_exprs) {
488  const auto& target_type = target_expr->get_type_info();
489  if (target_type.is_string() && target_type.get_compression() != kENCODING_DICT) {
490  continue;
491  }
492  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
493  if (agg_expr) {
494  if (agg_expr->get_aggtype() == kSINGLE_VALUE ||
495          agg_expr->get_aggtype() == kSAMPLE) {
496        add_transient_string_literals_for_expression(
497            agg_expr->get_arg(), executor, row_set_mem_owner);
498  }
499    } else {
500      add_transient_string_literals_for_expression(
501          target_expr, executor, row_set_mem_owner);
502  }
503  }
504  row_set_mem_owner->addLiteralStringDictProxy(executor->lit_str_dict_proxy_);
505 }
506 
507 CountDistinctDescriptors GroupByAndAggregate::initCountDistinctDescriptors() {
508  CountDistinctDescriptors count_distinct_descriptors;
509  for (const auto target_expr : ra_exe_unit_.target_exprs) {
510  auto agg_info = get_target_info(target_expr, g_bigint_count);
511  if (is_distinct_target(agg_info)) {
512  CHECK(agg_info.is_agg);
513  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
514  const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
515  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
516  if (arg_ti.is_string() && arg_ti.get_compression() != kENCODING_DICT) {
517  throw std::runtime_error(
518  "Strings must be dictionary-encoded for COUNT(DISTINCT).");
519  }
520  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_array()) {
521  throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
522  }
523  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
524  throw std::runtime_error(
525  "APPROX_COUNT_DISTINCT on geometry columns not supported");
526  }
527  if (agg_info.is_distinct && arg_ti.is_geometry()) {
528  throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
529  }
530  ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
531  auto arg_range_info =
532  arg_ti.is_fp() ? no_range_info : getExprRangeInfo(agg_expr->get_arg());
533  CountDistinctImplType count_distinct_impl_type{CountDistinctImplType::StdSet};
534  int64_t bitmap_sz_bits{0};
535  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
536  const auto error_rate = agg_expr->get_error_rate();
537  if (error_rate) {
538  CHECK(error_rate->get_type_info().get_type() == kINT);
539  CHECK_GE(error_rate->get_constval().intval, 1);
540  bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
541  } else {
542  bitmap_sz_bits = g_hll_precision_bits;
543  }
544  }
545  if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
546  !(arg_ti.is_array() || arg_ti.is_geometry())) { // TODO(alex): allow bitmap
547  // implementation for arrays
548  if (arg_range_info.isEmpty()) {
549  count_distinct_descriptors.emplace_back(
550          CountDistinctDescriptor{CountDistinctImplType::Bitmap,
551                                  0,
552  64,
553  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
554  device_type_,
555  1});
556  continue;
557  }
558  count_distinct_impl_type = CountDistinctImplType::Bitmap;
559  if (agg_info.agg_kind == kCOUNT) {
560  bitmap_sz_bits = arg_range_info.max - arg_range_info.min + 1;
561  const int64_t MAX_BITMAP_BITS{8 * 1000 * 1000 * 1000L};
562  if (bitmap_sz_bits <= 0 || bitmap_sz_bits > MAX_BITMAP_BITS) {
563  count_distinct_impl_type = CountDistinctImplType::StdSet;
564  }
565  }
566  }
567  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
568  count_distinct_impl_type == CountDistinctImplType::StdSet &&
569  !(arg_ti.is_array() || arg_ti.is_geometry())) {
570  count_distinct_impl_type = CountDistinctImplType::Bitmap;
571  }
572  if (g_enable_watchdog &&
573  count_distinct_impl_type == CountDistinctImplType::StdSet) {
574  throw WatchdogException("Cannot use a fast path for COUNT distinct");
575  }
576      const auto sub_bitmap_count =
577          get_count_distinct_sub_bitmap_count(bitmap_sz_bits, ra_exe_unit_, device_type_);
578      count_distinct_descriptors.emplace_back(
579  CountDistinctDescriptor{count_distinct_impl_type,
580  arg_range_info.min,
581  bitmap_sz_bits,
582  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
583  device_type_,
584  sub_bitmap_count});
585  } else {
586  count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
587  CountDistinctImplType::Invalid, 0, 0, false, device_type_, 0});
588  }
589  }
590  return count_distinct_descriptors;
591 }
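
// Illustrative sketch (not part of the original file): the bitmap-vs-std::set choice
// made in initCountDistinctDescriptors() above, reduced to the size check. A dense
// bitmap needs (max - min + 1) bits; if the range is invalid or the bitmap would exceed
// ~8 billion bits (~1 GB), the std::set fallback is used instead.
#include <cstdint>

enum class CountDistinctImplExample { Bitmap, StdSet };

constexpr CountDistinctImplExample choose_count_distinct_impl_example(int64_t min_val,
                                                                      int64_t max_val) {
  return (max_val - min_val + 1 <= 0 ||
          max_val - min_val + 1 > 8 * 1000 * 1000 * 1000LL)
             ? CountDistinctImplExample::StdSet
             : CountDistinctImplExample::Bitmap;
}

static_assert(choose_count_distinct_impl_example(0, 1000000) ==
                  CountDistinctImplExample::Bitmap,
              "");
static_assert(choose_count_distinct_impl_example(0, 20LL * 1000 * 1000 * 1000) ==
                  CountDistinctImplExample::StdSet,
              "");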
592 
603 KeylessInfo GroupByAndAggregate::getKeylessInfo(
604     const std::vector<Analyzer::Expr*>& target_expr_list,
605  const bool is_group_by) const {
606  bool keyless{true}, found{false}, shared_mem_support{false},
607  shared_mem_valid_data_type{true};
608  /* We currently support shared memory usage only for a limited subset of possible
609   * aggregate operations; shared_mem_support and shared_mem_valid_data_type are
610   * declared to track whether that support applies. */
611  int32_t num_agg_expr{0}; // used for shared memory support on the GPU
612  int32_t index{0};
613  for (const auto target_expr : target_expr_list) {
614  const auto agg_info = get_target_info(target_expr, g_bigint_count);
615  const auto chosen_type = get_compact_type(agg_info);
616  // TODO(Saman): should be eventually removed, once I make sure what data types can
617  // be used in this shared memory setting.
618 
619  shared_mem_valid_data_type =
620  shared_mem_valid_data_type && supportedTypeForGpuSharedMemUsage(chosen_type);
621 
622  if (agg_info.is_agg) {
623  num_agg_expr++;
624  }
625  if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
626  auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
627  CHECK(agg_expr);
628  const auto arg_expr = agg_arg(target_expr);
629  const bool float_argument_input = takes_float_argument(agg_info);
630  switch (agg_info.agg_kind) {
631  case kAVG:
632  ++index;
633  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
634  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
635  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
636  expr_range_info.hasNulls()) {
637  break;
638  }
639  }
640  found = true;
641  break;
642  case kCOUNT:
643  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
644  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
645  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
646  expr_range_info.hasNulls()) {
647  break;
648  }
649  }
650  found = true;
651  if (!agg_info.skip_null_val) {
652  shared_mem_support = true; // currently just support 8 bytes per group
653  }
654  break;
655  case kSUM: {
656  auto arg_ti = arg_expr->get_type_info();
657  if (constrained_not_null(arg_expr, ra_exe_unit_.quals)) {
658  arg_ti.set_notnull(true);
659  }
660  if (!arg_ti.get_notnull()) {
661  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
662  if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
663  !expr_range_info.hasNulls()) {
664  found = true;
665  }
666  } else {
667  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
668            switch (expr_range_info.getType()) {
669              case ExpressionRangeType::Float:
670              case ExpressionRangeType::Double:
671                if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
672                  found = true;
673                }
674                break;
675              case ExpressionRangeType::Integer:
676                if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
677  found = true;
678  }
679  break;
680  default:
681  break;
682  }
683  }
684  break;
685  }
686  case kMIN: {
687  CHECK(agg_expr && agg_expr->get_arg());
688  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
689  if (arg_ti.is_string() || arg_ti.is_array()) {
690  break;
691  }
692  auto expr_range_info =
693  getExpressionRange(agg_expr->get_arg(), query_infos_, executor_);
694  auto init_max = get_agg_initial_val(agg_info.agg_kind,
695  chosen_type,
696  is_group_by || float_argument_input,
697  float_argument_input ? sizeof(float) : 8);
698          switch (expr_range_info.getType()) {
699            case ExpressionRangeType::Float:
700            case ExpressionRangeType::Double: {
701              auto double_max =
702                  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
703              if (expr_range_info.getFpMax() < double_max) {
704                found = true;
705              }
706              break;
707            }
708            case ExpressionRangeType::Integer:
709              if (expr_range_info.getIntMax() < init_max) {
710  found = true;
711  }
712  break;
713  default:
714  break;
715  }
716  break;
717  }
718  case kMAX: {
719  CHECK(agg_expr && agg_expr->get_arg());
720  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
721  if (arg_ti.is_string() || arg_ti.is_array()) {
722  break;
723  }
724  auto expr_range_info =
725  getExpressionRange(agg_expr->get_arg(), query_infos_, executor_);
726  // NULL sentinel and init value for kMAX are identical, which results in
727  // ambiguity in detecting empty keys in presence of nulls.
728  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
729  expr_range_info.hasNulls()) {
730  break;
731  }
732  auto init_min = get_agg_initial_val(agg_info.agg_kind,
733  chosen_type,
734  is_group_by || float_argument_input,
735  float_argument_input ? sizeof(float) : 8);
736          switch (expr_range_info.getType()) {
737            case ExpressionRangeType::Float:
738            case ExpressionRangeType::Double: {
739              auto double_min =
740                  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
741              if (expr_range_info.getFpMin() > double_min) {
742                found = true;
743              }
744              break;
745            }
746            case ExpressionRangeType::Integer:
747              if (expr_range_info.getIntMin() > init_min) {
748  found = true;
749  }
750  break;
751  default:
752  break;
753  }
754  break;
755  }
756  default:
757  keyless = false;
758  break;
759  }
760  }
761  if (!keyless) {
762  break;
763  }
764  if (!found) {
765  ++index;
766  }
767  }
768 
769  // shouldn't use keyless for projection only
775  return {keyless && found,
776  index,
777  ((num_agg_expr == 1) && (target_expr_list.size() <= 2))
778  ? shared_mem_support && shared_mem_valid_data_type
779  : false};
780 }
781 
786 bool GroupByAndAggregate::supportedTypeForGpuSharedMemUsage(
787     const SQLTypeInfo& target_type_info) const {
788  bool result = false;
789  switch (target_type_info.get_type()) {
790  case SQLTypes::kTINYINT:
791  case SQLTypes::kSMALLINT:
792  case SQLTypes::kINT:
793  result = true;
794  break;
795  case SQLTypes::kTEXT:
796  if (target_type_info.get_compression() == EncodingType::kENCODING_DICT) {
797  result = true;
798  }
799  break;
800  default:
801  break;
802  }
803  return result;
804 }
805 
806 // TODO(Saman): this function is temporary and all these limitations should eventually
807 // be removed.
808 bool GroupByAndAggregate::supportedExprForGpuSharedMemUsage(Analyzer::Expr* expr) {
809  /*
810  UNNEST operations follow a slightly different internal memory layout compared to other
811  keyless aggregates. Currently, we opt out of using shared memory if there is any UNNEST
812  operation involved.
813  */
814  if (dynamic_cast<Analyzer::UOper*>(expr) &&
815  static_cast<Analyzer::UOper*>(expr)->get_optype() == kUNNEST) {
816  return false;
817  }
818  return true;
819 }
820 
821 bool GroupByAndAggregate::gpuCanHandleOrderEntries(
822     const std::list<Analyzer::OrderEntry>& order_entries) {
823  if (order_entries.size() > 1) { // TODO(alex): lift this restriction
824  return false;
825  }
826  for (const auto order_entry : order_entries) {
827  CHECK_GE(order_entry.tle_no, 1);
828  CHECK_LE(static_cast<size_t>(order_entry.tle_no), ra_exe_unit_.target_exprs.size());
829  const auto target_expr = ra_exe_unit_.target_exprs[order_entry.tle_no - 1];
830  if (!dynamic_cast<Analyzer::AggExpr*>(target_expr)) {
831  return false;
832  }
833  // TODO(alex): relax the restrictions
834  auto agg_expr = static_cast<Analyzer::AggExpr*>(target_expr);
835  if (agg_expr->get_is_distinct() || agg_expr->get_aggtype() == kAVG ||
836  agg_expr->get_aggtype() == kMIN || agg_expr->get_aggtype() == kMAX ||
837  agg_expr->get_aggtype() == kAPPROX_COUNT_DISTINCT) {
838  return false;
839  }
840  if (agg_expr->get_arg()) {
841  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
842  if (arg_ti.is_fp()) {
843  return false;
844  }
845  auto expr_range_info = getExprRangeInfo(agg_expr->get_arg());
846      // TODO(adb): QMD not actually initialized here?
847  if ((!(expr_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
848  /* query_mem_desc.getGroupbyColCount() == 1 */ false) ||
849  expr_range_info.has_nulls) &&
850  order_entry.is_desc == order_entry.nulls_first) {
851  return false;
852  }
853  }
854  const auto& target_ti = target_expr->get_type_info();
855  CHECK(!target_ti.is_array());
856  if (!target_ti.is_integer()) {
857  return false;
858  }
859  }
860  return true;
861 }
862 
863 GroupByAndAggregate::DiamondCodegen::DiamondCodegen(
864     llvm::Value* cond,
865  Executor* executor,
866  const bool chain_to_next,
867  const std::string& label_prefix,
868  DiamondCodegen* parent,
869  const bool share_false_edge_with_parent)
870  : executor_(executor), chain_to_next_(chain_to_next), parent_(parent) {
871  if (parent_) {
872    CHECK(!chain_to_next_);
873  }
874  cond_true_ = llvm::BasicBlock::Create(LL_CONTEXT, label_prefix + "_true", ROW_FUNC);
875  if (share_false_edge_with_parent) {
876    CHECK(parent);
877    orig_cond_false_ = cond_false_ = parent_->cond_false_;
878  } else {
879    cond_false_ = orig_cond_false_ =
880        llvm::BasicBlock::Create(LL_CONTEXT, label_prefix + "_false", ROW_FUNC);
881  }
882 
883  LL_BUILDER.CreateCondBr(cond, cond_true_, cond_false_);
884  LL_BUILDER.SetInsertPoint(cond_true_);
885 }
886 
888  CHECK(!parent_);
889  chain_to_next_ = true;
890 }
891 
892 void GroupByAndAggregate::DiamondCodegen::setFalseTarget(llvm::BasicBlock* cond_false) {
894  cond_false_ = cond_false;
895 }
896 
897 GroupByAndAggregate::DiamondCodegen::~DiamondCodegen() {
898  if (parent_) {
899    LL_BUILDER.CreateBr(parent_->cond_false_);
900  } else if (chain_to_next_) {
901    LL_BUILDER.CreateBr(cond_false_);
902  }
903  if (!parent_) {
904    LL_BUILDER.SetInsertPoint(orig_cond_false_);
905  }
906 }
907 
908 bool GroupByAndAggregate::codegen(llvm::Value* filter_result,
909  llvm::BasicBlock* sc_false,
910  const QueryMemoryDescriptor& query_mem_desc,
911  const CompilationOptions& co) {
912  CHECK(filter_result);
913 
914  bool can_return_error = false;
915  llvm::BasicBlock* filter_false{nullptr};
916 
917  {
918  const bool is_group_by = !ra_exe_unit_.groupby_exprs.empty();
919 
920    if (executor_->isArchMaxwell(co.device_type_)) {
921      executor_->prependForceSync();
922    }
923  DiamondCodegen filter_cfg(filter_result,
924  executor_,
925  !is_group_by || query_mem_desc.usesGetGroupValueFast(),
926  "filter",
927  nullptr,
928  false);
929  filter_false = filter_cfg.cond_false_;
930 
931    if (is_group_by) {
932      if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection &&
933          !use_streaming_top_n(ra_exe_unit_, query_mem_desc.didOutputColumnar())) {
934  const auto crt_matched = get_arg_by_name(ROW_FUNC, "crt_matched");
935  LL_BUILDER.CreateStore(LL_INT(int32_t(1)), crt_matched);
936  auto total_matched_ptr = get_arg_by_name(ROW_FUNC, "total_matched");
937        llvm::Value* old_total_matched_val{nullptr};
938        if (query_mem_desc.threadsShareMemory()) {
939          old_total_matched_val =
940  LL_BUILDER.CreateAtomicRMW(llvm::AtomicRMWInst::Add,
941  total_matched_ptr,
942  LL_INT(int32_t(1)),
943  llvm::AtomicOrdering::Monotonic);
944  } else {
945  old_total_matched_val = LL_BUILDER.CreateLoad(total_matched_ptr);
946  LL_BUILDER.CreateStore(
947  LL_BUILDER.CreateAdd(old_total_matched_val, LL_INT(int32_t(1))),
948  total_matched_ptr);
949  }
950  auto old_total_matched_ptr = get_arg_by_name(ROW_FUNC, "old_total_matched");
951  LL_BUILDER.CreateStore(old_total_matched_val, old_total_matched_ptr);
952  }
953 
954  auto agg_out_ptr_w_idx = codegenGroupBy(query_mem_desc, co, filter_cfg);
955  if (query_mem_desc.usesGetGroupValueFast() ||
956            query_mem_desc.getQueryDescriptionType() ==
957                QueryDescriptionType::GroupByPerfectHash) {
958        if (query_mem_desc.getGroupbyColCount() > 1) {
959  filter_cfg.setChainToNext();
960  }
961  // Don't generate null checks if the group slot is guaranteed to be non-null,
962        // as is the case for the get_group_value_fast* family.
963  can_return_error =
964  codegenAggCalls(agg_out_ptr_w_idx, {}, query_mem_desc, co, filter_cfg);
965  } else {
966  {
967  llvm::Value* nullcheck_cond{nullptr};
968  if (query_mem_desc.didOutputColumnar()) {
969  nullcheck_cond = LL_BUILDER.CreateICmpSGE(std::get<1>(agg_out_ptr_w_idx),
970  LL_INT(int32_t(0)));
971  } else {
972  nullcheck_cond = LL_BUILDER.CreateICmpNE(
973  std::get<0>(agg_out_ptr_w_idx),
974  llvm::ConstantPointerNull::get(
975  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)));
976  }
977  DiamondCodegen nullcheck_cfg(
978  nullcheck_cond, executor_, false, "groupby_nullcheck", &filter_cfg, false);
979  codegenAggCalls(agg_out_ptr_w_idx, {}, query_mem_desc, co, filter_cfg);
980  }
981  can_return_error = true;
982        if (query_mem_desc.getQueryDescriptionType() ==
983                QueryDescriptionType::Projection &&
984            use_streaming_top_n(ra_exe_unit_, query_mem_desc.didOutputColumnar())) {
985          // Ignore rejection on pushing current row to top-K heap.
986  LL_BUILDER.CreateRet(LL_INT(int32_t(0)));
987  } else {
988  CodeGenerator code_generator(executor_);
989  LL_BUILDER.CreateRet(LL_BUILDER.CreateNeg(LL_BUILDER.CreateTrunc(
990  // TODO(alex): remove the trunc once pos is converted to 32 bits
991  code_generator.posArg(nullptr),
992  get_int_type(32, LL_CONTEXT))));
993  }
994  }
995  } else {
996  if (ra_exe_unit_.estimator) {
997  std::stack<llvm::BasicBlock*> array_loops;
998  codegenEstimator(array_loops, filter_cfg, query_mem_desc, co);
999  } else {
1000  auto arg_it = ROW_FUNC->arg_begin();
1001  std::vector<llvm::Value*> agg_out_vec;
1002  for (int32_t i = 0; i < get_agg_count(ra_exe_unit_.target_exprs); ++i) {
1003  agg_out_vec.push_back(&*arg_it++);
1004  }
1005  can_return_error = codegenAggCalls(std::make_tuple(nullptr, nullptr),
1006  agg_out_vec,
1007  query_mem_desc,
1008  co,
1009  filter_cfg);
1010  }
1011  }
1012  }
1013 
1014  if (ra_exe_unit_.join_quals.empty()) {
1015  executor_->cgen_state_->ir_builder_.CreateRet(LL_INT(int32_t(0)));
1016  } else if (sc_false) {
1017  const auto saved_insert_block = LL_BUILDER.GetInsertBlock();
1018  LL_BUILDER.SetInsertPoint(sc_false);
1019  LL_BUILDER.CreateBr(filter_false);
1020  LL_BUILDER.SetInsertPoint(saved_insert_block);
1021  }
1022 
1023  return can_return_error;
1024 }
1025 
1026 llvm::Value* GroupByAndAggregate::codegenOutputSlot(
1027     llvm::Value* groups_buffer,
1028     const QueryMemoryDescriptor& query_mem_desc,
1029     const CompilationOptions& co,
1030     DiamondCodegen& diamond_codegen) {
1031  CHECK(query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection);
1032  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1033  const auto group_expr = ra_exe_unit_.groupby_exprs.front();
1034  CHECK(!group_expr);
1035  if (!query_mem_desc.didOutputColumnar()) {
1036  CHECK_EQ(size_t(0), query_mem_desc.getRowSize() % sizeof(int64_t));
1037  }
1038  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1039  ? 0
1040  : query_mem_desc.getRowSize() / sizeof(int64_t);
1041  CodeGenerator code_generator(executor_);
1042  if (use_streaming_top_n(ra_exe_unit_, query_mem_desc.didOutputColumnar())) {
1043  const auto& only_order_entry = ra_exe_unit_.sort_info.order_entries.front();
1044  CHECK_GE(only_order_entry.tle_no, int(1));
1045  const size_t target_idx = only_order_entry.tle_no - 1;
1046  CHECK_LT(target_idx, ra_exe_unit_.target_exprs.size());
1047  const auto order_entry_expr = ra_exe_unit_.target_exprs[target_idx];
1048  const auto chosen_bytes =
1049  static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(target_idx));
1050  auto order_entry_lv = executor_->cgen_state_->castToTypeIn(
1051        code_generator.codegen(order_entry_expr, true, co).front(), chosen_bytes * 8);
1052    const auto n = ra_exe_unit_.sort_info.offset + ra_exe_unit_.sort_info.limit;
1053    std::string fname = "get_bin_from_k_heap";
1054  const auto& oe_ti = order_entry_expr->get_type_info();
1055  llvm::Value* null_key_lv = nullptr;
1056  if (oe_ti.is_integer() || oe_ti.is_decimal() || oe_ti.is_time()) {
1057  const size_t bit_width = order_entry_lv->getType()->getIntegerBitWidth();
1058  switch (bit_width) {
1059  case 32:
1060  null_key_lv = LL_INT(static_cast<int32_t>(inline_int_null_val(oe_ti)));
1061  break;
1062  case 64:
1063  null_key_lv = LL_INT(static_cast<int64_t>(inline_int_null_val(oe_ti)));
1064  break;
1065  default:
1066  CHECK(false);
1067  }
1068  fname += "_int" + std::to_string(bit_width) + "_t";
1069  } else {
1070  CHECK(oe_ti.is_fp());
1071  if (order_entry_lv->getType()->isDoubleTy()) {
1072  null_key_lv = LL_FP(static_cast<double>(inline_fp_null_val(oe_ti)));
1073  } else {
1074  null_key_lv = LL_FP(static_cast<float>(inline_fp_null_val(oe_ti)));
1075  }
1076  fname += order_entry_lv->getType()->isDoubleTy() ? "_double" : "_float";
1077  }
1078    const auto key_slot_idx =
1079        get_heap_key_slot_index(ra_exe_unit_.target_exprs, target_idx);
1080    return emitCall(
1081  fname,
1082  {groups_buffer,
1083  LL_INT(n),
1084  LL_INT(row_size_quad),
1085  LL_INT(static_cast<uint32_t>(query_mem_desc.getColOffInBytes(key_slot_idx))),
1086  LL_BOOL(only_order_entry.is_desc),
1087  LL_BOOL(!order_entry_expr->get_type_info().get_notnull()),
1088  LL_BOOL(only_order_entry.nulls_first),
1089  null_key_lv,
1090  order_entry_lv});
1091  } else {
1092  llvm::Value* output_buffer_entry_count_lv{nullptr};
1094  output_buffer_entry_count_lv =
1095  LL_BUILDER.CreateLoad(get_arg_by_name(ROW_FUNC, "max_matched"));
1096  CHECK(output_buffer_entry_count_lv);
1097  }
1098  const auto group_expr_lv =
1099  LL_BUILDER.CreateLoad(get_arg_by_name(ROW_FUNC, "old_total_matched"));
1100  std::vector<llvm::Value*> args{
1101  groups_buffer,
1102  output_buffer_entry_count_lv
1103  ? output_buffer_entry_count_lv
1104  : LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1105  group_expr_lv,
1106  code_generator.posArg(nullptr)};
1107  if (query_mem_desc.didOutputColumnar()) {
1108  const auto columnar_output_offset =
1109  emitCall("get_columnar_scan_output_offset", args);
1110  return columnar_output_offset;
1111  }
1112  args.push_back(LL_INT(row_size_quad));
1113  return emitCall("get_scan_output_slot", args);
1114  }
1115 }
1116 
1117 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenGroupBy(
1118  const QueryMemoryDescriptor& query_mem_desc,
1119  const CompilationOptions& co,
1120  DiamondCodegen& diamond_codegen) {
1121  auto arg_it = ROW_FUNC->arg_begin();
1122  auto groups_buffer = arg_it++;
1123 
1124  std::stack<llvm::BasicBlock*> array_loops;
1125 
1126  // TODO(Saman): move this logic outside of this function.
1127  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1128    if (query_mem_desc.didOutputColumnar()) {
1129  return std::make_tuple(
1130  &*groups_buffer,
1131  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen));
1132  } else {
1133  return std::make_tuple(
1134  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen),
1135  nullptr);
1136  }
1137  }
1138 
1139  CHECK(query_mem_desc.getQueryDescriptionType() ==
1140            QueryDescriptionType::GroupByBaselineHash ||
1141        query_mem_desc.getQueryDescriptionType() ==
1142            QueryDescriptionType::GroupByPerfectHash);
1143 
1144  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1145  ? 0
1146  : query_mem_desc.getRowSize() / sizeof(int64_t);
1147 
1148  const auto col_width_size = query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1149  ? sizeof(int64_t)
1150  : query_mem_desc.getEffectiveKeyWidth();
1151  // for multi-column group by
1152  llvm::Value* group_key = nullptr;
1153  llvm::Value* key_size_lv = nullptr;
1154 
1155  if (!query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1156  key_size_lv = LL_INT(static_cast<int32_t>(query_mem_desc.getGroupbyColCount()));
1157    if (query_mem_desc.getQueryDescriptionType() ==
1158        QueryDescriptionType::GroupByPerfectHash) {
1159      group_key =
1160          LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1161    } else if (query_mem_desc.getQueryDescriptionType() ==
1162               QueryDescriptionType::GroupByBaselineHash) {
1163      group_key =
1164  col_width_size == sizeof(int32_t)
1165  ? LL_BUILDER.CreateAlloca(llvm::Type::getInt32Ty(LL_CONTEXT), key_size_lv)
1166  : LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1167  }
1168  CHECK(group_key);
1169  CHECK(key_size_lv);
1170  }
1171 
1172  int32_t subkey_idx = 0;
1173  CHECK(query_mem_desc.getGroupbyColCount() == ra_exe_unit_.groupby_exprs.size());
1174  for (const auto group_expr : ra_exe_unit_.groupby_exprs) {
1175  const auto col_range_info = getExprRangeInfo(group_expr.get());
1176  const auto translated_null_value = static_cast<int64_t>(
1177  query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1178  ? checked_int64_t(query_mem_desc.getMaxVal()) +
1179  (query_mem_desc.getBucket() ? query_mem_desc.getBucket() : 1)
1180  : checked_int64_t(col_range_info.max) +
1181  (col_range_info.bucket ? col_range_info.bucket : 1));
1182 
1183  const bool col_has_nulls =
1184        query_mem_desc.getQueryDescriptionType() ==
1185                QueryDescriptionType::GroupByPerfectHash
1186            ? (query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1187  ? query_mem_desc.hasNulls()
1188  : col_range_info.has_nulls)
1189  : false;
1190 
1191  const auto group_expr_lvs =
1192  executor_->groupByColumnCodegen(group_expr.get(),
1193  col_width_size,
1194  co,
1195  col_has_nulls,
1196  translated_null_value,
1197  diamond_codegen,
1198  array_loops,
1199  query_mem_desc.threadsShareMemory());
1200  const auto group_expr_lv = group_expr_lvs.translated_value;
1201  if (query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1202  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1203  return codegenSingleColumnPerfectHash(query_mem_desc,
1204  co,
1205  &*groups_buffer,
1206  group_expr_lv,
1207  group_expr_lvs.original_value,
1208  row_size_quad);
1209  } else {
1210  // store the sub-key to the buffer
1211  LL_BUILDER.CreateStore(group_expr_lv,
1212  LL_BUILDER.CreateGEP(group_key, LL_INT(subkey_idx++)));
1213  }
1214  }
1215  if (query_mem_desc.getQueryDescriptionType() ==
1216      QueryDescriptionType::GroupByPerfectHash) {
1217    CHECK(ra_exe_unit_.groupby_exprs.size() != 1);
1218    return codegenMultiColumnPerfectHash(
1219        &*groups_buffer, group_key, key_size_lv, query_mem_desc, row_size_quad);
1220  } else if (query_mem_desc.getQueryDescriptionType() ==
1221             QueryDescriptionType::GroupByBaselineHash) {
1222    return codegenMultiColumnBaselineHash(co,
1223                                          &*groups_buffer,
1224  group_key,
1225  key_size_lv,
1226  query_mem_desc,
1227  col_width_size,
1228  row_size_quad);
1229  }
1230  CHECK(false);
1231  return std::make_tuple(nullptr, nullptr);
1232 }
1233 
1234 std::tuple<llvm::Value*, llvm::Value*>
1235 GroupByAndAggregate::codegenSingleColumnPerfectHash(
1236     const QueryMemoryDescriptor& query_mem_desc,
1237  const CompilationOptions& co,
1238  llvm::Value* groups_buffer,
1239  llvm::Value* group_expr_lv_translated,
1240  llvm::Value* group_expr_lv_original,
1241  const int32_t row_size_quad) {
1242  CHECK(query_mem_desc.usesGetGroupValueFast());
1243  std::string get_group_fn_name{query_mem_desc.didOutputColumnar()
1244  ? "get_columnar_group_bin_offset"
1245  : "get_group_value_fast"};
1246  if (!query_mem_desc.didOutputColumnar() && query_mem_desc.hasKeylessHash()) {
1247  get_group_fn_name += "_keyless";
1248  }
1249  if (query_mem_desc.interleavedBins(co.device_type_)) {
1250  CHECK(!query_mem_desc.didOutputColumnar());
1251  CHECK(query_mem_desc.hasKeylessHash());
1252  get_group_fn_name += "_semiprivate";
1253  }
1254  std::vector<llvm::Value*> get_group_fn_args{&*groups_buffer,
1255  &*group_expr_lv_translated};
1256  if (group_expr_lv_original && get_group_fn_name == "get_group_value_fast" &&
1257  query_mem_desc.mustUseBaselineSort()) {
1258  get_group_fn_name += "_with_original_key";
1259  get_group_fn_args.push_back(group_expr_lv_original);
1260  }
1261  get_group_fn_args.push_back(LL_INT(query_mem_desc.getMinVal()));
1262  get_group_fn_args.push_back(LL_INT(query_mem_desc.getBucket()));
1263  if (!query_mem_desc.hasKeylessHash()) {
1264  if (!query_mem_desc.didOutputColumnar()) {
1265  get_group_fn_args.push_back(LL_INT(row_size_quad));
1266  }
1267  } else {
1268  if (!query_mem_desc.didOutputColumnar()) {
1269  get_group_fn_args.push_back(LL_INT(row_size_quad));
1270  }
1271  if (query_mem_desc.interleavedBins(co.device_type_)) {
1272  auto warp_idx = emitCall("thread_warp_idx", {LL_INT(executor_->warpSize())});
1273  get_group_fn_args.push_back(warp_idx);
1274  get_group_fn_args.push_back(LL_INT(executor_->warpSize()));
1275  }
1276  }
1277  if (get_group_fn_name == "get_columnar_group_bin_offset") {
1278  return std::make_tuple(&*groups_buffer,
1279  emitCall(get_group_fn_name, get_group_fn_args));
1280  }
1281  return std::make_tuple(emitCall(get_group_fn_name, get_group_fn_args), nullptr);
1282 }
1283 
1284 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenMultiColumnPerfectHash(
1285  llvm::Value* groups_buffer,
1286  llvm::Value* group_key,
1287  llvm::Value* key_size_lv,
1288  const QueryMemoryDescriptor& query_mem_desc,
1289  const int32_t row_size_quad) {
1290  CHECK(query_mem_desc.getQueryDescriptionType() ==
1291        QueryDescriptionType::GroupByPerfectHash);
1292  // compute the index (perfect hash)
1293  auto perfect_hash_func = codegenPerfectHashFunction();
1294  auto hash_lv =
1295  LL_BUILDER.CreateCall(perfect_hash_func, std::vector<llvm::Value*>{group_key});
1296 
1297  if (query_mem_desc.didOutputColumnar()) {
1298  const std::string set_matching_func_name{
1299  "set_matching_group_value_perfect_hash_columnar"};
1300  const std::vector<llvm::Value*> set_matching_func_arg{
1301  groups_buffer,
1302  hash_lv,
1303  group_key,
1304  key_size_lv,
1305  llvm::ConstantInt::get(get_int_type(32, LL_CONTEXT),
1306  query_mem_desc.getEntryCount())};
1307  emitCall(set_matching_func_name, set_matching_func_arg);
1308  return std::make_tuple(groups_buffer, hash_lv);
1309  } else {
1310  return std::make_tuple(
1311  emitCall("get_matching_group_value_perfect_hash",
1312  {groups_buffer, hash_lv, group_key, key_size_lv, LL_INT(row_size_quad)}),
1313  nullptr);
1314  }
1315 }
1316 
1317 std::tuple<llvm::Value*, llvm::Value*>
1318 GroupByAndAggregate::codegenMultiColumnBaselineHash(
1319     const CompilationOptions& co,
1320  llvm::Value* groups_buffer,
1321  llvm::Value* group_key,
1322  llvm::Value* key_size_lv,
1323  const QueryMemoryDescriptor& query_mem_desc,
1324  const size_t key_width,
1325  const int32_t row_size_quad) {
1326  auto arg_it = ROW_FUNC->arg_begin(); // groups_buffer
1327  ++arg_it; // current match count
1328  ++arg_it; // total match count
1329  ++arg_it; // old match count
1330  ++arg_it; // output buffer slots count
1331  ++arg_it; // aggregate init values
1332  CHECK(arg_it->getName() == "agg_init_val");
1333  if (group_key->getType() != llvm::Type::getInt64PtrTy(LL_CONTEXT)) {
1334  CHECK(key_width == sizeof(int32_t));
1335  group_key =
1336  LL_BUILDER.CreatePointerCast(group_key, llvm::Type::getInt64PtrTy(LL_CONTEXT));
1337  }
1338  std::vector<llvm::Value*> func_args{
1339  groups_buffer,
1340  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1341  &*group_key,
1342  &*key_size_lv,
1343  LL_INT(static_cast<int32_t>(key_width))};
1344  std::string func_name{"get_group_value"};
1345  if (query_mem_desc.didOutputColumnar()) {
1346  func_name += "_columnar_slot";
1347  } else {
1348  func_args.push_back(LL_INT(row_size_quad));
1349  func_args.push_back(&*arg_it);
1350  }
1351  if (co.with_dynamic_watchdog_) {
1352  func_name += "_with_watchdog";
1353  }
1354  if (query_mem_desc.didOutputColumnar()) {
1355  return std::make_tuple(groups_buffer, emitCall(func_name, func_args));
1356  } else {
1357  return std::make_tuple(emitCall(func_name, func_args), nullptr);
1358  }
1359 }
1360 
1361 llvm::Function* GroupByAndAggregate::codegenPerfectHashFunction() {
1362  CHECK_GT(ra_exe_unit_.groupby_exprs.size(), size_t(1));
1363  auto ft = llvm::FunctionType::get(
1364  get_int_type(32, LL_CONTEXT),
1365  std::vector<llvm::Type*>{llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)},
1366  false);
1367  auto key_hash_func = llvm::Function::Create(ft,
1368  llvm::Function::ExternalLinkage,
1369  "perfect_key_hash",
1370  executor_->cgen_state_->module_);
1371  executor_->cgen_state_->helper_functions_.push_back(key_hash_func);
1372  mark_function_always_inline(key_hash_func);
1373  auto& key_buff_arg = *key_hash_func->args().begin();
1374  llvm::Value* key_buff_lv = &key_buff_arg;
1375  auto bb = llvm::BasicBlock::Create(LL_CONTEXT, "entry", key_hash_func);
1376  llvm::IRBuilder<> key_hash_func_builder(bb);
1377  llvm::Value* hash_lv{llvm::ConstantInt::get(get_int_type(64, LL_CONTEXT), 0)};
1378  std::vector<int64_t> cardinalities;
1379  for (const auto groupby_expr : ra_exe_unit_.groupby_exprs) {
1380  auto col_range_info = getExprRangeInfo(groupby_expr.get());
1381  CHECK(col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash);
1382  cardinalities.push_back(getBucketedCardinality(col_range_info));
1383  }
1384  size_t dim_idx = 0;
1385  for (const auto groupby_expr : ra_exe_unit_.groupby_exprs) {
1386  auto key_comp_lv = key_hash_func_builder.CreateLoad(
1387  key_hash_func_builder.CreateGEP(key_buff_lv, LL_INT(dim_idx)));
1388  auto col_range_info = getExprRangeInfo(groupby_expr.get());
1389  auto crt_term_lv =
1390  key_hash_func_builder.CreateSub(key_comp_lv, LL_INT(col_range_info.min));
1391  if (col_range_info.bucket) {
1392  crt_term_lv =
1393  key_hash_func_builder.CreateSDiv(crt_term_lv, LL_INT(col_range_info.bucket));
1394  }
1395  for (size_t prev_dim_idx = 0; prev_dim_idx < dim_idx; ++prev_dim_idx) {
1396  crt_term_lv = key_hash_func_builder.CreateMul(crt_term_lv,
1397  LL_INT(cardinalities[prev_dim_idx]));
1398  }
1399  hash_lv = key_hash_func_builder.CreateAdd(hash_lv, crt_term_lv);
1400  ++dim_idx;
1401  }
1402  key_hash_func_builder.CreateRet(
1403  key_hash_func_builder.CreateTrunc(hash_lv, get_int_type(32, LL_CONTEXT)));
1404  return key_hash_func;
1405 }
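
// Illustrative sketch (not part of the original file): the multi-column perfect hash
// that the generated "perfect_key_hash" function above computes, written as plain C++.
// Each key component is shifted by its column minimum, divided by its bucket size, and
// scaled by the product of the bucketed cardinalities of the preceding columns. The
// struct and helper names below are invented for the example.
#include <cstdint>
#include <vector>

struct KeyColumnExample {
  int64_t min;
  int64_t bucket;       // 0 means no bucketing
  int64_t cardinality;  // bucketed cardinality, as computed by getBucketedCardinality()
};

inline int32_t perfect_key_hash_example(const std::vector<int64_t>& key,
                                        const std::vector<KeyColumnExample>& cols) {
  int64_t hash = 0;
  for (size_t dim = 0; dim < key.size(); ++dim) {
    int64_t term = key[dim] - cols[dim].min;
    if (cols[dim].bucket) {
      term /= cols[dim].bucket;
    }
    for (size_t prev = 0; prev < dim; ++prev) {
      term *= cols[prev].cardinality;
    }
    hash += term;
  }
  return static_cast<int32_t>(hash);
}

// Two group-by columns, a in [0, 9] (cardinality 10) and b in [100, 104] (cardinality
// 5): the key (a=3, b=102) hashes to 3 + (102 - 100) * 10 = 23.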
1406 
1407 llvm::Value* GroupByAndAggregate::convertNullIfAny(const SQLTypeInfo& arg_type,
1408                                                    const TargetInfo& agg_info,
1409  llvm::Value* target) {
1410  const auto& agg_type = agg_info.sql_type;
1411  const size_t chosen_bytes = agg_type.get_size();
1412 
1413  bool need_conversion{false};
1414  llvm::Value* arg_null{nullptr};
1415  llvm::Value* agg_null{nullptr};
1416  llvm::Value* target_to_cast{target};
1417  if (arg_type.is_fp()) {
1418  arg_null = executor_->cgen_state_->inlineFpNull(arg_type);
1419  if (agg_type.is_fp()) {
1420  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1421  if (!static_cast<llvm::ConstantFP*>(arg_null)->isExactlyValue(
1422  static_cast<llvm::ConstantFP*>(agg_null)->getValueAPF())) {
1423  need_conversion = true;
1424  }
1425  } else {
1426  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
1427  return target;
1428  }
1429  } else {
1430  arg_null = executor_->cgen_state_->inlineIntNull(arg_type);
1431  if (agg_type.is_fp()) {
1432  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1433  need_conversion = true;
1434  target_to_cast = executor_->castToFP(target);
1435  } else {
1436  agg_null = executor_->cgen_state_->inlineIntNull(agg_type);
1437  if ((static_cast<llvm::ConstantInt*>(arg_null)->getBitWidth() !=
1438  static_cast<llvm::ConstantInt*>(agg_null)->getBitWidth()) ||
1439  (static_cast<llvm::ConstantInt*>(arg_null)->getValue() !=
1440  static_cast<llvm::ConstantInt*>(agg_null)->getValue())) {
1441  need_conversion = true;
1442  }
1443  }
1444  }
1445  if (need_conversion) {
1446  auto cmp = arg_type.is_fp() ? LL_BUILDER.CreateFCmpOEQ(target, arg_null)
1447  : LL_BUILDER.CreateICmpEQ(target, arg_null);
1448  return LL_BUILDER.CreateSelect(
1449  cmp,
1450  agg_null,
1451  executor_->cgen_state_->castToTypeIn(target_to_cast, chosen_bytes << 3));
1452  } else {
1453  return target;
1454  }
1455 }
1456 
1457 llvm::Value* GroupByAndAggregate::codegenWindowRowPointer(
1458     const Analyzer::WindowFunction* window_func,
1459  const QueryMemoryDescriptor& query_mem_desc,
1460  const CompilationOptions& co,
1461  DiamondCodegen& diamond_codegen) {
1462  const auto window_func_context =
1463      WindowProjectNodeContext::getActiveWindowFunctionContext();
1464  if (window_func_context && window_function_is_aggregate(window_func->getKind())) {
1465  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1466  ? 0
1467  : query_mem_desc.getRowSize() / sizeof(int64_t);
1468  auto arg_it = ROW_FUNC->arg_begin();
1469  auto groups_buffer = arg_it++;
1470  CodeGenerator code_generator(executor_);
1471  if (!window_func_context->getRowNumber()) {
1472  CHECK(window_func->getKind() == SqlWindowFunctionKind::COUNT);
1473  window_func_context->setRowNumber(emitCall(
1474  "row_number_window_func",
1475  {LL_INT(reinterpret_cast<const int64_t>(window_func_context->output())),
1476  code_generator.posArg(nullptr)}));
1477  }
1478  const auto pos_in_window = LL_BUILDER.CreateTrunc(window_func_context->getRowNumber(),
1479  get_int_type(32, LL_CONTEXT));
1480  llvm::Value* entry_count_lv =
1481  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount()));
1482  std::vector<llvm::Value*> args{
1483  &*groups_buffer, entry_count_lv, pos_in_window, code_generator.posArg(nullptr)};
1484  if (query_mem_desc.didOutputColumnar()) {
1485  const auto columnar_output_offset =
1486  emitCall("get_columnar_scan_output_offset", args);
1487  return LL_BUILDER.CreateSExt(columnar_output_offset, get_int_type(64, LL_CONTEXT));
1488  }
1489  args.push_back(LL_INT(row_size_quad));
1490  return emitCall("get_scan_output_slot", args);
1491  }
1492  auto arg_it = ROW_FUNC->arg_begin();
1493  auto groups_buffer = arg_it++;
1494  return codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen);
1495 }
1496 
1497 bool GroupByAndAggregate::codegenAggCalls(
1498     const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx_in,
1499  const std::vector<llvm::Value*>& agg_out_vec,
1500  const QueryMemoryDescriptor& query_mem_desc,
1501  const CompilationOptions& co,
1502  DiamondCodegen& diamond_codegen) {
1503  auto agg_out_ptr_w_idx = agg_out_ptr_w_idx_in;
1504  // TODO(alex): unify the two cases, the output for non-group by queries
1505  // should be a contiguous buffer
1506  const bool is_group_by{std::get<0>(agg_out_ptr_w_idx)};
1507  bool can_return_error = false;
1508  if (is_group_by) {
1509  CHECK(agg_out_vec.empty());
1510  } else {
1511  CHECK(!agg_out_vec.empty());
1512  }
1513 
1514  // The output buffer is cast into a byte stream to be able to handle data elements of
1515  // different sizes (only used when actual column width sizes are used).
1516  llvm::Value* output_buffer_byte_stream{nullptr};
1517  llvm::Value* out_row_idx{nullptr};
1518  if (query_mem_desc.didOutputColumnar() && !g_cluster &&
1519      query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1520    output_buffer_byte_stream = LL_BUILDER.CreateBitCast(
1521  std::get<0>(agg_out_ptr_w_idx),
1522  llvm::PointerType::get(llvm::Type::getInt8Ty(LL_CONTEXT), 0));
1523  output_buffer_byte_stream->setName("out_buff_b_stream");
1524  CHECK(std::get<1>(agg_out_ptr_w_idx));
1525  out_row_idx = LL_BUILDER.CreateZExt(std::get<1>(agg_out_ptr_w_idx),
1526  llvm::Type::getInt64Ty(LL_CONTEXT));
1527  out_row_idx->setName("out_row_idx");
1528  }
1529 
1530  TargetExprCodegenBuilder target_builder(query_mem_desc, ra_exe_unit_, is_group_by);
1531  for (size_t target_idx = 0; target_idx < ra_exe_unit_.target_exprs.size();
1532  ++target_idx) {
1533  auto target_expr = ra_exe_unit_.target_exprs[target_idx];
1534  CHECK(target_expr);
1535 
1536  target_builder(target_expr, executor_, co);
1537  }
1538 
1539  target_builder.codegen(this,
1540  executor_,
1541  query_mem_desc,
1542  co,
1543  agg_out_ptr_w_idx,
1544  agg_out_vec,
1545  output_buffer_byte_stream,
1546  out_row_idx,
1547  diamond_codegen);
1548 
1549  for (auto target_expr : ra_exe_unit_.target_exprs) {
1550  CHECK(target_expr);
1551  executor_->plan_state_->isLazyFetchColumn(target_expr);
1552  }
1553 
1554  return can_return_error;
1555 }
1556 
1560 llvm::Value* GroupByAndAggregate::codegenAggColumnPtr(
1561     llvm::Value* output_buffer_byte_stream,
1562  llvm::Value* out_row_idx,
1563  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx,
1564  const QueryMemoryDescriptor& query_mem_desc,
1565  const size_t chosen_bytes,
1566  const size_t agg_out_off,
1567  const size_t target_idx) {
1568  llvm::Value* agg_col_ptr{nullptr};
1569  if (query_mem_desc.didOutputColumnar()) {
1570  // TODO(Saman): remove the second columnar branch, and support all query description
1571  // types through the first branch. Then, input arguments should also be cleaned up
1572    if (!g_cluster &&
1573        query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
1574      CHECK(chosen_bytes == 1 || chosen_bytes == 2 || chosen_bytes == 4 ||
1575  chosen_bytes == 8);
1576  CHECK(output_buffer_byte_stream);
1577  CHECK(out_row_idx);
1578  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1579  // multiplying by chosen_bytes, i.e., << log2(chosen_bytes)
1580  auto out_per_col_byte_idx =
1581  LL_BUILDER.CreateShl(out_row_idx, __builtin_ffs(chosen_bytes) - 1);
1582  auto byte_offset = LL_BUILDER.CreateAdd(out_per_col_byte_idx,
1583  LL_INT(static_cast<int64_t>(col_off)));
1584  byte_offset->setName("out_byte_off_target_" + std::to_string(target_idx));
1585  auto output_ptr = LL_BUILDER.CreateGEP(output_buffer_byte_stream, byte_offset);
1586  agg_col_ptr = LL_BUILDER.CreateBitCast(
1587  output_ptr,
1588  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1589  agg_col_ptr->setName("out_ptr_target_" + std::to_string(target_idx));
1590  } else {
1591  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1592  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1593  col_off /= chosen_bytes;
1594  CHECK(std::get<1>(agg_out_ptr_w_idx));
1595  auto offset = LL_BUILDER.CreateAdd(std::get<1>(agg_out_ptr_w_idx), LL_INT(col_off));
1596  agg_col_ptr = LL_BUILDER.CreateGEP(
1597  LL_BUILDER.CreateBitCast(
1598  std::get<0>(agg_out_ptr_w_idx),
1599  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0)),
1600  offset);
1601  }
1602  } else {
1603  uint32_t col_off = query_mem_desc.getColOnlyOffInBytes(agg_out_off);
1604  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1605  col_off /= chosen_bytes;
1606  agg_col_ptr = LL_BUILDER.CreateGEP(
1607  LL_BUILDER.CreateBitCast(
1608  std::get<0>(agg_out_ptr_w_idx),
1609  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0)),
1610  LL_INT(col_off));
1611  }
1612  CHECK(agg_col_ptr);
1613  return agg_col_ptr;
1614 }
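The columnar branch above resolves each aggregate slot to base + column offset + row index * slot width; the CreateShl by __builtin_ffs(chosen_bytes) - 1 is simply that multiply expressed as a shift for the power-of-two widths the CHECK allows. A minimal host-side sketch of the same arithmetic (names are illustrative, not part of the codebase):

#include <cstdint>

// Sketch only: mirrors the pointer math the generated IR performs for the
// columnar projection case. chosen_bytes must be 1, 2, 4 or 8.
int8_t* columnar_slot_ptr(int8_t* output_buffer,  // "out_buff_b_stream"
                          uint64_t out_row_idx,   // row index within the output buffer
                          uint32_t col_off,       // byte offset where this column's slots begin
                          size_t chosen_bytes) {  // compact width of one slot
  const uint64_t per_col_byte_idx = out_row_idx * chosen_bytes;  // == out_row_idx << log2(chosen_bytes)
  return output_buffer + col_off + per_col_byte_idx;             // later bitcast to an intN pointer
}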
1615 
1616 void GroupByAndAggregate::codegenEstimator(
1617  std::stack<llvm::BasicBlock*>& array_loops,
1618  GroupByAndAggregate::DiamondCodegen& diamond_codegen,
1619  const QueryMemoryDescriptor& query_mem_desc,
1620  const CompilationOptions& co) {
1621  const auto& estimator_arg = ra_exe_unit_.estimator->getArgument();
1622  auto estimator_comp_count_lv = LL_INT(static_cast<int32_t>(estimator_arg.size()));
1623  auto estimator_key_lv = LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT),
1624  estimator_comp_count_lv);
1625  int32_t subkey_idx = 0;
1626  for (const auto estimator_arg_comp : estimator_arg) {
1627  const auto estimator_arg_comp_lvs =
1628  executor_->groupByColumnCodegen(estimator_arg_comp.get(),
1629  query_mem_desc.getEffectiveKeyWidth(),
1630  co,
1631  false,
1632  0,
1633  diamond_codegen,
1634  array_loops,
1635  true);
1636  CHECK(!estimator_arg_comp_lvs.original_value);
1637  const auto estimator_arg_comp_lv = estimator_arg_comp_lvs.translated_value;
1638  // store the sub-key to the buffer
1639  LL_BUILDER.CreateStore(estimator_arg_comp_lv,
1640  LL_BUILDER.CreateGEP(estimator_key_lv, LL_INT(subkey_idx++)));
1641  }
1642  const auto int8_ptr_ty = llvm::PointerType::get(get_int_type(8, LL_CONTEXT), 0);
1643  const auto bitmap = LL_BUILDER.CreateBitCast(&*ROW_FUNC->arg_begin(), int8_ptr_ty);
1644  const auto key_bytes = LL_BUILDER.CreateBitCast(estimator_key_lv, int8_ptr_ty);
1645  const auto estimator_comp_bytes_lv =
1646  LL_INT(static_cast<int32_t>(estimator_arg.size() * sizeof(int64_t)));
1647  const auto bitmap_size_lv =
1648  LL_INT(static_cast<uint32_t>(ra_exe_unit_.estimator->getBufferSize()));
1649  emitCall(ra_exe_unit_.estimator->getRuntimeFunctionName(),
1650  {bitmap, &*bitmap_size_lv, key_bytes, &*estimator_comp_bytes_lv});
1651 }
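Conceptually, the IR emitted by codegenEstimator packs the translated group-by sub-keys into a stack-allocated int64_t buffer and passes that buffer, together with the estimator bitmap taken from the row function's first argument, to the estimator's runtime function. A rough runtime-level equivalent, with the entry-point name assumed rather than taken from the estimator interface:

#include <cstdint>
#include <vector>

// Hypothetical runtime entry point standing in for the symbol returned by
// ra_exe_unit_.estimator->getRuntimeFunctionName().
extern "C" void update_estimator(int8_t* bitmap,
                                 uint32_t bitmap_size_bytes,
                                 const int8_t* key_bytes,
                                 int32_t key_byte_count);

// Illustrative per-row update: the generated IR alloca's an int64_t buffer,
// stores one translated group-by sub-key per slot, then calls the estimator
// with the raw key bytes and the bitmap size.
void estimator_update_for_row(int8_t* estimator_bitmap,
                              uint32_t bitmap_size_bytes,
                              const std::vector<int64_t>& subkeys) {
  update_estimator(estimator_bitmap,
                   bitmap_size_bytes,
                   reinterpret_cast<const int8_t*>(subkeys.data()),
                   static_cast<int32_t>(subkeys.size() * sizeof(int64_t)));
}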
1652 
1653 extern "C" void agg_count_distinct(int64_t* agg, const int64_t val) {
1654  reinterpret_cast<std::set<int64_t>*>(*agg)->insert(val);
1655 }
1656 
1657 extern "C" void agg_count_distinct_skip_val(int64_t* agg,
1658  const int64_t val,
1659  const int64_t skip_val) {
1660  if (val != skip_val) {
1661  agg_count_distinct(agg, val);
1662  }
1663 }
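These two extern "C" helpers back the exact COUNT(DISTINCT) path: the aggregate slot holds a pointer to a std::set<int64_t>, agg_count_distinct inserts one value, and the skip_val variant drops the null sentinel. A small usage sketch, assuming the helpers above are in scope (in the engine the set is owned by the row set memory owner rather than a local):

#include <cstdint>
#include <set>

int main() {
  std::set<int64_t> distinct_vals;                               // backing store for one slot
  int64_t agg_slot = reinterpret_cast<int64_t>(&distinct_vals);  // slot holds the pointer

  const int64_t skip_val = -2147483648;  // e.g. the null sentinel of a 32-bit column
  agg_count_distinct(&agg_slot, 7);
  agg_count_distinct(&agg_slot, 7);                            // duplicate, set keeps one copy
  agg_count_distinct_skip_val(&agg_slot, skip_val, skip_val);  // ignored: val == skip_val

  return distinct_vals.size() == 1 ? 0 : 1;  // final COUNT(DISTINCT) is 1
}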
1664 
1665 void GroupByAndAggregate::codegenCountDistinct(
1666  const size_t target_idx,
1667  const Analyzer::Expr* target_expr,
1668  std::vector<llvm::Value*>& agg_args,
1669  const QueryMemoryDescriptor& query_mem_desc,
1670  const ExecutorDeviceType device_type) {
1671  const auto agg_info = get_target_info(target_expr, g_bigint_count);
1672  const auto& arg_ti =
1673  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1674  if (arg_ti.is_fp()) {
1675  agg_args.back() = executor_->cgen_state_->ir_builder_.CreateBitCast(
1676  agg_args.back(), get_int_type(64, executor_->cgen_state_->context_));
1677  }
1678  const auto& count_distinct_descriptor =
1679  query_mem_desc.getCountDistinctDescriptor(target_idx);
1680  CHECK(count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid);
1681  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
1682  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1683  agg_args.push_back(LL_INT(int32_t(count_distinct_descriptor.bitmap_sz_bits)));
1684  if (device_type == ExecutorDeviceType::GPU) {
1685  const auto base_dev_addr = getAdditionalLiteral(-1);
1686  const auto base_host_addr = getAdditionalLiteral(-2);
1687  agg_args.push_back(base_dev_addr);
1688  agg_args.push_back(base_host_addr);
1689  emitCall("agg_approximate_count_distinct_gpu", agg_args);
1690  } else {
1691  emitCall("agg_approximate_count_distinct", agg_args);
1692  }
1693  return;
1694  }
1695  std::string agg_fname{"agg_count_distinct"};
1696  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1697  agg_fname += "_bitmap";
1698  agg_args.push_back(LL_INT(static_cast<int64_t>(count_distinct_descriptor.min_val)));
1699  }
1700  if (agg_info.skip_null_val) {
1701  auto null_lv = executor_->cgen_state_->castToTypeIn(
1702  (arg_ti.is_fp()
1703  ? static_cast<llvm::Value*>(executor_->cgen_state_->inlineFpNull(arg_ti))
1704  : static_cast<llvm::Value*>(executor_->cgen_state_->inlineIntNull(arg_ti))),
1705  64);
1706  null_lv = executor_->cgen_state_->ir_builder_.CreateBitCast(
1707  null_lv, get_int_type(64, executor_->cgen_state_->context_));
1708  agg_fname += "_skip_val";
1709  agg_args.push_back(null_lv);
1710  }
1711  if (device_type == ExecutorDeviceType::GPU) {
1712  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1713  agg_fname += "_gpu";
1714  const auto base_dev_addr = getAdditionalLiteral(-1);
1715  const auto base_host_addr = getAdditionalLiteral(-2);
1716  agg_args.push_back(base_dev_addr);
1717  agg_args.push_back(base_host_addr);
1718  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.sub_bitmap_count)));
1719  CHECK_EQ(size_t(0),
1720  count_distinct_descriptor.bitmapPaddedSizeBytes() %
1721  count_distinct_descriptor.sub_bitmap_count);
1722  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.bitmapPaddedSizeBytes() /
1723  count_distinct_descriptor.sub_bitmap_count)));
1724  }
1725  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1726  emitCall(agg_fname, agg_args);
1727  } else {
1728  executor_->cgen_state_->emitExternalCall(
1729  agg_fname, llvm::Type::getVoidTy(LL_CONTEXT), agg_args);
1730  }
1731 }
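When the descriptor's implementation type is Bitmap, the emitted agg_count_distinct_bitmap* call marks one bit per distinct value, indexed by val - min_val (plus device/host base addresses and sub-bitmap sizing on the GPU path). A hedged sketch of that core update; the real runtime functions live in RuntimeFunctions.cpp and also handle the GPU sub-bitmap layout:

#include <cstdint>

// Sketch of the bit-set update performed by the bitmap-based COUNT(DISTINCT)
// path. `agg` holds the address of the bitmap; CPU-side shape only.
void count_distinct_bitmap_sketch(int64_t* agg, const int64_t val, const int64_t min_val) {
  const uint64_t bitmap_idx = static_cast<uint64_t>(val - min_val);
  auto* bitmap = reinterpret_cast<uint8_t*>(*agg);
  bitmap[bitmap_idx >> 3] |= static_cast<uint8_t>(1u << (bitmap_idx & 7));
}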
1732 
1733 llvm::Value* GroupByAndAggregate::getAdditionalLiteral(const int32_t off) {
1734  CHECK_LT(off, 0);
1735  const auto lit_buff_lv = get_arg_by_name(ROW_FUNC, "literals");
1736  return LL_BUILDER.CreateLoad(LL_BUILDER.CreateGEP(
1737  LL_BUILDER.CreateBitCast(lit_buff_lv,
1738  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)),
1739  LL_INT(off)));
1740 }
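getAdditionalLiteral reads 64-bit values stored immediately before the regular literal buffer: the negative offset indexes backwards from the "literals" argument, which is how the GPU count-distinct path above fetches the device and host base addresses. In plain C++ terms (a sketch of the generated load only):

#include <cstdint>

// Equivalent of the generated load: treat the literal buffer as int64_t[] and
// index with a negative offset (off = -1, -2, ... for the additional literals).
int64_t load_additional_literal(const int8_t* lit_buff, int32_t off) {
  const auto* as_i64 = reinterpret_cast<const int64_t*>(lit_buff);
  return as_i64[off];  // off < 0, so this reads memory placed before lit_buff
}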
1741 
1742 std::vector<llvm::Value*> GroupByAndAggregate::codegenAggArg(
1743  const Analyzer::Expr* target_expr,
1744  const CompilationOptions& co) {
1745  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
1746  // TODO(alex): handle arrays uniformly?
1747  CodeGenerator code_generator(executor_);
1748  if (target_expr) {
1749  const auto& target_ti = target_expr->get_type_info();
1750  if (target_ti.is_array() && !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
1751  const auto target_lvs =
1752  agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
1753  : code_generator.codegen(
1754  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
1755  if (target_ti.isChunkIteratorPackaging()) {
1756  // Chunk-iterator packaging means this value came from a source other than an
1757  // ARRAY[] expression (e.g. an array column); unpack it via array_buff / array_size.
1758  CHECK_EQ(size_t(1), target_lvs.size());
1759  CHECK(!agg_expr || agg_expr->get_aggtype() == kSAMPLE);
1760  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
1761  const auto i8p_ty =
1762  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
1763  const auto& elem_ti = target_ti.get_elem_type();
1764  return {
1765  executor_->cgen_state_->emitExternalCall(
1766  "array_buff",
1767  i8p_ty,
1768  {target_lvs.front(), code_generator.posArg(target_expr)}),
1769  executor_->cgen_state_->emitExternalCall(
1770  "array_size",
1771  i32_ty,
1772  {target_lvs.front(),
1773  code_generator.posArg(target_expr),
1774  executor_->cgen_state_->llInt(log2_bytes(elem_ti.get_logical_size()))})};
1775  } else if (target_ti.isStandardBufferPackaging()) {
1776  if (agg_expr) {
1777  throw std::runtime_error(
1778  "Using array[] operator as argument to an aggregate operator is not "
1779  "supported");
1780  }
1781  return {target_lvs[0], target_lvs[1]};
1782  }
1783  }
1784  if (target_ti.is_geometry() &&
1785  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
1786  auto generate_coord_lvs =
1787  [&](auto* selected_target_expr,
1788  bool const fetch_columns) -> std::vector<llvm::Value*> {
1789  const auto target_lvs =
1790  code_generator.codegen(selected_target_expr, fetch_columns, co);
1791  const auto geo_expr = dynamic_cast<const Analyzer::GeoExpr*>(target_expr);
1792  if (geo_expr) {
1793  CHECK_EQ(2 * static_cast<size_t>(target_ti.get_physical_coord_cols()),
1794  target_lvs.size());
1795  return target_lvs;
1796  }
1797  CHECK_EQ(static_cast<size_t>(target_ti.get_physical_coord_cols()),
1798  target_lvs.size());
1799 
1800  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
1801  const auto i8p_ty =
1802  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
1803  std::vector<llvm::Value*> coords;
1804  size_t ctr = 0;
1805  for (const auto& target_lv : target_lvs) {
1806  // TODO(adb): consider adding a utility to sqltypes so we can get the types of
1807  // the physical coords cols based on the sqltype (e.g. TINYINT for col 0, INT
1808  // for col 1 for polys / mpolys, etc). Hardcoding for now: the first array is the
1809  // coords array (TINYINT); subsequent arrays are regular INT.
1810 
1811  const size_t elem_sz = ctr == 0 ? 1 : 4;
1812  ctr++;
1813  int32_t fixlen = -1;
1814  if (target_ti.get_type() == kPOINT) {
1815  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
1816  if (col_var) {
1817  const auto coords_cd = executor_->getPhysicalColumnDescriptor(col_var, 1);
1818  if (coords_cd && coords_cd->columnType.get_type() == kARRAY) {
1819  fixlen = coords_cd->columnType.get_size();
1820  }
1821  }
1822  }
1823  if (fixlen > 0) {
1824  coords.push_back(executor_->cgen_state_->emitExternalCall(
1825  "fast_fixlen_array_buff",
1826  i8p_ty,
1827  {target_lv, code_generator.posArg(selected_target_expr)}));
1828  coords.push_back(executor_->cgen_state_->llInt(int64_t(fixlen)));
1829  continue;
1830  }
1831  coords.push_back(executor_->cgen_state_->emitExternalCall(
1832  "array_buff",
1833  i8p_ty,
1834  {target_lv, code_generator.posArg(selected_target_expr)}));
1835  coords.push_back(executor_->cgen_state_->emitExternalCall(
1836  "array_size",
1837  i32_ty,
1838  {target_lv,
1839  code_generator.posArg(selected_target_expr),
1840  executor_->cgen_state_->llInt(log2_bytes(elem_sz))}));
1841  }
1842  return coords;
1843  };
1844 
1845  if (agg_expr) {
1846  return generate_coord_lvs(agg_expr->get_arg(), true);
1847  } else {
1848  return generate_coord_lvs(target_expr,
1849  !executor_->plan_state_->allow_lazy_fetch_);
1850  }
1851  }
1852  }
1853  return agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
1854  : code_generator.codegen(
1855  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
1856 }
1857 
1858 llvm::Value* GroupByAndAggregate::emitCall(const std::string& fname,
1859  const std::vector<llvm::Value*>& args) {
1860  return executor_->cgen_state_->emitCall(fname, args);
1861 }
1862 
1863 void GroupByAndAggregate::checkErrorCode(llvm::Value* retCode) {
1864  auto zero_const = llvm::ConstantInt::get(retCode->getType(), 0, true);
1865  auto rc_check_condition = executor_->cgen_state_->ir_builder_.CreateICmp(
1866  llvm::ICmpInst::ICMP_EQ, retCode, zero_const);
1867 
1868  executor_->cgen_state_->emitErrorCheck(rc_check_condition, retCode, "rc");
1869 }
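checkErrorCode compares a runtime call's return code against zero and hands the comparison to emitErrorCheck, which branches to the error exit when it fails. The generated control flow is roughly equivalent to the following sketch (the call site is hypothetical):

#include <cstdint>

// Hypothetical runtime call used only to illustrate the guard.
extern "C" int32_t checked_runtime_call();

int32_t row_func_body_sketch() {
  const int32_t rc = checked_runtime_call();
  if (rc != 0) {  // rc_check_condition failed
    return rc;    // error path taken via emitErrorCheck
  }
  // ... otherwise fall through and keep processing the row ...
  return 0;
}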
1870 
1871 #undef ROW_FUNC
1872 #undef LL_FP
1873 #undef LL_INT
1874 #undef LL_BOOL
1875 #undef LL_BUILDER
1876 #undef LL_CONTEXT
1877 
1878 size_t GroupByAndAggregate::shard_count_for_top_groups(
1879  const RelAlgExecutionUnit& ra_exe_unit,
1880  const Catalog_Namespace::Catalog& catalog) {
1881  if (ra_exe_unit.sort_info.order_entries.size() != 1 || !ra_exe_unit.sort_info.limit) {
1882  return 0;
1883  }
1884  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
1885  const auto grouped_col_expr =
1886  dynamic_cast<const Analyzer::ColumnVar*>(group_expr.get());
1887  if (!grouped_col_expr) {
1888  continue;
1889  }
1890  if (grouped_col_expr->get_table_id() <= 0) {
1891  return 0;
1892  }
1893  const auto td = catalog.getMetadataForTable(grouped_col_expr->get_table_id());
1894  if (td->shardedColumnId == grouped_col_expr->get_column_id()) {
1895  return td->nShards;
1896  }
1897  }
1898  return 0;
1899 }
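shard_count_for_top_groups reports a non-zero shard count only for queries with exactly one order entry plus a limit that group on the table's shard column, so per-shard top groups can be accounted for downstream. A hypothetical call site, with all surrounding names assumed:

// Hypothetical usage: matches queries shaped like
//   SELECT shard_col, COUNT(*) FROM t GROUP BY shard_col ORDER BY 2 DESC LIMIT k
size_t shards_for_query(const RelAlgExecutionUnit& ra_exe_unit,
                        const Catalog_Namespace::Catalog& catalog) {
  const size_t shard_count =
      GroupByAndAggregate::shard_count_for_top_groups(ra_exe_unit, catalog);
  // A non-zero result lets the memory descriptor account for top groups per shard.
  return shard_count;
}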