OmniSciDB 1dac507f6e
GroupByAndAggregate.cpp
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "GroupByAndAggregate.h"
18 #include "AggregateUtils.h"
20 
21 #include "CardinalityEstimator.h"
22 #include "CodeGenerator.h"
24 #include "ExpressionRange.h"
25 #include "ExpressionRewrite.h"
26 #include "GpuInitGroups.h"
27 #include "InPlaceSort.h"
29 #include "MaxwellCodegenPatch.h"
31 #include "TargetExprBuilder.h"
32 
33 #include "../CudaMgr/CudaMgr.h"
34 #include "../Shared/checked_alloc.h"
35 #include "../Utils/ChunkIter.h"
37 #include "Execute.h"
38 #include "QueryTemplateGenerator.h"
39 #include "RuntimeFunctions.h"
40 #include "StreamingTopN.h"
41 #include "TopKSort.h"
42 #include "WindowContext.h"
43 
44 #include <llvm/Transforms/Utils/BasicBlockUtils.h>
45 
46 #include <numeric>
47 #include <thread>
48 
49 bool g_cluster{false};
50 bool g_bigint_count{false};
52 extern size_t g_leaf_count;
53 
54 namespace {
55 
56 int32_t get_agg_count(const std::vector<Analyzer::Expr*>& target_exprs) {
57  int32_t agg_count{0};
58  for (auto target_expr : target_exprs) {
59  CHECK(target_expr);
60  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
61  if (!agg_expr || agg_expr->get_aggtype() == kSAMPLE) {
62  const auto& ti = target_expr->get_type_info();
63  // TODO(pavan): or if is_geometry()
64  if (ti.is_array() || (ti.is_string() && ti.get_compression() == kENCODING_NONE)) {
65  agg_count += 2;
66  } else if (ti.is_geometry()) {
67  agg_count += ti.get_physical_coord_cols() * 2;
68  } else {
69  ++agg_count;
70  }
71  continue;
72  }
73  if (agg_expr && agg_expr->get_aggtype() == kAVG) {
74  agg_count += 2;
75  } else {
76  ++agg_count;
77  }
78  }
79  return agg_count;
80 }
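
// Worked example (illustrative only; the target list below is hypothetical):
// for target expressions {x (INT projection), AVG(y), COUNT(*), arr (ARRAY)} the
// counts are 1 + 2 + 1 + 2 = 6, since AVG needs two slots (sum and count) and an
// array target needs two slots (pointer and length).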
81 
 82 bool expr_is_rowid(const Analyzer::Expr* expr, const Catalog_Namespace::Catalog& cat) {
 83  const auto col = dynamic_cast<const Analyzer::ColumnVar*>(expr);
84  if (!col) {
85  return false;
86  }
87  const auto cd =
88  get_column_descriptor_maybe(col->get_column_id(), col->get_table_id(), cat);
89  if (!cd || !cd->isVirtualCol) {
90  return false;
91  }
92  CHECK_EQ("rowid", cd->columnName);
93  return true;
94 }
95 
96 bool has_count_distinct(const RelAlgExecutionUnit& ra_exe_unit) {
97  for (const auto& target_expr : ra_exe_unit.target_exprs) {
98  const auto agg_info = get_target_info(target_expr, g_bigint_count);
99  if (agg_info.is_agg && is_distinct_target(agg_info)) {
100  return true;
101  }
102  }
103  return false;
104 }
105 
 106 bool is_column_range_too_big_for_perfect_hash(const ColRangeInfo& col_range_info,
 107  const int64_t max_entry_count) {
108  try {
109  return static_cast<int64_t>(checked_int64_t(col_range_info.max) -
110  checked_int64_t(col_range_info.min)) >= max_entry_count;
111  } catch (...) {
112  return true;
113  }
114 }
115 
116 } // namespace
117 
 118 ColRangeInfo GroupByAndAggregate::getColRangeInfo() {
 119  // Use baseline layout more eagerly on the GPU if the query uses count distinct,
120  // because our HyperLogLog implementation is 4x less memory efficient on GPU.
121  // Technically, this only applies to APPROX_COUNT_DISTINCT, but in practice we
122  // can expect this to be true anyway for grouped queries since the precise version
123  // uses significantly more memory.
124  const int64_t baseline_threshold =
129  if (ra_exe_unit_.groupby_exprs.size() != 1) {
130  try {
131  checked_int64_t cardinality{1};
132  bool has_nulls{false};
133  for (const auto groupby_expr : ra_exe_unit_.groupby_exprs) {
134  auto col_range_info = getExprRangeInfo(groupby_expr.get());
135  if (col_range_info.hash_type_ != QueryDescriptionType::GroupByPerfectHash) {
136  // going through baseline hash if a non-integer type is encountered
137  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
138  }
139  auto crt_col_cardinality = getBucketedCardinality(col_range_info);
140  CHECK_GE(crt_col_cardinality, 0);
141  cardinality *= crt_col_cardinality;
142  if (col_range_info.has_nulls) {
143  has_nulls = true;
144  }
145  }
146  // For zero or high cardinalities, use baseline layout.
147  if (!cardinality || cardinality > baseline_threshold) {
148  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
149  }
 150  return {QueryDescriptionType::GroupByPerfectHash,
 151  0,
152  int64_t(cardinality),
153  0,
154  has_nulls};
155  } catch (...) { // overflow when computing cardinality
156  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
157  }
158  }
159  // For single column groupby on high timestamps, force baseline hash due to wide ranges
160  // we are likely to encounter when applying quals to the expression range
161  // TODO: consider allowing TIMESTAMP(9) (nanoseconds) with quals to use perfect hash if
162  // the range is small enough
163  if (ra_exe_unit_.groupby_exprs.front() &&
164  ra_exe_unit_.groupby_exprs.front()->get_type_info().is_high_precision_timestamp() &&
165  ra_exe_unit_.simple_quals.size() > 0) {
166  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
167  }
168  const auto col_range_info = getExprRangeInfo(ra_exe_unit_.groupby_exprs.front().get());
169  if (!ra_exe_unit_.groupby_exprs.front()) {
170  return col_range_info;
171  }
172  static const int64_t MAX_BUFFER_SIZE = 1 << 30;
173  const int64_t col_count =
175  int64_t max_entry_count = MAX_BUFFER_SIZE / (col_count * sizeof(int64_t));
177  max_entry_count = std::min(max_entry_count, baseline_threshold);
178  }
179  if ((!ra_exe_unit_.groupby_exprs.front()->get_type_info().is_string() &&
180  !expr_is_rowid(ra_exe_unit_.groupby_exprs.front().get(), *executor_->catalog_)) &&
181  is_column_range_too_big_for_perfect_hash(col_range_info, max_entry_count) &&
 182  !col_range_info.bucket) {
 183  return {QueryDescriptionType::GroupByBaselineHash,
184  col_range_info.min,
185  col_range_info.max,
186  0,
187  col_range_info.has_nulls};
188  }
189  return col_range_info;
190 }
191 
 192 ColRangeInfo GroupByAndAggregate::getExprRangeInfo(const Analyzer::Expr* expr) const {
 193  if (!expr) {
194  return {QueryDescriptionType::Projection, 0, 0, 0, false};
195  }
196 
197  const auto expr_range = getExpressionRange(
198  expr, query_infos_, executor_, boost::make_optional(ra_exe_unit_.simple_quals));
 199  switch (expr_range.getType()) {
 200  case ExpressionRangeType::Integer:
 201  return {QueryDescriptionType::GroupByPerfectHash,
 202  expr_range.getIntMin(),
203  expr_range.getIntMax(),
204  expr_range.getBucket(),
 205  expr_range.hasNulls()};
 206  case ExpressionRangeType::Float:
 207  case ExpressionRangeType::Double:
 208  case ExpressionRangeType::Invalid:
 209  return {QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
 210  default:
211  CHECK(false);
212  }
213  CHECK(false);
214  return {QueryDescriptionType::NonGroupedAggregate, 0, 0, 0, false};
215 }
216 
 217 int64_t GroupByAndAggregate::getBucketedCardinality(const ColRangeInfo& col_range_info) {
 218  checked_int64_t crt_col_cardinality =
219  checked_int64_t(col_range_info.max) - checked_int64_t(col_range_info.min);
220  if (col_range_info.bucket) {
221  crt_col_cardinality /= col_range_info.bucket;
222  }
223  return static_cast<int64_t>(crt_col_cardinality +
224  (1 + (col_range_info.has_nulls ? 1 : 0)));
225 }
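
// A concrete reading of the formula above (hypothetical range values): with
// min = 0, max = 99, bucket = 10 and has_nulls = true, the bucketed distance is
// (99 - 0) / 10 = 9, plus 1 for the last bin and 1 for the null bin, so the
// function returns 11.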
226 
227 #define LL_CONTEXT executor_->cgen_state_->context_
228 #define LL_BUILDER executor_->cgen_state_->ir_builder_
229 #define LL_BOOL(v) executor_->cgen_state_->llBool(v)
230 #define LL_INT(v) executor_->cgen_state_->llInt(v)
231 #define LL_FP(v) executor_->cgen_state_->llFp(v)
232 #define ROW_FUNC executor_->cgen_state_->row_func_
233 
 234 GroupByAndAggregate::GroupByAndAggregate(
 235  Executor* executor,
236  const ExecutorDeviceType device_type,
237  const RelAlgExecutionUnit& ra_exe_unit,
238  const std::vector<InputTableInfo>& query_infos,
239  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner)
240  : executor_(executor)
241  , ra_exe_unit_(ra_exe_unit)
242  , query_infos_(query_infos)
243  , row_set_mem_owner_(row_set_mem_owner)
244  , device_type_(device_type) {
245  for (const auto groupby_expr : ra_exe_unit_.groupby_exprs) {
246  if (!groupby_expr) {
247  continue;
248  }
249  const auto& groupby_ti = groupby_expr->get_type_info();
250  if (groupby_ti.is_string() && groupby_ti.get_compression() != kENCODING_DICT) {
251  throw std::runtime_error(
252  "Cannot group by string columns which are not dictionary encoded.");
253  }
254  if (groupby_ti.is_array()) {
255  throw std::runtime_error("Group by array not supported");
256  }
257  if (groupby_ti.is_geometry()) {
258  throw std::runtime_error("Group by geometry not supported");
259  }
260  }
261 }
262 
 263 int64_t GroupByAndAggregate::getShardedTopBucket(const ColRangeInfo& col_range_info,
 264  const size_t shard_count) const {
265  size_t device_count{0};
 266  if (device_type_ == ExecutorDeviceType::GPU) {
 267  device_count = executor_->getCatalog()->getDataMgr().getCudaMgr()->getDeviceCount();
268  CHECK_GT(device_count, 0u);
269  }
270 
271  int64_t bucket{col_range_info.bucket};
272 
273  if (shard_count) {
274  CHECK(!col_range_info.bucket);
275  /*
276  when a node has fewer devices than shard count,
277  a) In a distributed setup, the minimum distance between two keys would be
 278  device_count, because shards are stored consecutively across the physical tables, i.e.
 279  if a shard column has values 0 to 9 and there are 3 shards on each leaf, then node 1 would
 280  have values: 0,1,2,6,7,8 and node 2 would have values: 3,4,5,9. If each leaf node
 281  has only 1 device, then in this case all the keys from each node are loaded on
 282  that node's single device.
283 
 284  b) In a single node setup, the distance would be the minimum of device_count and
 285  the difference (shard_count - device_count). For example: if a single node server
 286  running on 3 devices has a shard column with values 0 to 9 in a table with 4 shards,
 287  the device to fragment keys mapping would be: device 1 - 4,8,3,7; device 2 - 1,5,9; device
 288  3 - 2,6. The bucket value would be 4 (shards) - 3 (devices) = 1, i.e. the minimum of
 289  device_count and the difference.
290 
291  When a node has device count equal to or more than shard count then the
292  minimum distance is always at least shard_count * no of leaf nodes.
293  */
294  if (device_count < shard_count) {
295  bucket = g_leaf_count ? std::max(device_count, static_cast<size_t>(1))
296  : std::min(device_count, shard_count - device_count);
297  } else {
298  bucket = shard_count * std::max(g_leaf_count, static_cast<size_t>(1));
299  }
300  }
301 
302  return bucket;
303 }
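
// Plugging sample numbers into the branches above (hypothetical configurations):
// - single node (g_leaf_count == 0), 3 devices, 4 shards:
//     device_count (3) < shard_count (4), so bucket = min(3, 4 - 3) = 1
// - distributed (g_leaf_count == 2), 4 devices per leaf, 3 shards:
//     device_count (4) >= shard_count (3), so bucket = 3 * max(2, 1) = 6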
304 
305 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptor(
306  const bool allow_multifrag,
307  const size_t max_groups_buffer_entry_count,
308  const int8_t crt_min_byte_width,
309  RenderInfo* render_info,
310  const bool output_columnar_hint) {
 311  const auto shard_count =
 312  device_type_ == ExecutorDeviceType::GPU
 313  ? shard_count_for_top_groups(ra_exe_unit_, *executor_->getCatalog())
 314  : 0;
315  bool sort_on_gpu_hint =
316  device_type_ == ExecutorDeviceType::GPU && allow_multifrag &&
319  // must_use_baseline_sort is true iff we'd sort on GPU with the old algorithm
320  // but the total output buffer size would be too big or it's a sharded top query.
321  // For the sake of managing risk, use the new result set way very selectively for
322  // this case only (alongside the baseline layout we've enabled for a while now).
323  bool must_use_baseline_sort = shard_count;
324  std::unique_ptr<QueryMemoryDescriptor> query_mem_desc;
325  while (true) {
326  query_mem_desc = initQueryMemoryDescriptorImpl(allow_multifrag,
327  max_groups_buffer_entry_count,
328  crt_min_byte_width,
329  sort_on_gpu_hint,
330  render_info,
331  must_use_baseline_sort,
332  output_columnar_hint);
333  CHECK(query_mem_desc);
334  if (query_mem_desc->sortOnGpu() &&
335  (query_mem_desc->getBufferSizeBytes(device_type_) +
336  align_to_int64(query_mem_desc->getEntryCount() * sizeof(int32_t))) >
337  2 * 1024 * 1024 * 1024L) {
338  must_use_baseline_sort = true;
339  sort_on_gpu_hint = false;
340  } else {
341  break;
342  }
343  }
344  return query_mem_desc;
345 }
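
// The retry loop above is bounded: if the GPU sort layout would need more than 2 GB
// (output buffer plus an int32 index per entry), the descriptor is rebuilt once with
// sort_on_gpu_hint disabled and must_use_baseline_sort forced on. For example
// (hypothetical sizes), 120M entries with 16-byte rows need ~1.92 GB plus ~0.48 GB of
// int32 indices, which exceeds the 2 GB cap and triggers the fallback.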
346 
347 std::unique_ptr<QueryMemoryDescriptor> GroupByAndAggregate::initQueryMemoryDescriptorImpl(
348  const bool allow_multifrag,
349  const size_t max_groups_buffer_entry_count,
350  const int8_t crt_min_byte_width,
351  const bool sort_on_gpu_hint,
352  RenderInfo* render_info,
353  const bool must_use_baseline_sort,
354  const bool output_columnar_hint) {
356 
357  const auto count_distinct_descriptors = initCountDistinctDescriptors();
358 
359  auto group_col_widths = get_col_byte_widths(ra_exe_unit_.groupby_exprs, {});
360 
361  const bool is_group_by{!ra_exe_unit_.groupby_exprs.empty()};
362 
363  auto col_range_info_nosharding = getColRangeInfo();
364 
365  const auto shard_count =
366  device_type_ == ExecutorDeviceType::GPU
367  ? shard_count_for_top_groups(ra_exe_unit_, *executor_->getCatalog())
368  : 0;
369 
370  const auto col_range_info =
371  ColRangeInfo{col_range_info_nosharding.hash_type_,
372  col_range_info_nosharding.min,
373  col_range_info_nosharding.max,
374  getShardedTopBucket(col_range_info_nosharding, shard_count),
375  col_range_info_nosharding.has_nulls};
376 
377  // Non-grouped aggregates do not support accessing aggregated ranges
378  // Keyless hash is currently only supported with single-column perfect hash
379  const auto keyless_info =
380  !(is_group_by &&
381  col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
382  ra_exe_unit_.groupby_exprs.size() == 1)
383  ? KeylessInfo{false, -1, false}
384  : getKeylessInfo(ra_exe_unit_.target_exprs, is_group_by);
385 
386  if (g_enable_watchdog &&
387  ((col_range_info.hash_type_ == QueryDescriptionType::GroupByBaselineHash &&
388  max_groups_buffer_entry_count > 120000000) ||
389  (col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
390  ra_exe_unit_.groupby_exprs.size() == 1 &&
391  (col_range_info.max - col_range_info.min) /
392  std::max(col_range_info.bucket, int64_t(1)) >
393  130000000))) {
394  throw WatchdogException("Query would use too much memory");
395  }
396  return QueryMemoryDescriptor::init(executor_,
397  ra_exe_unit_,
398  query_infos_,
399  col_range_info,
400  keyless_info,
401  allow_multifrag,
402  device_type_,
403  crt_min_byte_width,
404  sort_on_gpu_hint,
405  shard_count,
406  max_groups_buffer_entry_count,
407  render_info,
408  count_distinct_descriptors,
409  must_use_baseline_sort,
410  output_columnar_hint);
411 }
412 
415 }
416 
417 namespace {
418 
 419 void add_transient_string_literals_for_expression(
 420  const Analyzer::Expr* expr,
421  Executor* executor,
422  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner) {
423  if (!expr) {
424  return;
425  }
426 
427  const auto array_expr = dynamic_cast<const Analyzer::ArrayExpr*>(expr);
428  if (array_expr) {
429  for (size_t i = 0; i < array_expr->getElementCount(); i++) {
431  array_expr->getElement(i), executor, row_set_mem_owner);
432  }
433  return;
434  }
435 
436  const auto cast_expr = dynamic_cast<const Analyzer::UOper*>(expr);
437  const auto& expr_ti = expr->get_type_info();
438  if (cast_expr && cast_expr->get_optype() == kCAST && expr_ti.is_string()) {
439  CHECK_EQ(kENCODING_DICT, expr_ti.get_compression());
440  auto sdp = executor->getStringDictionaryProxy(
441  expr_ti.get_comp_param(), row_set_mem_owner, true);
442  CHECK(sdp);
443  const auto str_lit_expr =
444  dynamic_cast<const Analyzer::Constant*>(cast_expr->get_operand());
445  if (str_lit_expr && str_lit_expr->get_constval().stringval) {
446  sdp->getOrAddTransient(*str_lit_expr->get_constval().stringval);
447  }
448  return;
449  }
450  const auto case_expr = dynamic_cast<const Analyzer::CaseExpr*>(expr);
451  if (!case_expr) {
452  return;
453  }
454  Analyzer::DomainSet domain_set;
455  case_expr->get_domain(domain_set);
456  if (domain_set.empty()) {
457  return;
458  }
459  if (expr_ti.is_string()) {
460  CHECK_EQ(kENCODING_DICT, expr_ti.get_compression());
461  auto sdp = executor->getStringDictionaryProxy(
462  expr_ti.get_comp_param(), row_set_mem_owner, true);
463  CHECK(sdp);
464  for (const auto domain_expr : domain_set) {
465  const auto cast_expr = dynamic_cast<const Analyzer::UOper*>(domain_expr);
466  const auto str_lit_expr =
467  cast_expr && cast_expr->get_optype() == kCAST
468  ? dynamic_cast<const Analyzer::Constant*>(cast_expr->get_operand())
469  : dynamic_cast<const Analyzer::Constant*>(domain_expr);
470  if (str_lit_expr && str_lit_expr->get_constval().stringval) {
471  sdp->getOrAddTransient(*str_lit_expr->get_constval().stringval);
472  }
473  }
474  }
475 }
476 
477 } // namespace
478 
 479 void GroupByAndAggregate::addTransientStringLiterals(
 480  const RelAlgExecutionUnit& ra_exe_unit,
481  Executor* executor,
482  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner) {
483  for (const auto group_expr : ra_exe_unit.groupby_exprs) {
485  group_expr.get(), executor, row_set_mem_owner);
486  }
487  for (const auto target_expr : ra_exe_unit.target_exprs) {
488  const auto& target_type = target_expr->get_type_info();
489  if (target_type.is_string() && target_type.get_compression() != kENCODING_DICT) {
490  continue;
491  }
492  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
493  if (agg_expr) {
494  if (agg_expr->get_aggtype() == kSAMPLE) {
496  agg_expr->get_arg(), executor, row_set_mem_owner);
497  }
498  } else {
500  target_expr, executor, row_set_mem_owner);
501  }
502  }
503  row_set_mem_owner->addLiteralStringDictProxy(executor->lit_str_dict_proxy_);
504 }
505 
 506 CountDistinctDescriptors GroupByAndAggregate::initCountDistinctDescriptors() {
 507  CountDistinctDescriptors count_distinct_descriptors;
508  for (const auto target_expr : ra_exe_unit_.target_exprs) {
509  auto agg_info = get_target_info(target_expr, g_bigint_count);
510  if (is_distinct_target(agg_info)) {
511  CHECK(agg_info.is_agg);
512  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
513  const auto agg_expr = static_cast<const Analyzer::AggExpr*>(target_expr);
514  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
515  if (arg_ti.is_string() && arg_ti.get_compression() != kENCODING_DICT) {
516  throw std::runtime_error(
517  "Strings must be dictionary-encoded for COUNT(DISTINCT).");
518  }
519  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_array()) {
520  throw std::runtime_error("APPROX_COUNT_DISTINCT on arrays not supported yet");
521  }
522  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT && arg_ti.is_geometry()) {
523  throw std::runtime_error(
524  "APPROX_COUNT_DISTINCT on geometry columns not supported");
525  }
526  if (agg_info.is_distinct && arg_ti.is_geometry()) {
527  throw std::runtime_error("COUNT DISTINCT on geometry columns not supported");
528  }
529  ColRangeInfo no_range_info{QueryDescriptionType::Projection, 0, 0, 0, false};
530  auto arg_range_info =
531  arg_ti.is_fp() ? no_range_info : getExprRangeInfo(agg_expr->get_arg());
532  CountDistinctImplType count_distinct_impl_type{CountDistinctImplType::StdSet};
533  int64_t bitmap_sz_bits{0};
534  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
535  const auto error_rate = agg_expr->get_error_rate();
536  if (error_rate) {
537  CHECK(error_rate->get_type_info().get_type() == kINT);
538  CHECK_GE(error_rate->get_constval().intval, 1);
539  bitmap_sz_bits = hll_size_for_rate(error_rate->get_constval().smallintval);
540  } else {
541  bitmap_sz_bits = g_hll_precision_bits;
542  }
543  }
544  if (arg_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
545  !(arg_ti.is_array() || arg_ti.is_geometry())) { // TODO(alex): allow bitmap
546  // implementation for arrays
547  if (arg_range_info.isEmpty()) {
 548  count_distinct_descriptors.emplace_back(
 549  CountDistinctDescriptor{CountDistinctImplType::Bitmap,
 550  0,
551  64,
552  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
553  device_type_,
554  1});
555  continue;
556  }
557  count_distinct_impl_type = CountDistinctImplType::Bitmap;
558  if (agg_info.agg_kind == kCOUNT) {
559  bitmap_sz_bits = arg_range_info.max - arg_range_info.min + 1;
560  const int64_t MAX_BITMAP_BITS{8 * 1000 * 1000 * 1000L};
561  if (bitmap_sz_bits <= 0 || bitmap_sz_bits > MAX_BITMAP_BITS) {
562  count_distinct_impl_type = CountDistinctImplType::StdSet;
563  }
564  }
565  }
566  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT &&
567  count_distinct_impl_type == CountDistinctImplType::StdSet &&
568  !(arg_ti.is_array() || arg_ti.is_geometry())) {
569  count_distinct_impl_type = CountDistinctImplType::Bitmap;
570  }
571  if (g_enable_watchdog &&
572  count_distinct_impl_type == CountDistinctImplType::StdSet) {
573  throw WatchdogException("Cannot use a fast path for COUNT distinct");
574  }
575  const auto sub_bitmap_count =
577  count_distinct_descriptors.emplace_back(
578  CountDistinctDescriptor{count_distinct_impl_type,
579  arg_range_info.min,
580  bitmap_sz_bits,
581  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT,
582  device_type_,
583  sub_bitmap_count});
584  } else {
585  count_distinct_descriptors.emplace_back(CountDistinctDescriptor{
586  CountDistinctImplType::Invalid, 0, 0, false, device_type_, 0});
587  }
588  }
589  return count_distinct_descriptors;
590 }
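
// Sizing intuition for the descriptors above (hypothetical column ranges):
// - COUNT(DISTINCT x) with x in [0, 9'999'999] gets an exact bitmap of 10'000'000
//   bits (~1.2 MB) per group.
// - APPROX_COUNT_DISTINCT(x) gets a HyperLogLog bitmap whose size is derived from
//   bitmap_sz_bits (from the requested error rate, or g_hll_precision_bits by default).
// - Ranges wider than MAX_BITMAP_BITS fall back to the std::set implementation,
//   which the watchdog rejects when enabled.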
591 
 602 KeylessInfo GroupByAndAggregate::getKeylessInfo(
 603  const std::vector<Analyzer::Expr*>& target_expr_list,
604  const bool is_group_by) const {
605  bool keyless{true}, found{false}, shared_mem_support{false},
606  shared_mem_valid_data_type{true};
 607  /* Shared memory usage is currently supported only for a limited subset of possible
 608  * aggregate operations. shared_mem_support and
 609  * shared_mem_valid_data_type are declared to track that support. */
610  int32_t num_agg_expr{0}; // used for shared memory support on the GPU
611  int32_t index{0};
612  for (const auto target_expr : target_expr_list) {
613  const auto agg_info = get_target_info(target_expr, g_bigint_count);
614  const auto chosen_type = get_compact_type(agg_info);
 615  // TODO(Saman): should eventually be removed, once it is clear which data types can
 616  // be used in this shared memory setting.
617 
618  shared_mem_valid_data_type =
619  shared_mem_valid_data_type && supportedTypeForGpuSharedMemUsage(chosen_type);
620 
621  if (agg_info.is_agg) {
622  num_agg_expr++;
623  }
624  if (!found && agg_info.is_agg && !is_distinct_target(agg_info)) {
625  auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
626  CHECK(agg_expr);
627  const auto arg_expr = agg_arg(target_expr);
628  const bool float_argument_input = takes_float_argument(agg_info);
629  switch (agg_info.agg_kind) {
630  case kAVG:
631  ++index;
632  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
633  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
634  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
635  expr_range_info.hasNulls()) {
636  break;
637  }
638  }
639  found = true;
640  break;
641  case kCOUNT:
642  if (arg_expr && !arg_expr->get_type_info().get_notnull()) {
643  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
644  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
645  expr_range_info.hasNulls()) {
646  break;
647  }
648  }
649  found = true;
650  if (!agg_info.skip_null_val) {
651  shared_mem_support = true; // currently just support 8 bytes per group
652  }
653  break;
654  case kSUM: {
655  auto arg_ti = arg_expr->get_type_info();
656  if (constrained_not_null(arg_expr, ra_exe_unit_.quals)) {
657  arg_ti.set_notnull(true);
658  }
659  if (!arg_ti.get_notnull()) {
660  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
661  if (expr_range_info.getType() != ExpressionRangeType::Invalid &&
662  !expr_range_info.hasNulls()) {
663  found = true;
664  }
665  } else {
666  auto expr_range_info = getExpressionRange(arg_expr, query_infos_, executor_);
667  switch (expr_range_info.getType()) {
 668  case ExpressionRangeType::Float:
 669  case ExpressionRangeType::Double:
 670  if (expr_range_info.getFpMax() < 0 || expr_range_info.getFpMin() > 0) {
671  found = true;
672  }
673  break;
 674  case ExpressionRangeType::Integer:
 675  if (expr_range_info.getIntMax() < 0 || expr_range_info.getIntMin() > 0) {
676  found = true;
677  }
678  break;
679  default:
680  break;
681  }
682  }
683  break;
684  }
685  case kMIN: {
686  CHECK(agg_expr && agg_expr->get_arg());
687  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
688  if (arg_ti.is_string() || arg_ti.is_array()) {
689  break;
690  }
691  auto expr_range_info =
692  getExpressionRange(agg_expr->get_arg(), query_infos_, executor_);
693  auto init_max = get_agg_initial_val(agg_info.agg_kind,
694  chosen_type,
695  is_group_by || float_argument_input,
696  float_argument_input ? sizeof(float) : 8);
697  switch (expr_range_info.getType()) {
 698  case ExpressionRangeType::Float:
 699  case ExpressionRangeType::Double: {
 700  auto double_max =
701  *reinterpret_cast<const double*>(may_alias_ptr(&init_max));
702  if (expr_range_info.getFpMax() < double_max) {
703  found = true;
704  }
705  break;
706  }
 707  case ExpressionRangeType::Integer:
 708  if (expr_range_info.getIntMax() < init_max) {
709  found = true;
710  }
711  break;
712  default:
713  break;
714  }
715  break;
716  }
717  case kMAX: {
718  CHECK(agg_expr && agg_expr->get_arg());
719  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
720  if (arg_ti.is_string() || arg_ti.is_array()) {
721  break;
722  }
723  auto expr_range_info =
724  getExpressionRange(agg_expr->get_arg(), query_infos_, executor_);
725  // NULL sentinel and init value for kMAX are identical, which results in
 726  // ambiguity in detecting empty keys in the presence of nulls.
727  if (expr_range_info.getType() == ExpressionRangeType::Invalid ||
728  expr_range_info.hasNulls()) {
729  break;
730  }
731  auto init_min = get_agg_initial_val(agg_info.agg_kind,
732  chosen_type,
733  is_group_by || float_argument_input,
734  float_argument_input ? sizeof(float) : 8);
735  switch (expr_range_info.getType()) {
 736  case ExpressionRangeType::Float:
 737  case ExpressionRangeType::Double: {
 738  auto double_min =
739  *reinterpret_cast<const double*>(may_alias_ptr(&init_min));
740  if (expr_range_info.getFpMin() > double_min) {
741  found = true;
742  }
743  break;
744  }
 745  case ExpressionRangeType::Integer:
 746  if (expr_range_info.getIntMin() > init_min) {
747  found = true;
748  }
749  break;
750  default:
751  break;
752  }
753  break;
754  }
755  default:
756  keyless = false;
757  break;
758  }
759  }
760  if (!keyless) {
761  break;
762  }
763  if (!found) {
764  ++index;
765  }
766  }
767 
768  // shouldn't use keyless for projection only
774  return {keyless && found,
775  index,
776  ((num_agg_expr == 1) && (target_expr_list.size() <= 2))
777  ? shared_mem_support && shared_mem_valid_data_type
778  : false};
779 }
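
// Example of what the scan above is looking for (hypothetical query):
//   SELECT x, COUNT(*) FROM t GROUP BY x;
// COUNT(*) can never be confused with an empty slot, so the group key does not have
// to be materialized and the COUNT slot doubles as the "group present" marker
// (keyless = true, with index pointing at that slot). An aggregate whose initial
// value can collide with real data, e.g. MAX over a column that may reach the init
// sentinel, keeps the explicit key column instead.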
780 
 785 bool GroupByAndAggregate::supportedTypeForGpuSharedMemUsage(
 786  const SQLTypeInfo& target_type_info) const {
787  bool result = false;
788  switch (target_type_info.get_type()) {
789  case SQLTypes::kTINYINT:
790  case SQLTypes::kSMALLINT:
791  case SQLTypes::kINT:
792  result = true;
793  break;
794  case SQLTypes::kTEXT:
795  if (target_type_info.get_compression() == EncodingType::kENCODING_DICT) {
796  result = true;
797  }
798  break;
799  default:
800  break;
801  }
802  return result;
803 }
804 
805 // TODO(Saman): this function is temporary and all these limitations should eventually
806 // be removed.
 807 bool GroupByAndAggregate::supportedExprForGpuSharedMemUsage(Analyzer::Expr* expr) {
 808  /*
 809  UNNEST operations follow a slightly different internal memory layout compared to other
 810  keyless aggregates. Currently, we opt out of using shared memory if there is any UNNEST
 811  operation involved.
812  */
813  if (dynamic_cast<Analyzer::UOper*>(expr) &&
814  static_cast<Analyzer::UOper*>(expr)->get_optype() == kUNNEST) {
815  return false;
816  }
817  return true;
818 }
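
// For example, a target list containing UNNEST(arr_col) would make this return false
// and keep the aggregation on the regular, non-shared-memory GPU path.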
819 
 820 bool GroupByAndAggregate::gpuCanHandleOrderEntries(
 821  const std::list<Analyzer::OrderEntry>& order_entries) {
822  if (order_entries.size() > 1) { // TODO(alex): lift this restriction
823  return false;
824  }
825  for (const auto order_entry : order_entries) {
826  CHECK_GE(order_entry.tle_no, 1);
827  CHECK_LE(static_cast<size_t>(order_entry.tle_no), ra_exe_unit_.target_exprs.size());
828  const auto target_expr = ra_exe_unit_.target_exprs[order_entry.tle_no - 1];
829  if (!dynamic_cast<Analyzer::AggExpr*>(target_expr)) {
830  return false;
831  }
832  // TODO(alex): relax the restrictions
833  auto agg_expr = static_cast<Analyzer::AggExpr*>(target_expr);
834  if (agg_expr->get_is_distinct() || agg_expr->get_aggtype() == kAVG ||
835  agg_expr->get_aggtype() == kMIN || agg_expr->get_aggtype() == kMAX ||
836  agg_expr->get_aggtype() == kAPPROX_COUNT_DISTINCT) {
837  return false;
838  }
839  if (agg_expr->get_arg()) {
840  const auto& arg_ti = agg_expr->get_arg()->get_type_info();
841  if (arg_ti.is_fp()) {
842  return false;
843  }
844  auto expr_range_info = getExprRangeInfo(agg_expr->get_arg());
 845  // TODO(adb): QMD not actually initialized here?
846  if ((!(expr_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash &&
847  /* query_mem_desc.getGroupbyColCount() == 1 */ false) ||
848  expr_range_info.has_nulls) &&
849  order_entry.is_desc == order_entry.nulls_first) {
850  return false;
851  }
852  }
853  const auto& target_ti = target_expr->get_type_info();
854  CHECK(!target_ti.is_array());
855  if (!target_ti.is_integer()) {
856  return false;
857  }
858  }
859  return true;
860 }
861 
 862 GroupByAndAggregate::DiamondCodegen::DiamondCodegen(
 863  llvm::Value* cond,
864  Executor* executor,
865  const bool chain_to_next,
866  const std::string& label_prefix,
867  DiamondCodegen* parent,
868  const bool share_false_edge_with_parent)
869  : executor_(executor), chain_to_next_(chain_to_next), parent_(parent) {
870  if (parent_) {
872  }
873  cond_true_ = llvm::BasicBlock::Create(LL_CONTEXT, label_prefix + "_true", ROW_FUNC);
874  if (share_false_edge_with_parent) {
875  CHECK(parent);
877  } else {
879  llvm::BasicBlock::Create(LL_CONTEXT, label_prefix + "_false", ROW_FUNC);
880  }
881 
882  LL_BUILDER.CreateCondBr(cond, cond_true_, cond_false_);
883  LL_BUILDER.SetInsertPoint(cond_true_);
884 }
885 
 886 void GroupByAndAggregate::DiamondCodegen::setChainToNext() {
 887  CHECK(!parent_);
888  chain_to_next_ = true;
889 }
890 
891 void GroupByAndAggregate::DiamondCodegen::setFalseTarget(llvm::BasicBlock* cond_false) {
892  CHECK(!parent_ || orig_cond_false_ != parent_->cond_false_);
893  cond_false_ = cond_false;
894 }
895 
 896 GroupByAndAggregate::DiamondCodegen::~DiamondCodegen() {
 897  if (parent_ && orig_cond_false_ != parent_->cond_false_) {
898  LL_BUILDER.CreateBr(parent_->cond_false_);
899  } else if (chain_to_next_) {
900  LL_BUILDER.CreateBr(cond_false_);
901  }
902  if (!parent_ || (!chain_to_next_ && cond_false_ != parent_->cond_false_)) {
903  LL_BUILDER.SetInsertPoint(orig_cond_false_);
904  }
905 }
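
// Rough shape of the IR produced by a DiamondCodegen instance (sketch):
//
//   cond --true--->  <label_prefix>_true   (insert point after construction)
//   cond --false-->  <label_prefix>_false  (or the parent's false block when
//                                           share_false_edge_with_parent is set)
//
// The destructor closes the diamond: the true block branches either to the parent's
// false block or, when chained, to this block's own false block, and the insert
// point moves to the original false block so codegen continues there.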
906 
907 bool GroupByAndAggregate::codegen(llvm::Value* filter_result,
908  llvm::BasicBlock* sc_false,
 909  const QueryMemoryDescriptor& query_mem_desc,
 910  const CompilationOptions& co) {
911  CHECK(filter_result);
912 
913  bool can_return_error = false;
914  llvm::BasicBlock* filter_false{nullptr};
915 
916  {
917  const bool is_group_by = !ra_exe_unit_.groupby_exprs.empty();
918 
919  if (executor_->isArchMaxwell(co.device_type_)) {
921  }
922  DiamondCodegen filter_cfg(filter_result,
923  executor_,
924  !is_group_by || query_mem_desc.usesGetGroupValueFast(),
925  "filter",
926  nullptr,
927  false);
928  filter_false = filter_cfg.cond_false_;
929 
930  if (is_group_by) {
932  !use_streaming_top_n(ra_exe_unit_, query_mem_desc.didOutputColumnar())) {
933  const auto crt_matched = get_arg_by_name(ROW_FUNC, "crt_matched");
934  LL_BUILDER.CreateStore(LL_INT(int32_t(1)), crt_matched);
935  auto total_matched_ptr = get_arg_by_name(ROW_FUNC, "total_matched");
936  llvm::Value* old_total_matched_val{nullptr};
938  old_total_matched_val =
939  LL_BUILDER.CreateAtomicRMW(llvm::AtomicRMWInst::Add,
940  total_matched_ptr,
941  LL_INT(int32_t(1)),
942  llvm::AtomicOrdering::Monotonic);
943  } else {
944  old_total_matched_val = LL_BUILDER.CreateLoad(total_matched_ptr);
945  LL_BUILDER.CreateStore(
946  LL_BUILDER.CreateAdd(old_total_matched_val, LL_INT(int32_t(1))),
947  total_matched_ptr);
948  }
949  auto old_total_matched_ptr = get_arg_by_name(ROW_FUNC, "old_total_matched");
950  LL_BUILDER.CreateStore(old_total_matched_val, old_total_matched_ptr);
951  }
952 
953  auto agg_out_ptr_w_idx = codegenGroupBy(query_mem_desc, co, filter_cfg);
954  if (query_mem_desc.usesGetGroupValueFast() ||
955  query_mem_desc.getQueryDescriptionType() ==
957  if (query_mem_desc.getGroupbyColCount() > 1) {
958  filter_cfg.setChainToNext();
959  }
960  // Don't generate null checks if the group slot is guaranteed to be non-null,
 961  // as is the case for the get_group_value_fast* family.
962  can_return_error =
963  codegenAggCalls(agg_out_ptr_w_idx, {}, query_mem_desc, co, filter_cfg);
964  } else {
965  {
966  llvm::Value* nullcheck_cond{nullptr};
967  if (query_mem_desc.didOutputColumnar()) {
968  nullcheck_cond = LL_BUILDER.CreateICmpSGE(std::get<1>(agg_out_ptr_w_idx),
969  LL_INT(int32_t(0)));
970  } else {
971  nullcheck_cond = LL_BUILDER.CreateICmpNE(
972  std::get<0>(agg_out_ptr_w_idx),
973  llvm::ConstantPointerNull::get(
974  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)));
975  }
976  DiamondCodegen nullcheck_cfg(
977  nullcheck_cond, executor_, false, "groupby_nullcheck", &filter_cfg, false);
978  codegenAggCalls(agg_out_ptr_w_idx, {}, query_mem_desc, co, filter_cfg);
979  }
980  can_return_error = true;
981  if (query_mem_desc.getQueryDescriptionType() ==
984  // Ignore rejection on pushing current row to top-K heap.
985  LL_BUILDER.CreateRet(LL_INT(int32_t(0)));
986  } else {
987  CodeGenerator code_generator(executor_);
988  LL_BUILDER.CreateRet(LL_BUILDER.CreateNeg(LL_BUILDER.CreateTrunc(
989  // TODO(alex): remove the trunc once pos is converted to 32 bits
990  code_generator.posArg(nullptr),
991  get_int_type(32, LL_CONTEXT))));
992  }
993  }
994  } else {
995  if (ra_exe_unit_.estimator) {
996  std::stack<llvm::BasicBlock*> array_loops;
997  codegenEstimator(array_loops, filter_cfg, query_mem_desc, co);
998  } else {
999  auto arg_it = ROW_FUNC->arg_begin();
1000  std::vector<llvm::Value*> agg_out_vec;
1001  for (int32_t i = 0; i < get_agg_count(ra_exe_unit_.target_exprs); ++i) {
1002  agg_out_vec.push_back(&*arg_it++);
1003  }
1004  can_return_error = codegenAggCalls(std::make_tuple(nullptr, nullptr),
1005  agg_out_vec,
1006  query_mem_desc,
1007  co,
1008  filter_cfg);
1009  }
1010  }
1011  }
1012 
1013  if (ra_exe_unit_.join_quals.empty()) {
1014  executor_->cgen_state_->ir_builder_.CreateRet(LL_INT(int32_t(0)));
1015  } else if (sc_false) {
1016  const auto saved_insert_block = LL_BUILDER.GetInsertBlock();
1017  LL_BUILDER.SetInsertPoint(sc_false);
1018  LL_BUILDER.CreateBr(filter_false);
1019  LL_BUILDER.SetInsertPoint(saved_insert_block);
1020  }
1021 
1022  return can_return_error;
1023 }
1024 
 1025 llvm::Value* GroupByAndAggregate::codegenOutputSlot(
 1026  llvm::Value* groups_buffer,
 1027  const QueryMemoryDescriptor& query_mem_desc,
 1028  const CompilationOptions& co,
1029  DiamondCodegen& diamond_codegen) {
1031  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1032  const auto group_expr = ra_exe_unit_.groupby_exprs.front();
1033  CHECK(!group_expr);
1034  if (!query_mem_desc.didOutputColumnar()) {
1035  CHECK_EQ(size_t(0), query_mem_desc.getRowSize() % sizeof(int64_t));
1036  }
1037  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1038  ? 0
1039  : query_mem_desc.getRowSize() / sizeof(int64_t);
1040  CodeGenerator code_generator(executor_);
1041  if (use_streaming_top_n(ra_exe_unit_, query_mem_desc.didOutputColumnar())) {
1042  const auto& only_order_entry = ra_exe_unit_.sort_info.order_entries.front();
1043  CHECK_GE(only_order_entry.tle_no, int(1));
1044  const size_t target_idx = only_order_entry.tle_no - 1;
1045  CHECK_LT(target_idx, ra_exe_unit_.target_exprs.size());
1046  const auto order_entry_expr = ra_exe_unit_.target_exprs[target_idx];
1047  const auto chosen_bytes =
1048  static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(target_idx));
1049  auto order_entry_lv = executor_->cgen_state_->castToTypeIn(
1050  code_generator.codegen(order_entry_expr, true, co).front(), chosen_bytes * 8);
1052  std::string fname = "get_bin_from_k_heap";
1053  const auto& oe_ti = order_entry_expr->get_type_info();
1054  llvm::Value* null_key_lv = nullptr;
1055  if (oe_ti.is_integer() || oe_ti.is_decimal() || oe_ti.is_time()) {
1056  const size_t bit_width = order_entry_lv->getType()->getIntegerBitWidth();
1057  switch (bit_width) {
1058  case 32:
1059  null_key_lv = LL_INT(static_cast<int32_t>(inline_int_null_val(oe_ti)));
1060  break;
1061  case 64:
1062  null_key_lv = LL_INT(static_cast<int64_t>(inline_int_null_val(oe_ti)));
1063  break;
1064  default:
1065  CHECK(false);
1066  }
1067  fname += "_int" + std::to_string(bit_width) + "_t";
1068  } else {
1069  CHECK(oe_ti.is_fp());
1070  if (order_entry_lv->getType()->isDoubleTy()) {
1071  null_key_lv = LL_FP(static_cast<double>(inline_fp_null_val(oe_ti)));
1072  } else {
1073  null_key_lv = LL_FP(static_cast<float>(inline_fp_null_val(oe_ti)));
1074  }
1075  fname += order_entry_lv->getType()->isDoubleTy() ? "_double" : "_float";
1076  }
1077  const auto key_slot_idx =
1079  return emitCall(
1080  fname,
1081  {groups_buffer,
1082  LL_INT(n),
1083  LL_INT(row_size_quad),
1084  LL_INT(static_cast<uint32_t>(query_mem_desc.getColOffInBytes(key_slot_idx))),
1085  LL_BOOL(only_order_entry.is_desc),
1086  LL_BOOL(!order_entry_expr->get_type_info().get_notnull()),
1087  LL_BOOL(only_order_entry.nulls_first),
1088  null_key_lv,
1089  order_entry_lv});
1090  } else {
1091  llvm::Value* output_buffer_entry_count_lv{nullptr};
1093  output_buffer_entry_count_lv =
1094  LL_BUILDER.CreateLoad(get_arg_by_name(ROW_FUNC, "max_matched"));
1095  CHECK(output_buffer_entry_count_lv);
1096  }
1097  const auto group_expr_lv =
1098  LL_BUILDER.CreateLoad(get_arg_by_name(ROW_FUNC, "old_total_matched"));
1099  std::vector<llvm::Value*> args{
1100  groups_buffer,
1101  output_buffer_entry_count_lv
1102  ? output_buffer_entry_count_lv
1103  : LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1104  group_expr_lv,
1105  code_generator.posArg(nullptr)};
1106  if (query_mem_desc.didOutputColumnar()) {
1107  const auto columnar_output_offset =
1108  emitCall("get_columnar_scan_output_offset", args);
1109  return columnar_output_offset;
1110  }
1111  args.push_back(LL_INT(row_size_quad));
1112  return emitCall("get_scan_output_slot", args);
1113  }
1114 }
1115 
1116 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenGroupBy(
 1117  const QueryMemoryDescriptor& query_mem_desc,
 1118  const CompilationOptions& co,
1119  DiamondCodegen& diamond_codegen) {
1120  auto arg_it = ROW_FUNC->arg_begin();
1121  auto groups_buffer = arg_it++;
1122 
1123  std::stack<llvm::BasicBlock*> array_loops;
1124 
1125  // TODO(Saman): move this logic outside of this function.
1127  if (query_mem_desc.didOutputColumnar()) {
1128  return std::make_tuple(
1129  &*groups_buffer,
1130  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen));
1131  } else {
1132  return std::make_tuple(
1133  codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen),
1134  nullptr);
1135  }
1136  }
1137 
1138  CHECK(query_mem_desc.getQueryDescriptionType() ==
1140  query_mem_desc.getQueryDescriptionType() ==
1142 
1143  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1144  ? 0
1145  : query_mem_desc.getRowSize() / sizeof(int64_t);
1146 
1147  const auto col_width_size = query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1148  ? sizeof(int64_t)
1149  : query_mem_desc.getEffectiveKeyWidth();
1150  // for multi-column group by
1151  llvm::Value* group_key = nullptr;
1152  llvm::Value* key_size_lv = nullptr;
1153 
1154  if (!query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1155  key_size_lv = LL_INT(static_cast<int32_t>(query_mem_desc.getGroupbyColCount()));
1156  if (query_mem_desc.getQueryDescriptionType() ==
1158  group_key =
1159  LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1160  } else if (query_mem_desc.getQueryDescriptionType() ==
1162  group_key =
1163  col_width_size == sizeof(int32_t)
1164  ? LL_BUILDER.CreateAlloca(llvm::Type::getInt32Ty(LL_CONTEXT), key_size_lv)
1165  : LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT), key_size_lv);
1166  }
1167  CHECK(group_key);
1168  CHECK(key_size_lv);
1169  }
1170 
1171  int32_t subkey_idx = 0;
1172  CHECK(query_mem_desc.getGroupbyColCount() == ra_exe_unit_.groupby_exprs.size());
1173  for (const auto group_expr : ra_exe_unit_.groupby_exprs) {
1174  const auto col_range_info = getExprRangeInfo(group_expr.get());
1175  const auto translated_null_value = static_cast<int64_t>(
1176  query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1177  ? checked_int64_t(query_mem_desc.getMaxVal()) +
1178  (query_mem_desc.getBucket() ? query_mem_desc.getBucket() : 1)
1179  : checked_int64_t(col_range_info.max) +
1180  (col_range_info.bucket ? col_range_info.bucket : 1));
1181 
1182  const bool col_has_nulls =
1183  query_mem_desc.getQueryDescriptionType() ==
1185  ? (query_mem_desc.isSingleColumnGroupByWithPerfectHash()
1186  ? query_mem_desc.hasNulls()
1187  : col_range_info.has_nulls)
1188  : false;
1189 
1190  const auto group_expr_lvs =
1191  executor_->groupByColumnCodegen(group_expr.get(),
1192  col_width_size,
1193  co,
1194  col_has_nulls,
1195  translated_null_value,
1196  diamond_codegen,
1197  array_loops,
1198  query_mem_desc.threadsShareMemory());
1199  const auto group_expr_lv = group_expr_lvs.translated_value;
1200  if (query_mem_desc.isSingleColumnGroupByWithPerfectHash()) {
1201  CHECK_EQ(size_t(1), ra_exe_unit_.groupby_exprs.size());
1202  return codegenSingleColumnPerfectHash(query_mem_desc,
1203  co,
1204  &*groups_buffer,
1205  group_expr_lv,
1206  group_expr_lvs.original_value,
1207  row_size_quad);
1208  } else {
1209  // store the sub-key to the buffer
1210  LL_BUILDER.CreateStore(group_expr_lv,
1211  LL_BUILDER.CreateGEP(group_key, LL_INT(subkey_idx++)));
1212  }
1213  }
 1214  if (query_mem_desc.getQueryDescriptionType() ==
 1215  QueryDescriptionType::GroupByPerfectHash) {
 1216  CHECK(ra_exe_unit_.groupby_exprs.size() != 1);
 1217  return codegenMultiColumnPerfectHash(
 1218  &*groups_buffer, group_key, key_size_lv, query_mem_desc, row_size_quad);
 1219  } else if (query_mem_desc.getQueryDescriptionType() ==
 1220  QueryDescriptionType::GroupByBaselineHash) {
 1221  return codegenMultiColumnBaselineHash(co,
 1222  &*groups_buffer,
1223  group_key,
1224  key_size_lv,
1225  query_mem_desc,
1226  col_width_size,
1227  row_size_quad);
1228  }
1229  CHECK(false);
1230  return std::make_tuple(nullptr, nullptr);
1231 }
1232 
 1233 std::tuple<llvm::Value*, llvm::Value*>
 1234 GroupByAndAggregate::codegenSingleColumnPerfectHash(
 1235  const QueryMemoryDescriptor& query_mem_desc,
 1236  const CompilationOptions& co,
1237  llvm::Value* groups_buffer,
1238  llvm::Value* group_expr_lv_translated,
1239  llvm::Value* group_expr_lv_original,
1240  const int32_t row_size_quad) {
1241  CHECK(query_mem_desc.usesGetGroupValueFast());
1242  std::string get_group_fn_name{query_mem_desc.didOutputColumnar()
1243  ? "get_columnar_group_bin_offset"
1244  : "get_group_value_fast"};
1245  if (!query_mem_desc.didOutputColumnar() && query_mem_desc.hasKeylessHash()) {
1246  get_group_fn_name += "_keyless";
1247  }
1248  if (query_mem_desc.interleavedBins(co.device_type_)) {
1249  CHECK(!query_mem_desc.didOutputColumnar());
1250  CHECK(query_mem_desc.hasKeylessHash());
1251  get_group_fn_name += "_semiprivate";
1252  }
1253  std::vector<llvm::Value*> get_group_fn_args{&*groups_buffer,
1254  &*group_expr_lv_translated};
1255  if (group_expr_lv_original && get_group_fn_name == "get_group_value_fast" &&
1256  query_mem_desc.mustUseBaselineSort()) {
1257  get_group_fn_name += "_with_original_key";
1258  get_group_fn_args.push_back(group_expr_lv_original);
1259  }
1260  get_group_fn_args.push_back(LL_INT(query_mem_desc.getMinVal()));
1261  get_group_fn_args.push_back(LL_INT(query_mem_desc.getBucket()));
1262  if (!query_mem_desc.hasKeylessHash()) {
1263  if (!query_mem_desc.didOutputColumnar()) {
1264  get_group_fn_args.push_back(LL_INT(row_size_quad));
1265  }
1266  } else {
1267  if (!query_mem_desc.didOutputColumnar()) {
1268  get_group_fn_args.push_back(LL_INT(row_size_quad));
1269  }
1270  if (query_mem_desc.interleavedBins(co.device_type_)) {
1271  auto warp_idx = emitCall("thread_warp_idx", {LL_INT(executor_->warpSize())});
1272  get_group_fn_args.push_back(warp_idx);
1273  get_group_fn_args.push_back(LL_INT(executor_->warpSize()));
1274  }
1275  }
1276  if (get_group_fn_name == "get_columnar_group_bin_offset") {
1277  return std::make_tuple(&*groups_buffer,
1278  emitCall(get_group_fn_name, get_group_fn_args));
1279  }
1280  return std::make_tuple(emitCall(get_group_fn_name, get_group_fn_args), nullptr);
1281 }
1282 
1283 std::tuple<llvm::Value*, llvm::Value*> GroupByAndAggregate::codegenMultiColumnPerfectHash(
1284  llvm::Value* groups_buffer,
1285  llvm::Value* group_key,
1286  llvm::Value* key_size_lv,
1287  const QueryMemoryDescriptor& query_mem_desc,
1288  const int32_t row_size_quad) {
 1289  CHECK(query_mem_desc.getQueryDescriptionType() ==
 1290  QueryDescriptionType::GroupByPerfectHash);
1291  // compute the index (perfect hash)
1292  auto perfect_hash_func = codegenPerfectHashFunction();
1293  auto hash_lv =
1294  LL_BUILDER.CreateCall(perfect_hash_func, std::vector<llvm::Value*>{group_key});
1295 
1296  if (query_mem_desc.didOutputColumnar()) {
1297  const std::string set_matching_func_name{
1298  "set_matching_group_value_perfect_hash_columnar"};
1299  const std::vector<llvm::Value*> set_matching_func_arg{
1300  groups_buffer,
1301  hash_lv,
1302  group_key,
1303  key_size_lv,
1304  llvm::ConstantInt::get(get_int_type(32, LL_CONTEXT),
1305  query_mem_desc.getEntryCount())};
1306  emitCall(set_matching_func_name, set_matching_func_arg);
1307  return std::make_tuple(groups_buffer, hash_lv);
1308  } else {
1309  return std::make_tuple(
1310  emitCall("get_matching_group_value_perfect_hash",
1311  {groups_buffer, hash_lv, group_key, key_size_lv, LL_INT(row_size_quad)}),
1312  nullptr);
1313  }
1314 }
1315 
 1316 std::tuple<llvm::Value*, llvm::Value*>
 1317 GroupByAndAggregate::codegenMultiColumnBaselineHash(
 1318  const CompilationOptions& co,
1319  llvm::Value* groups_buffer,
1320  llvm::Value* group_key,
1321  llvm::Value* key_size_lv,
1322  const QueryMemoryDescriptor& query_mem_desc,
1323  const size_t key_width,
1324  const int32_t row_size_quad) {
1325  auto arg_it = ROW_FUNC->arg_begin(); // groups_buffer
1326  ++arg_it; // current match count
1327  ++arg_it; // total match count
1328  ++arg_it; // old match count
1329  ++arg_it; // output buffer slots count
1330  ++arg_it; // aggregate init values
1331  CHECK(arg_it->getName() == "agg_init_val");
1332  if (group_key->getType() != llvm::Type::getInt64PtrTy(LL_CONTEXT)) {
1333  CHECK(key_width == sizeof(int32_t));
1334  group_key =
1335  LL_BUILDER.CreatePointerCast(group_key, llvm::Type::getInt64PtrTy(LL_CONTEXT));
1336  }
1337  std::vector<llvm::Value*> func_args{
1338  groups_buffer,
1339  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount())),
1340  &*group_key,
1341  &*key_size_lv,
1342  LL_INT(static_cast<int32_t>(key_width))};
1343  std::string func_name{"get_group_value"};
1344  if (query_mem_desc.didOutputColumnar()) {
1345  func_name += "_columnar_slot";
1346  } else {
1347  func_args.push_back(LL_INT(row_size_quad));
1348  func_args.push_back(&*arg_it);
1349  }
1350  if (co.with_dynamic_watchdog_) {
1351  func_name += "_with_watchdog";
1352  }
1353  if (query_mem_desc.didOutputColumnar()) {
1354  return std::make_tuple(groups_buffer, emitCall(func_name, func_args));
1355  } else {
1356  return std::make_tuple(emitCall(func_name, func_args), nullptr);
1357  }
1358 }
1359 
 1360 llvm::Function* GroupByAndAggregate::codegenPerfectHashFunction() {
 1361  CHECK_GT(ra_exe_unit_.groupby_exprs.size(), size_t(1));
1362  auto ft = llvm::FunctionType::get(
1363  get_int_type(32, LL_CONTEXT),
1364  std::vector<llvm::Type*>{llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)},
1365  false);
1366  auto key_hash_func = llvm::Function::Create(ft,
1367  llvm::Function::ExternalLinkage,
1368  "perfect_key_hash",
1369  executor_->cgen_state_->module_);
1370  executor_->cgen_state_->helper_functions_.push_back(key_hash_func);
1371  mark_function_always_inline(key_hash_func);
1372  auto& key_buff_arg = *key_hash_func->args().begin();
1373  llvm::Value* key_buff_lv = &key_buff_arg;
1374  auto bb = llvm::BasicBlock::Create(LL_CONTEXT, "entry", key_hash_func);
1375  llvm::IRBuilder<> key_hash_func_builder(bb);
1376  llvm::Value* hash_lv{llvm::ConstantInt::get(get_int_type(64, LL_CONTEXT), 0)};
1377  std::vector<int64_t> cardinalities;
1378  for (const auto groupby_expr : ra_exe_unit_.groupby_exprs) {
1379  auto col_range_info = getExprRangeInfo(groupby_expr.get());
1380  CHECK(col_range_info.hash_type_ == QueryDescriptionType::GroupByPerfectHash);
1381  cardinalities.push_back(getBucketedCardinality(col_range_info));
1382  }
1383  size_t dim_idx = 0;
1384  for (const auto groupby_expr : ra_exe_unit_.groupby_exprs) {
1385  auto key_comp_lv = key_hash_func_builder.CreateLoad(
1386  key_hash_func_builder.CreateGEP(key_buff_lv, LL_INT(dim_idx)));
1387  auto col_range_info = getExprRangeInfo(groupby_expr.get());
1388  auto crt_term_lv =
1389  key_hash_func_builder.CreateSub(key_comp_lv, LL_INT(col_range_info.min));
1390  if (col_range_info.bucket) {
1391  crt_term_lv =
1392  key_hash_func_builder.CreateSDiv(crt_term_lv, LL_INT(col_range_info.bucket));
1393  }
1394  for (size_t prev_dim_idx = 0; prev_dim_idx < dim_idx; ++prev_dim_idx) {
1395  crt_term_lv = key_hash_func_builder.CreateMul(crt_term_lv,
1396  LL_INT(cardinalities[prev_dim_idx]));
1397  }
1398  hash_lv = key_hash_func_builder.CreateAdd(hash_lv, crt_term_lv);
1399  ++dim_idx;
1400  }
1401  key_hash_func_builder.CreateRet(
1402  key_hash_func_builder.CreateTrunc(hash_lv, get_int_type(32, LL_CONTEXT)));
1403  return key_hash_func;
1404 }
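
// Worked example of the generated hash (hypothetical two-column group by):
// with a in [10, 19] (cardinality 10) and b in [0, 4] (cardinality 5), no buckets,
// the emitted function computes hash(a, b) = (a - 10) + (b - 0) * 10, i.e. a
// mixed-radix index into a dense 10 x 5 output buffer, truncated to i32.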
1405 
 1406 llvm::Value* GroupByAndAggregate::convertNullIfAny(const SQLTypeInfo& arg_type,
 1407  const TargetInfo& agg_info,
1408  llvm::Value* target) {
1409  const auto& agg_type = agg_info.sql_type;
1410  const size_t chosen_bytes = agg_type.get_size();
1411 
1412  bool need_conversion{false};
1413  llvm::Value* arg_null{nullptr};
1414  llvm::Value* agg_null{nullptr};
1415  llvm::Value* target_to_cast{target};
1416  if (arg_type.is_fp()) {
1417  arg_null = executor_->cgen_state_->inlineFpNull(arg_type);
1418  if (agg_type.is_fp()) {
1419  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1420  if (!static_cast<llvm::ConstantFP*>(arg_null)->isExactlyValue(
1421  static_cast<llvm::ConstantFP*>(agg_null)->getValueAPF())) {
1422  need_conversion = true;
1423  }
1424  } else {
1425  CHECK(agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT);
1426  return target;
1427  }
1428  } else {
1429  arg_null = executor_->cgen_state_->inlineIntNull(arg_type);
1430  if (agg_type.is_fp()) {
1431  agg_null = executor_->cgen_state_->inlineFpNull(agg_type);
1432  need_conversion = true;
1433  target_to_cast = executor_->castToFP(target);
1434  } else {
1435  agg_null = executor_->cgen_state_->inlineIntNull(agg_type);
1436  if ((static_cast<llvm::ConstantInt*>(arg_null)->getBitWidth() !=
1437  static_cast<llvm::ConstantInt*>(agg_null)->getBitWidth()) ||
1438  (static_cast<llvm::ConstantInt*>(arg_null)->getValue() !=
1439  static_cast<llvm::ConstantInt*>(agg_null)->getValue())) {
1440  need_conversion = true;
1441  }
1442  }
1443  }
1444  if (need_conversion) {
1445  auto cmp = arg_type.is_fp() ? LL_BUILDER.CreateFCmpOEQ(target, arg_null)
1446  : LL_BUILDER.CreateICmpEQ(target, arg_null);
1447  return LL_BUILDER.CreateSelect(
1448  cmp,
1449  agg_null,
1450  executor_->cgen_state_->castToTypeIn(target_to_cast, chosen_bytes << 3));
1451  } else {
1452  return target;
1453  }
1454 }
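
// Example of the conversion above (assuming the usual type-minimum NULL sentinels):
// with an arg type of SMALLINT (NULL = -32768) and an agg type of BIGINT
// (NULL = INT64_MIN), the bit widths differ, so the emitted IR is roughly
//   select(target == -32768, INT64_MIN, (int64_t)target)
// When the two sentinels already match bit-for-bit, the value is passed through.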
1455 
 1456 llvm::Value* GroupByAndAggregate::codegenWindowRowPointer(
 1457  const Analyzer::WindowFunction* window_func,
1458  const QueryMemoryDescriptor& query_mem_desc,
1459  const CompilationOptions& co,
1460  DiamondCodegen& diamond_codegen) {
1461  const auto window_func_context =
1463  if (window_func_context && window_function_is_aggregate(window_func->getKind())) {
1464  const int32_t row_size_quad = query_mem_desc.didOutputColumnar()
1465  ? 0
1466  : query_mem_desc.getRowSize() / sizeof(int64_t);
1467  auto arg_it = ROW_FUNC->arg_begin();
1468  auto groups_buffer = arg_it++;
1469  CodeGenerator code_generator(executor_);
1470  if (!window_func_context->getRowNumber()) {
1471  CHECK(window_func->getKind() == SqlWindowFunctionKind::COUNT);
1472  window_func_context->setRowNumber(emitCall(
1473  "row_number_window_func",
1474  {LL_INT(reinterpret_cast<const int64_t>(window_func_context->output())),
1475  code_generator.posArg(nullptr)}));
1476  }
1477  const auto pos_in_window = LL_BUILDER.CreateTrunc(window_func_context->getRowNumber(),
1478  get_int_type(32, LL_CONTEXT));
1479  llvm::Value* entry_count_lv =
1480  LL_INT(static_cast<int32_t>(query_mem_desc.getEntryCount()));
1481  std::vector<llvm::Value*> args{
1482  &*groups_buffer, entry_count_lv, pos_in_window, code_generator.posArg(nullptr)};
1483  if (query_mem_desc.didOutputColumnar()) {
1484  const auto columnar_output_offset =
1485  emitCall("get_columnar_scan_output_offset", args);
1486  return LL_BUILDER.CreateSExt(columnar_output_offset, get_int_type(64, LL_CONTEXT));
1487  }
1488  args.push_back(LL_INT(row_size_quad));
1489  return emitCall("get_scan_output_slot", args);
1490  }
1491  auto arg_it = ROW_FUNC->arg_begin();
1492  auto groups_buffer = arg_it++;
1493  return codegenOutputSlot(&*groups_buffer, query_mem_desc, co, diamond_codegen);
1494 }
1495 
 1496 bool GroupByAndAggregate::codegenAggCalls(
 1497  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx_in,
1498  const std::vector<llvm::Value*>& agg_out_vec,
1499  const QueryMemoryDescriptor& query_mem_desc,
1500  const CompilationOptions& co,
1501  DiamondCodegen& diamond_codegen) {
1502  auto agg_out_ptr_w_idx = agg_out_ptr_w_idx_in;
1503  // TODO(alex): unify the two cases, the output for non-group by queries
1504  // should be a contiguous buffer
1505  const bool is_group_by{std::get<0>(agg_out_ptr_w_idx)};
1506  bool can_return_error = false;
1507  if (is_group_by) {
1508  CHECK(agg_out_vec.empty());
1509  } else {
1510  CHECK(!agg_out_vec.empty());
1511  }
1512 
 1513  // The output buffer is cast into a byte stream to be able to handle data elements of
 1514  // different sizes (only used when actual column width sizes are used).
1515  llvm::Value* output_buffer_byte_stream{nullptr};
1516  llvm::Value* out_row_idx{nullptr};
1517  if (query_mem_desc.didOutputColumnar() && !g_cluster &&
1519  output_buffer_byte_stream = LL_BUILDER.CreateBitCast(
1520  std::get<0>(agg_out_ptr_w_idx),
1521  llvm::PointerType::get(llvm::Type::getInt8Ty(LL_CONTEXT), 0));
1522  output_buffer_byte_stream->setName("out_buff_b_stream");
1523  CHECK(std::get<1>(agg_out_ptr_w_idx));
1524  out_row_idx = LL_BUILDER.CreateZExt(std::get<1>(agg_out_ptr_w_idx),
1525  llvm::Type::getInt64Ty(LL_CONTEXT));
1526  out_row_idx->setName("out_row_idx");
1527  }
1528 
1529  TargetExprCodegenBuilder target_builder(query_mem_desc, ra_exe_unit_, is_group_by);
1530  for (size_t target_idx = 0; target_idx < ra_exe_unit_.target_exprs.size();
1531  ++target_idx) {
1532  auto target_expr = ra_exe_unit_.target_exprs[target_idx];
1533  CHECK(target_expr);
1534 
1535  target_builder(target_expr, executor_, co);
1536  }
1537 
1538  target_builder.codegen(this,
1539  executor_,
1540  query_mem_desc,
1541  co,
1542  agg_out_ptr_w_idx,
1543  agg_out_vec,
1544  output_buffer_byte_stream,
1545  out_row_idx,
1546  diamond_codegen);
1547 
1548  for (auto target_expr : ra_exe_unit_.target_exprs) {
1549  CHECK(target_expr);
1550  executor_->plan_state_->isLazyFetchColumn(target_expr);
1551  }
1552 
1553  return can_return_error;
1554 }
1555 
 1559 llvm::Value* GroupByAndAggregate::codegenAggColumnPtr(
 1560  llvm::Value* output_buffer_byte_stream,
1561  llvm::Value* out_row_idx,
1562  const std::tuple<llvm::Value*, llvm::Value*>& agg_out_ptr_w_idx,
1563  const QueryMemoryDescriptor& query_mem_desc,
1564  const size_t chosen_bytes,
1565  const size_t agg_out_off,
1566  const size_t target_idx) {
1567  llvm::Value* agg_col_ptr{nullptr};
1568  if (query_mem_desc.didOutputColumnar()) {
1569  // TODO(Saman): remove the second columnar branch, and support all query description
1570  // types through the first branch. Then, input arguments should also be cleaned up
1571  if (!g_cluster &&
1573  CHECK(chosen_bytes == 1 || chosen_bytes == 2 || chosen_bytes == 4 ||
1574  chosen_bytes == 8);
1575  CHECK(output_buffer_byte_stream);
1576  CHECK(out_row_idx);
1577  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1578  // multiplying by chosen_bytes, i.e., << log2(chosen_bytes)
1579  auto out_per_col_byte_idx =
1580  LL_BUILDER.CreateShl(out_row_idx, __builtin_ffs(chosen_bytes) - 1);
1581  auto byte_offset = LL_BUILDER.CreateAdd(out_per_col_byte_idx,
1582  LL_INT(static_cast<int64_t>(col_off)));
1583  byte_offset->setName("out_byte_off_target_" + std::to_string(target_idx));
1584  auto output_ptr = LL_BUILDER.CreateGEP(output_buffer_byte_stream, byte_offset);
1585  agg_col_ptr = LL_BUILDER.CreateBitCast(
1586  output_ptr,
1587  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0));
1588  agg_col_ptr->setName("out_ptr_target_" + std::to_string(target_idx));
1589  } else {
1590  uint32_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
1591  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1592  col_off /= chosen_bytes;
1593  CHECK(std::get<1>(agg_out_ptr_w_idx));
1594  auto offset = LL_BUILDER.CreateAdd(std::get<1>(agg_out_ptr_w_idx), LL_INT(col_off));
1595  agg_col_ptr = LL_BUILDER.CreateGEP(
1596  LL_BUILDER.CreateBitCast(
1597  std::get<0>(agg_out_ptr_w_idx),
1598  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0)),
1599  offset);
1600  }
1601  } else {
1602  uint32_t col_off = query_mem_desc.getColOnlyOffInBytes(agg_out_off);
1603  CHECK_EQ(size_t(0), col_off % chosen_bytes);
1604  col_off /= chosen_bytes;
1605  agg_col_ptr = LL_BUILDER.CreateGEP(
1606  LL_BUILDER.CreateBitCast(
1607  std::get<0>(agg_out_ptr_w_idx),
1608  llvm::PointerType::get(get_int_type((chosen_bytes << 3), LL_CONTEXT), 0)),
1609  LL_INT(col_off));
1610  }
1611  CHECK(agg_col_ptr);
1612  return agg_col_ptr;
1613 }
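// For intuition, a rough host-side equivalent of the address computed by the columnar
// projection branch above (illustrative only; assumes an 8-byte slot):
//
//   int8_t* byte_stream = /* output_buffer_byte_stream */;
//   const size_t col_off = query_mem_desc.getColOffInBytes(agg_out_off);
//   auto* slot = reinterpret_cast<int64_t*>(byte_stream + col_off + (out_row_idx << 3));
//
// The other two branches index instead from the base pointer in
// std::get<0>(agg_out_ptr_w_idx), with offsets expressed in chosen_bytes-sized slots
// rather than bytes.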
1614 
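// Codegen for the cardinality-estimator path: each component of the estimator argument
// is evaluated like a group-by key, packed into a stack-allocated int64 buffer, and
// passed, together with the estimator buffer taken from the row function's first
// argument, to the estimator's runtime update function.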
1615 void GroupByAndAggregate::codegenEstimator(
1616  std::stack<llvm::BasicBlock*>& array_loops,
1617  GroupByAndAggregate::DiamondCodegen& diamond_codegen,
1618  const QueryMemoryDescriptor& query_mem_desc,
1619  const CompilationOptions& co) {
1620  const auto& estimator_arg = ra_exe_unit_.estimator->getArgument();
1621  auto estimator_comp_count_lv = LL_INT(static_cast<int32_t>(estimator_arg.size()));
1622  auto estimator_key_lv = LL_BUILDER.CreateAlloca(llvm::Type::getInt64Ty(LL_CONTEXT),
1623  estimator_comp_count_lv);
1624  int32_t subkey_idx = 0;
1625  for (const auto estimator_arg_comp : estimator_arg) {
1626  const auto estimator_arg_comp_lvs =
1627  executor_->groupByColumnCodegen(estimator_arg_comp.get(),
1628  query_mem_desc.getEffectiveKeyWidth(),
1629  co,
1630  false,
1631  0,
1632  diamond_codegen,
1633  array_loops,
1634  true);
1635  CHECK(!estimator_arg_comp_lvs.original_value);
1636  const auto estimator_arg_comp_lv = estimator_arg_comp_lvs.translated_value;
1637  // store the sub-key to the buffer
1638  LL_BUILDER.CreateStore(estimator_arg_comp_lv,
1639  LL_BUILDER.CreateGEP(estimator_key_lv, LL_INT(subkey_idx++)));
1640  }
1641  const auto int8_ptr_ty = llvm::PointerType::get(get_int_type(8, LL_CONTEXT), 0);
1642  const auto bitmap = LL_BUILDER.CreateBitCast(&*ROW_FUNC->arg_begin(), int8_ptr_ty);
1643  const auto key_bytes = LL_BUILDER.CreateBitCast(estimator_key_lv, int8_ptr_ty);
1644  const auto estimator_comp_bytes_lv =
1645  LL_INT(static_cast<int32_t>(estimator_arg.size() * sizeof(int64_t)));
1646  const auto bitmap_size_lv =
1647  LL_INT(static_cast<uint32_t>(ra_exe_unit_.estimator->getBufferSize()));
1648  emitCall(ra_exe_unit_.estimator->getRuntimeFunctionName(),
1649  {bitmap, &*bitmap_size_lv, key_bytes, &*estimator_comp_bytes_lv});
1650 }
1651 
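// Fallback COUNT(DISTINCT) implementations used when the count-distinct descriptor is
// not bitmap-based: the 64-bit agg slot holds a pointer to a std::set<int64_t> into
// which each (non-skip) value is inserted. These are invoked via emitExternalCall from
// codegenCountDistinct below.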
1652 extern "C" void agg_count_distinct(int64_t* agg, const int64_t val) {
1653  reinterpret_cast<std::set<int64_t>*>(*agg)->insert(val);
1654 }
1655 
1656 extern "C" void agg_count_distinct_skip_val(int64_t* agg,
1657  const int64_t val,
1658  const int64_t skip_val) {
1659  if (val != skip_val) {
1660  agg_count_distinct(agg, val);
1661  }
1662 }
1663 
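// Emits the aggregate call for COUNT(DISTINCT ...) and APPROX_COUNT_DISTINCT targets.
// Approximate counts always use the bitmap (HyperLogLog) path; exact counts use either
// a bitmap or the std::set fallback above, with "_skip_val" and "_gpu" suffixes
// appended to the runtime function name as needed.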
1664 void GroupByAndAggregate::codegenCountDistinct(
1665  const size_t target_idx,
1666  const Analyzer::Expr* target_expr,
1667  std::vector<llvm::Value*>& agg_args,
1668  const QueryMemoryDescriptor& query_mem_desc,
1669  const ExecutorDeviceType device_type) {
1670  const auto agg_info = get_target_info(target_expr, g_bigint_count);
1671  const auto& arg_ti =
1672  static_cast<const Analyzer::AggExpr*>(target_expr)->get_arg()->get_type_info();
1673  if (arg_ti.is_fp()) {
1674  agg_args.back() = executor_->cgen_state_->ir_builder_.CreateBitCast(
1675  agg_args.back(), get_int_type(64, executor_->cgen_state_->context_));
1676  }
1677  const auto& count_distinct_descriptor =
1678  query_mem_desc.getCountDistinctDescriptor(target_idx);
1679  CHECK(count_distinct_descriptor.impl_type_ != CountDistinctImplType::Invalid);
1680  if (agg_info.agg_kind == kAPPROX_COUNT_DISTINCT) {
1681  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1682  agg_args.push_back(LL_INT(int32_t(count_distinct_descriptor.bitmap_sz_bits)));
1683  if (device_type == ExecutorDeviceType::GPU) {
1684  const auto base_dev_addr = getAdditionalLiteral(-1);
1685  const auto base_host_addr = getAdditionalLiteral(-2);
1686  agg_args.push_back(base_dev_addr);
1687  agg_args.push_back(base_host_addr);
1688  emitCall("agg_approximate_count_distinct_gpu", agg_args);
1689  } else {
1690  emitCall("agg_approximate_count_distinct", agg_args);
1691  }
1692  return;
1693  }
1694  std::string agg_fname{"agg_count_distinct"};
1695  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1696  agg_fname += "_bitmap";
1697  agg_args.push_back(LL_INT(static_cast<int64_t>(count_distinct_descriptor.min_val)));
1698  }
1699  if (agg_info.skip_null_val) {
1700  auto null_lv = executor_->cgen_state_->castToTypeIn(
1701  (arg_ti.is_fp()
1702  ? static_cast<llvm::Value*>(executor_->cgen_state_->inlineFpNull(arg_ti))
1703  : static_cast<llvm::Value*>(executor_->cgen_state_->inlineIntNull(arg_ti))),
1704  64);
1705  null_lv = executor_->cgen_state_->ir_builder_.CreateBitCast(
1706  null_lv, get_int_type(64, executor_->cgen_state_->context_));
1707  agg_fname += "_skip_val";
1708  agg_args.push_back(null_lv);
1709  }
1710  if (device_type == ExecutorDeviceType::GPU) {
1711  CHECK(count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap);
1712  agg_fname += "_gpu";
1713  const auto base_dev_addr = getAdditionalLiteral(-1);
1714  const auto base_host_addr = getAdditionalLiteral(-2);
1715  agg_args.push_back(base_dev_addr);
1716  agg_args.push_back(base_host_addr);
1717  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.sub_bitmap_count)));
1718  CHECK_EQ(size_t(0),
1719  count_distinct_descriptor.bitmapPaddedSizeBytes() %
1720  count_distinct_descriptor.sub_bitmap_count);
1721  agg_args.push_back(LL_INT(int64_t(count_distinct_descriptor.bitmapPaddedSizeBytes() /
1722  count_distinct_descriptor.sub_bitmap_count)));
1723  }
1724  if (count_distinct_descriptor.impl_type_ == CountDistinctImplType::Bitmap) {
1725  emitCall(agg_fname, agg_args);
1726  } else {
1727  executor_->cgen_state_->emitExternalCall(
1728  agg_fname, llvm::Type::getVoidTy(LL_CONTEXT), agg_args);
1729  }
1730 }
1731 
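// Additional literals, e.g. the device and host base addresses for GPU count-distinct
// bitmaps requested via getAdditionalLiteral(-1) / (-2) above, are stored as 64-bit
// values at negative offsets from the row function's "literals" argument.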
1732 llvm::Value* GroupByAndAggregate::getAdditionalLiteral(const int32_t off) {
1733  CHECK_LT(off, 0);
1734  const auto lit_buff_lv = get_arg_by_name(ROW_FUNC, "literals");
1735  return LL_BUILDER.CreateLoad(LL_BUILDER.CreateGEP(
1736  LL_BUILDER.CreateBitCast(lit_buff_lv,
1737  llvm::PointerType::get(get_int_type(64, LL_CONTEXT), 0)),
1738  LL_INT(off)));
1739 }
1740 
1741 std::vector<llvm::Value*> GroupByAndAggregate::codegenAggArg(
1742  const Analyzer::Expr* target_expr,
1743  const CompilationOptions& co) {
1744  const auto agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr);
1745  // TODO(alex): handle arrays uniformly?
1746  CodeGenerator code_generator(executor_);
1747  if (target_expr) {
1748  const auto& target_ti = target_expr->get_type_info();
1749  if (target_ti.is_array() && !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
1750  const auto target_lvs =
1751  agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
1752  : code_generator.codegen(
1753  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
1754  if (target_ti.isChunkIteratorPackaging()) {
1755  // A value packaged behind a chunk iterator comes from a materialized array
1756  // column, i.e., from a source other than an ARRAY[] expression.
1757  CHECK_EQ(size_t(1), target_lvs.size());
1758  CHECK(!agg_expr || agg_expr->get_aggtype() == kSAMPLE);
1759  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
1760  const auto i8p_ty =
1761  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
1762  const auto& elem_ti = target_ti.get_elem_type();
1763  return {
1764  executor_->cgen_state_->emitExternalCall(
1765  "array_buff",
1766  i8p_ty,
1767  {target_lvs.front(), code_generator.posArg(target_expr)}),
1768  executor_->cgen_state_->emitExternalCall(
1769  "array_size",
1770  i32_ty,
1771  {target_lvs.front(),
1772  code_generator.posArg(target_expr),
1773  executor_->cgen_state_->llInt(log2_bytes(elem_ti.get_logical_size()))})};
1774  } else if (target_ti.isStandardBufferPackaging()) {
1775  if (agg_expr) {
1776  throw std::runtime_error(
1777  "Using array[] operator as argument to an aggregate operator is not "
1778  "supported");
1779  }
1780  return {target_lvs[0], target_lvs[1]};
1781  }
1782  }
1783  if (target_ti.is_geometry() &&
1784  !executor_->plan_state_->isLazyFetchColumn(target_expr)) {
1785  auto generate_coord_lvs =
1786  [&](auto* selected_target_expr,
1787  bool const fetch_columns) -> std::vector<llvm::Value*> {
1788  const auto target_lvs =
1789  code_generator.codegen(selected_target_expr, fetch_columns, co);
1790  const auto geo_expr = dynamic_cast<const Analyzer::GeoExpr*>(target_expr);
1791  if (geo_expr) {
1792  CHECK_EQ(2 * static_cast<size_t>(target_ti.get_physical_coord_cols()),
1793  target_lvs.size());
1794  return target_lvs;
1795  }
1796  CHECK_EQ(static_cast<size_t>(target_ti.get_physical_coord_cols()),
1797  target_lvs.size());
1798 
1799  const auto i32_ty = get_int_type(32, executor_->cgen_state_->context_);
1800  const auto i8p_ty =
1801  llvm::PointerType::get(get_int_type(8, executor_->cgen_state_->context_), 0);
1802  std::vector<llvm::Value*> coords;
1803  size_t ctr = 0;
1804  for (const auto& target_lv : target_lvs) {
1805  // TODO(adb): consider adding a utility to sqltypes so we can get the types of
1806  // the physical coords cols based on the sqltype (e.g. TINYINT for col 0, INT
1807  // for col 1 for polys / mpolys, etc). Hardcoded for now: the first array is the
1808  // coords array (TINYINT); subsequent arrays are regular INT.
1809 
1810  const size_t elem_sz = ctr == 0 ? 1 : 4;
1811  ctr++;
1812  int32_t fixlen = -1;
1813  if (target_ti.get_type() == kPOINT) {
1814  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
1815  if (col_var) {
1816  const auto coords_cd = executor_->getPhysicalColumnDescriptor(col_var, 1);
1817  if (coords_cd && coords_cd->columnType.get_type() == kARRAY) {
1818  fixlen = coords_cd->columnType.get_size();
1819  }
1820  }
1821  }
1822  if (fixlen > 0) {
1823  coords.push_back(executor_->cgen_state_->emitExternalCall(
1824  "fast_fixlen_array_buff",
1825  i8p_ty,
1826  {target_lv, code_generator.posArg(selected_target_expr)}));
1827  coords.push_back(executor_->cgen_state_->llInt(int64_t(fixlen)));
1828  continue;
1829  }
1830  coords.push_back(executor_->cgen_state_->emitExternalCall(
1831  "array_buff",
1832  i8p_ty,
1833  {target_lv, code_generator.posArg(selected_target_expr)}));
1834  coords.push_back(executor_->cgen_state_->emitExternalCall(
1835  "array_size",
1836  i32_ty,
1837  {target_lv,
1838  code_generator.posArg(selected_target_expr),
1839  executor_->cgen_state_->llInt(log2_bytes(elem_sz))}));
1840  }
1841  return coords;
1842  };
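      // For geo expressions the codegen above already yields the coord/size pairs;
      // for geo columns the lambda returns, per physical coords column, a buffer
      // pointer plus either a fixed-length constant (fast path for POINT coords) or
      // an array_size call.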
1843 
1844  if (agg_expr) {
1845  return generate_coord_lvs(agg_expr->get_arg(), true);
1846  } else {
1847  return generate_coord_lvs(target_expr,
1848  !executor_->plan_state_->allow_lazy_fetch_);
1849  }
1850  }
1851  }
1852  return agg_expr ? code_generator.codegen(agg_expr->get_arg(), true, co)
1853  : code_generator.codegen(
1854  target_expr, !executor_->plan_state_->allow_lazy_fetch_, co);
1855 }
1856 
1857 llvm::Value* GroupByAndAggregate::emitCall(const std::string& fname,
1858  const std::vector<llvm::Value*>& args) {
1859  return executor_->cgen_state_->emitCall(fname, args);
1860 }
1861 
1862 #undef ROW_FUNC
1863 #undef LL_FP
1864 #undef LL_INT
1865 #undef LL_BOOL
1866 #undef LL_BUILDER
1867 #undef LL_CONTEXT
1868 
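// Returns the table's shard count when the query is a LIMITed, single-order-entry sort
// whose group-by includes the table's sharding column (so each shard can produce its
// top groups independently); returns 0 otherwise.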
1869 size_t GroupByAndAggregate::shard_count_for_top_groups(
1870  const RelAlgExecutionUnit& ra_exe_unit,
1871  const Catalog_Namespace::Catalog& catalog) {
1872  if (ra_exe_unit.sort_info.order_entries.size() != 1 || !ra_exe_unit.sort_info.limit) {
1873  return 0;
1874  }
1875  for (const auto& group_expr : ra_exe_unit.groupby_exprs) {
1876  const auto grouped_col_expr =
1877  dynamic_cast<const Analyzer::ColumnVar*>(group_expr.get());
1878  if (!grouped_col_expr) {
1879  continue;
1880  }
1881  if (grouped_col_expr->get_table_id() <= 0) {
1882  return 0;
1883  }
1884  const auto td = catalog.getMetadataForTable(grouped_col_expr->get_table_id());
1885  if (td->shardedColumnId == grouped_col_expr->get_column_id()) {
1886  return td->nShards;
1887  }
1888  }
1889  return 0;
1890 }