QueryMemoryDescriptor.cpp (OmniSciDB 1dac507f6e)
1 /*
2  * Copyright 2018 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "QueryMemoryDescriptor.h"
18 
19 #include "../Execute.h"
20 #include "../ExpressionRewrite.h"
21 #include "../GroupByAndAggregate.h"
22 #include "../StreamingTopN.h"
23 #include "../UsedColumnsVisitor.h"
24 #include "ColSlotContext.h"
25 
26 extern bool g_enable_smem_group_by;
27 extern bool g_enable_columnar_output;
28 
29 namespace {
30 
31 bool is_int_and_no_bigger_than(const SQLTypeInfo& ti, const size_t byte_width) {
32  if (!ti.is_integer()) {
33  return false;
34  }
35  return get_bit_width(ti) <= (byte_width * 8);
36 }
37 
38 std::vector<ssize_t> target_expr_group_by_indices(
39  const std::list<std::shared_ptr<Analyzer::Expr>>& groupby_exprs,
40  const std::vector<Analyzer::Expr*>& target_exprs) {
41  std::vector<ssize_t> indices(target_exprs.size(), -1);
42  for (size_t target_idx = 0; target_idx < target_exprs.size(); ++target_idx) {
43  const auto target_expr = target_exprs[target_idx];
44  if (dynamic_cast<const Analyzer::AggExpr*>(target_expr)) {
45  continue;
46  }
47  const auto var_expr = dynamic_cast<const Analyzer::Var*>(target_expr);
48  if (var_expr && var_expr->get_which_row() == Analyzer::Var::kGROUPBY) {
49  indices[target_idx] = var_expr->get_varno() - 1;
50  continue;
51  }
52  }
53  return indices;
54 }
55 
56 std::vector<ssize_t> target_expr_proj_indices(const RelAlgExecutionUnit& ra_exe_unit,
57  const Catalog_Namespace::Catalog& cat) {
58  if (ra_exe_unit.input_descs.size() > 1 ||
59  !ra_exe_unit.sort_info.order_entries.empty()) {
60  return {};
61  }
62  std::vector<ssize_t> target_indices(ra_exe_unit.target_exprs.size(), -1);
63  UsedColumnsVisitor columns_visitor;
64  std::unordered_set<int> used_columns;
65  for (const auto& simple_qual : ra_exe_unit.simple_quals) {
66  const auto crt_used_columns = columns_visitor.visit(simple_qual.get());
67  used_columns.insert(crt_used_columns.begin(), crt_used_columns.end());
68  }
69  for (const auto& qual : ra_exe_unit.quals) {
70  const auto crt_used_columns = columns_visitor.visit(qual.get());
71  used_columns.insert(crt_used_columns.begin(), crt_used_columns.end());
72  }
73  for (const auto& target : ra_exe_unit.target_exprs) {
74  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target);
75  if (col_var) {
76  const auto cd = get_column_descriptor_maybe(
77  col_var->get_column_id(), col_var->get_table_id(), cat);
78  if (!cd || !cd->isVirtualCol) {
79  continue;
80  }
81  }
82  const auto crt_used_columns = columns_visitor.visit(target);
83  used_columns.insert(crt_used_columns.begin(), crt_used_columns.end());
84  }
85  for (size_t target_idx = 0; target_idx < ra_exe_unit.target_exprs.size();
86  ++target_idx) {
87  const auto target_expr = ra_exe_unit.target_exprs[target_idx];
88  CHECK(target_expr);
89  const auto& ti = target_expr->get_type_info();
90  const bool is_real_str_or_array =
91  (ti.is_string() && ti.get_compression() == kENCODING_NONE) || ti.is_array();
92  if (is_real_str_or_array) {
93  continue;
94  }
95  if (ti.is_geometry()) {
96  // TODO(adb): Ideally we could determine which physical columns are required for a
97  // given query and fetch only those. For now, we bail on the memory optimization,
98  // since it is possible that adding the physical columns could have unintended
99  // consequences further down the execution path.
100  return {};
101  }
102  const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
103  if (!col_var) {
104  continue;
105  }
106  if (!is_real_str_or_array &&
107  used_columns.find(col_var->get_column_id()) == used_columns.end()) {
108  target_indices[target_idx] = 0;
109  }
110  }
111  return target_indices;
112 }
113 
114 int8_t pick_baseline_key_component_width(const ExpressionRange& range,
115  const size_t group_col_width) {
116  if (range.getType() == ExpressionRangeType::Invalid) {
117  return sizeof(int64_t);
118  }
119  switch (range.getType()) {
120  case ExpressionRangeType::Integer:
121  if (group_col_width == sizeof(int64_t) && range.hasNulls()) {
122  return sizeof(int64_t);
123  }
124  return range.getIntMax() < EMPTY_KEY_32 - 1 ? sizeof(int32_t) : sizeof(int64_t);
125  case ExpressionRangeType::Float:
126  case ExpressionRangeType::Double:
127  return sizeof(int64_t); // No compaction for floating point yet.
128  default:
129  UNREACHABLE();
130  }
131  return sizeof(int64_t);
132 }
133 
134 // TODO(miyu): make sure following setting of compact width is correct in all cases.
135 int8_t pick_baseline_key_width(const RelAlgExecutionUnit& ra_exe_unit,
136  const std::vector<InputTableInfo>& query_infos,
137  const Executor* executor) {
138  int8_t compact_width{4};
139  for (const auto groupby_expr : ra_exe_unit.groupby_exprs) {
140  const auto expr_range = getExpressionRange(groupby_expr.get(), query_infos, executor);
141  compact_width = std::max(compact_width,
142  pick_baseline_key_component_width(
143  expr_range, groupby_expr->get_type_info().get_size()));
144  }
145  return compact_width;
146 }
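The two helpers above can be exercised in isolation. Below is a minimal standalone sketch (not part of the OmniSciDB sources): IntRange, kEmptyKey32 and key_component_width are simplified stand-ins for ExpressionRange, EMPTY_KEY_32 and pick_baseline_key_component_width, and the loop mirrors pick_baseline_key_width's max-over-group-columns rule.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

// Simplified stand-in for an integer expression range.
struct IntRange {
  int64_t int_max;
  bool has_nulls;
};

// Mirrors the integer branch above: keep 8-byte keys when a 64-bit column has nulls
// or when the maximum would collide with the 32-bit empty-key sentinel.
int8_t key_component_width(const IntRange& range, const size_t group_col_width) {
  constexpr int64_t kEmptyKey32 = 0x7FFFFFFF;  // assumed stand-in for EMPTY_KEY_32
  if (group_col_width == sizeof(int64_t) && range.has_nulls) {
    return sizeof(int64_t);
  }
  return range.int_max < kEmptyKey32 - 1 ? sizeof(int32_t) : sizeof(int64_t);
}

int main() {
  // Two group-by columns: a small dictionary id and a wide timestamp-like value; the
  // widest column decides the baseline key width, starting from the 4-byte minimum.
  const std::vector<std::pair<IntRange, size_t>> group_cols = {
      {{1000, false}, sizeof(int32_t)}, {{int64_t{1} << 40, false}, sizeof(int64_t)}};
  int8_t compact_width{4};
  for (const auto& col : group_cols) {
    compact_width = std::max(compact_width, key_component_width(col.first, col.second));
  }
  std::printf("baseline key width: %d bytes\n", compact_width);  // prints 8
  return 0;
}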
147 
148 } // namespace
149 
150 std::unique_ptr<QueryMemoryDescriptor> QueryMemoryDescriptor::init(
151  const Executor* executor,
152  const RelAlgExecutionUnit& ra_exe_unit,
153  const std::vector<InputTableInfo>& query_infos,
154  const ColRangeInfo& col_range_info,
155  const KeylessInfo& keyless_info,
156  const bool allow_multifrag,
157  const ExecutorDeviceType device_type,
158  const int8_t crt_min_byte_width,
159  const bool sort_on_gpu_hint,
160  const size_t shard_count,
161  const size_t max_groups_buffer_entry_count,
162  RenderInfo* render_info,
163  const CountDistinctDescriptors count_distinct_descriptors,
164  const bool must_use_baseline_sort,
165  const bool output_columnar_hint) {
166  auto group_col_widths = get_col_byte_widths(ra_exe_unit.groupby_exprs, {});
167  const bool is_group_by{!group_col_widths.empty()};
168 
169  auto col_slot_context = ColSlotContext(ra_exe_unit.target_exprs, {});
170 
171  const auto min_slot_size = QueryMemoryDescriptor::pick_target_compact_width(
172  ra_exe_unit, query_infos, crt_min_byte_width);
173 
174  col_slot_context.setAllSlotsPaddedSize(min_slot_size);
175  col_slot_context.validate();
176 
177  if (!is_group_by) {
178  CHECK(!must_use_baseline_sort);
179 
180  return std::make_unique<QueryMemoryDescriptor>(
181  executor,
182  ra_exe_unit,
183  query_infos,
184  allow_multifrag,
185  false,
186  false,
187  -1,
188  ColRangeInfo{ra_exe_unit.estimator ? QueryDescriptionType::Estimator
189  : QueryDescriptionType::NonGroupedAggregate,
190  0,
191  0,
192  0,
193  false},
194  col_slot_context,
195  std::vector<int8_t>{},
196  /*group_col_compact_width*/ 0,
197  std::vector<ssize_t>{},
198  /*entry_count*/ 1,
199  GroupByMemSharing::Shared,
200  false,
201  count_distinct_descriptors,
202  false,
203  output_columnar_hint,
204  render_info && render_info->isPotentialInSituRender(),
205  must_use_baseline_sort);
206  }
207 
208  size_t entry_count = 1;
209  auto actual_col_range_info = col_range_info;
210  auto sharing = GroupByMemSharing::Shared;
211  bool interleaved_bins_on_gpu = false;
212  bool keyless_hash = false;
213  bool shared_mem_for_group_by = false;
214  int8_t group_col_compact_width = 0;
215  int32_t idx_target_as_key = -1;
216  auto output_columnar = output_columnar_hint;
217  std::vector<ssize_t> target_groupby_indices;
218 
219  switch (col_range_info.hash_type_) {
220  case QueryDescriptionType::GroupByPerfectHash: {
221  if (render_info) {
222  render_info->setInSituDataIfUnset(false);
223  }
224 
225  if (group_col_widths.size() > 1) {
226  // col range info max contains the expected cardinality of the output
227  entry_count = static_cast<size_t>(actual_col_range_info.max);
228  actual_col_range_info.bucket = 0;
229  } else {
230  // single column perfect hash
231  idx_target_as_key = keyless_info.target_index;
232  keyless_hash =
233  (!sort_on_gpu_hint ||
234  !QueryMemoryDescriptor::many_entries(
235  col_range_info.max, col_range_info.min, col_range_info.bucket)) &&
236  !col_range_info.bucket && !must_use_baseline_sort && keyless_info.keyless;
237  entry_count = std::max(
238  GroupByAndAggregate::getBucketedCardinality(col_range_info), int64_t(1));
239  const size_t interleaved_max_threshold{512};
240 
241  size_t gpu_smem_max_threshold{0};
242  if (device_type == ExecutorDeviceType::GPU) {
243  const auto cuda_mgr = executor->getCatalog()->getDataMgr().getCudaMgr();
244  CHECK(cuda_mgr);
245  /*
246  * We only use shared memory strategy if GPU hardware provides native shared
247  *memory atomics support. From CUDA Toolkit documentation:
248  *https://docs.nvidia.com/cuda/pascal-tuning-guide/index.html#atomic-ops "Like
249  *Maxwell, Pascal [and Volta] provides native shared memory atomic operations
250  *for 32-bit integer arithmetic, along with native 32 or 64-bit compare-and-swap
251  *(CAS)."
252  *
253  **/
254  if (cuda_mgr->isArchMaxwellOrLaterForAll()) {
255  // TODO(Saman): threshold should be eventually set as an optimized policy per
256  // architecture.
257  gpu_smem_max_threshold =
258  std::min((cuda_mgr->isArchVoltaForAll()) ? 4095LU : 2047LU,
259  (cuda_mgr->getMaxSharedMemoryForAll() / sizeof(int64_t) - 1));
260  }
261  }
262 
263  if (must_use_baseline_sort) {
264  target_groupby_indices = target_expr_group_by_indices(ra_exe_unit.groupby_exprs,
265  ra_exe_unit.target_exprs);
266  col_slot_context =
267  ColSlotContext(ra_exe_unit.target_exprs, target_groupby_indices);
268  }
269 
270  const auto group_expr = ra_exe_unit.groupby_exprs.front().get();
271  shared_mem_for_group_by =
272  g_enable_smem_group_by && keyless_hash && keyless_info.shared_mem_support &&
273  (entry_count <= gpu_smem_max_threshold) &&
274  (GroupByAndAggregate::supportedExprForGpuSharedMemUsage(group_expr)) &&
275  QueryMemoryDescriptor::countDescriptorsLogicallyEmpty(
276  count_distinct_descriptors) &&
277  !output_columnar; // TODO(Saman): add columnar support with the new smem
278  // support.
279 
280  bool has_varlen_sample_agg = false;
281  for (const auto& target_expr : ra_exe_unit.target_exprs) {
282  if (target_expr->get_contains_agg()) {
283  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
284  CHECK(agg_expr);
285  if (agg_expr->get_aggtype() == kSAMPLE &&
286  agg_expr->get_type_info().is_varlen()) {
287  has_varlen_sample_agg = true;
288  break;
289  }
290  }
291  }
292 
293  interleaved_bins_on_gpu = keyless_hash && !has_varlen_sample_agg &&
294  (entry_count <= interleaved_max_threshold) &&
295  (device_type == ExecutorDeviceType::GPU) &&
296  QueryMemoryDescriptor::countDescriptorsLogicallyEmpty(
297  count_distinct_descriptors) &&
298  !output_columnar;
299  }
300  break;
301  }
302  case QueryDescriptionType::GroupByBaselineHash: {
303  if (render_info) {
304  render_info->setInSituDataIfUnset(false);
305  }
306  entry_count = shard_count
307  ? (max_groups_buffer_entry_count + shard_count - 1) / shard_count
308  : max_groups_buffer_entry_count;
309  target_groupby_indices = target_expr_group_by_indices(ra_exe_unit.groupby_exprs,
310  ra_exe_unit.target_exprs);
311  col_slot_context = ColSlotContext(ra_exe_unit.target_exprs, target_groupby_indices);
312 
313  group_col_compact_width =
314  output_columnar ? 8
315  : pick_baseline_key_width(ra_exe_unit, query_infos, executor);
316 
317  actual_col_range_info =
318  ColRangeInfo{QueryDescriptionType::GroupByBaselineHash, 0, 0, 0, false};
319  break;
320  }
321  case QueryDescriptionType::Projection: {
322  CHECK(!must_use_baseline_sort);
323 
324  if (use_streaming_top_n(ra_exe_unit, output_columnar)) {
325  entry_count = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
326  } else {
327  if (ra_exe_unit.use_bump_allocator) {
328  output_columnar = false;
329  entry_count = 0;
330  } else {
331  entry_count = ra_exe_unit.scan_limit
332  ? static_cast<size_t>(ra_exe_unit.scan_limit)
333  : max_groups_buffer_entry_count;
334  }
335  }
336 
337  const auto catalog = executor->getCatalog();
338  CHECK(catalog);
339  target_groupby_indices = executor->plan_state_->allow_lazy_fetch_
340  ? target_expr_proj_indices(ra_exe_unit, *catalog)
341  : std::vector<ssize_t>{};
342 
343  col_slot_context = ColSlotContext(ra_exe_unit.target_exprs, target_groupby_indices);
344  break;
345  }
346  default:
347  UNREACHABLE() << "Unknown query type";
348  }
349 
350  return std::make_unique<QueryMemoryDescriptor>(
351  executor,
352  ra_exe_unit,
353  query_infos,
354  allow_multifrag,
355  keyless_hash,
356  interleaved_bins_on_gpu,
357  idx_target_as_key,
358  actual_col_range_info,
359  col_slot_context,
360  group_col_widths,
361  group_col_compact_width,
362  target_groupby_indices,
363  entry_count,
364  sharing,
365  shared_mem_for_group_by,
366  count_distinct_descriptors,
367  sort_on_gpu_hint,
368  output_columnar,
369  render_info && render_info->isPotentialInSituRender(),
370  must_use_baseline_sort);
371 }
372 
373 QueryMemoryDescriptor::QueryMemoryDescriptor(
374  const Executor* executor,
375  const RelAlgExecutionUnit& ra_exe_unit,
376  const std::vector<InputTableInfo>& query_infos,
377  const bool allow_multifrag,
378  const bool keyless_hash,
379  const bool interleaved_bins_on_gpu,
380  const int32_t idx_target_as_key,
381  const ColRangeInfo& col_range_info,
382  const ColSlotContext& col_slot_context,
383  const std::vector<int8_t>& group_col_widths,
384  const int8_t group_col_compact_width,
385  const std::vector<ssize_t>& target_groupby_indices,
386  const size_t entry_count,
387  const GroupByMemSharing sharing,
388  const bool shared_mem_for_group_by,
389  const CountDistinctDescriptors count_distinct_descriptors,
390  const bool sort_on_gpu_hint,
391  const bool output_columnar_hint,
392  const bool render_output,
393  const bool must_use_baseline_sort)
394  : executor_(executor)
395  , allow_multifrag_(allow_multifrag)
396  , query_desc_type_(col_range_info.hash_type_)
397  , keyless_hash_(keyless_hash)
398  , interleaved_bins_on_gpu_(interleaved_bins_on_gpu)
399  , idx_target_as_key_(idx_target_as_key)
400  , group_col_widths_(group_col_widths)
401  , group_col_compact_width_(group_col_compact_width)
402  , target_groupby_indices_(target_groupby_indices)
403  , entry_count_(entry_count)
404  , min_val_(col_range_info.min)
405  , max_val_(col_range_info.max)
406  , bucket_(col_range_info.bucket)
407  , has_nulls_(col_range_info.has_nulls)
408  , sharing_(sharing)
409  , count_distinct_descriptors_(count_distinct_descriptors)
410  , output_columnar_(false)
411  , render_output_(render_output)
412  , must_use_baseline_sort_(must_use_baseline_sort)
413  , is_table_function_(false)
414  , force_4byte_float_(false)
415  , col_slot_context_(col_slot_context) {
416  col_slot_context_.setAllUnsetSlotsPaddedSize(8);
417  col_slot_context_.validate();
418 
419  // TODO(Saman): should remove this after implementing shared memory path
420  // completely through codegen. We should not use the current shared memory path if
421  // more than 8 bytes per group is required
422  if (query_desc_type_ == QueryDescriptionType::GroupByPerfectHash &&
423  shared_mem_for_group_by && (getRowSize() <= sizeof(int64_t))) {
424  // TODO(adb / saman): Move this into a different enum so we can remove
425  // GroupByMemSharing
426  sharing_ = GroupByMemSharing::SharedForKeylessOneColumnKnownRange;
427  interleaved_bins_on_gpu_ = false;
428  }
429 
430  // Note that output_columnar_ currently defaults to false to avoid issues with
431  // getRowSize above. If output columnar is enabled then shared_mem_for_group_by is not,
432  // and the above condition would never be true.
433 
434  sort_on_gpu_ = sort_on_gpu_hint && canOutputColumnar() && !keyless_hash_;
435 
436  if (sort_on_gpu_) {
437  CHECK(!ra_exe_unit.use_bump_allocator);
438  output_columnar_ = true;
439  } else {
440  switch (query_desc_type_) {
441  case QueryDescriptionType::Projection:
442  output_columnar_ = output_columnar_hint;
443  break;
444  case QueryDescriptionType::GroupByPerfectHash:
445  output_columnar_ = output_columnar_hint &&
446  QueryMemoryDescriptor::countDescriptorsLogicallyEmpty(
447  count_distinct_descriptors_);
448  break;
449  case QueryDescriptionType::GroupByBaselineHash:
450  output_columnar_ = output_columnar_hint;
451  break;
452  case QueryDescriptionType::NonGroupedAggregate:
453  output_columnar_ = output_columnar_hint &&
454  QueryMemoryDescriptor::countDescriptorsLogicallyEmpty(
455  count_distinct_descriptors_);
456  break;
457  default:
458  output_columnar_ = false;
459  break;
460  }
461  }
462 
463  if (isLogicalSizedColumnsAllowed()) {
464  // TODO(adb): Ensure fixed size buffer allocations are correct with all logical column
465  // sizes
466  CHECK(!ra_exe_unit.use_bump_allocator);
467  col_slot_context_.setAllSlotsPaddedSizeToLogicalSize();
468  col_slot_context_.validate();
469  }
470 }
471 
472 QueryMemoryDescriptor::QueryMemoryDescriptor()
473  : executor_(nullptr)
474  , allow_multifrag_(false)
475  , query_desc_type_(QueryDescriptionType::Projection)
476  , keyless_hash_(false)
477  , interleaved_bins_on_gpu_(false)
478  , idx_target_as_key_(0)
479  , group_col_compact_width_(0)
480  , entry_count_(0)
481  , min_val_(0)
482  , max_val_(0)
483  , bucket_(0)
484  , has_nulls_(false)
485  , sharing_(GroupByMemSharing::Shared)
486  , sort_on_gpu_(false)
487  , output_columnar_(false)
488  , render_output_(false)
489  , must_use_baseline_sort_(false)
490  , is_table_function_(false)
491  , force_4byte_float_(false) {}
492 
493 QueryMemoryDescriptor::QueryMemoryDescriptor(const Executor* executor,
494  const size_t entry_count,
495  const QueryDescriptionType query_desc_type,
496  const bool is_table_function)
497  : executor_(executor)
498  , allow_multifrag_(false)
499  , query_desc_type_(query_desc_type)
500  , keyless_hash_(false)
501  , interleaved_bins_on_gpu_(false)
502  , idx_target_as_key_(0)
503  , group_col_compact_width_(0)
504  , entry_count_(entry_count)
505  , min_val_(0)
506  , max_val_(0)
507  , bucket_(0)
508  , has_nulls_(false)
509  , sharing_(GroupByMemSharing::Shared)
510  , sort_on_gpu_(false)
511  , output_columnar_(false)
512  , render_output_(false)
513  , must_use_baseline_sort_(false)
514  , is_table_function_(is_table_function)
515  , force_4byte_float_(false) {}
516 
517 QueryMemoryDescriptor::QueryMemoryDescriptor(const QueryDescriptionType query_desc_type,
518  const int64_t min_val,
519  const int64_t max_val,
520  const bool has_nulls,
521  const std::vector<int8_t>& group_col_widths)
522  : executor_(nullptr)
523  , allow_multifrag_(false)
524  , query_desc_type_(query_desc_type)
525  , keyless_hash_(false)
526  , interleaved_bins_on_gpu_(false)
527  , idx_target_as_key_(0)
528  , group_col_widths_(group_col_widths)
529  , group_col_compact_width_(0)
530  , entry_count_(0)
531  , min_val_(min_val)
532  , max_val_(max_val)
533  , bucket_(0)
534  , has_nulls_(false)
535  , sharing_(GroupByMemSharing::Shared)
536  , sort_on_gpu_(false)
537  , output_columnar_(false)
538  , render_output_(false)
539  , must_use_baseline_sort_(false)
540  , is_table_function_(false)
541  , force_4byte_float_(false) {}
542 
543 bool QueryMemoryDescriptor::operator==(const QueryMemoryDescriptor& other) const {
544  // Note that this method does not check ptr reference members (e.g. executor_) or
545  // entry_count_
546  if (query_desc_type_ != other.query_desc_type_) {
547  return false;
548  }
549  if (keyless_hash_ != other.keyless_hash_) {
550  return false;
551  }
552  if (interleaved_bins_on_gpu_ != other.interleaved_bins_on_gpu_) {
553  return false;
554  }
555  if (idx_target_as_key_ != other.idx_target_as_key_) {
556  return false;
557  }
558  if (force_4byte_float_ != other.force_4byte_float_) {
559  return false;
560  }
561  if (group_col_widths_ != other.group_col_widths_) {
562  return false;
563  }
564  if (group_col_compact_width_ != other.group_col_compact_width_) {
565  return false;
566  }
567  if (target_groupby_indices_ != other.target_groupby_indices_) {
568  return false;
569  }
570  if (min_val_ != other.min_val_) {
571  return false;
572  }
573  if (max_val_ != other.max_val_) {
574  return false;
575  }
576  if (bucket_ != other.bucket_) {
577  return false;
578  }
579  if (has_nulls_ != other.has_nulls_) {
580  return false;
581  }
582  if (sharing_ != other.sharing_) {
583  return false;
584  }
585  if (count_distinct_descriptors_.size() != other.count_distinct_descriptors_.size()) {
586  return false;
587  } else {
588  // Count distinct descriptors can legitimately differ in device only.
589  for (size_t i = 0; i < count_distinct_descriptors_.size(); ++i) {
590  auto ref_count_distinct_desc = other.count_distinct_descriptors_[i];
591  auto count_distinct_desc = count_distinct_descriptors_[i];
592  count_distinct_desc.device_type = ref_count_distinct_desc.device_type;
593  if (ref_count_distinct_desc != count_distinct_desc) {
594  return false;
595  }
596  }
597  }
598  if (sort_on_gpu_ != other.sort_on_gpu_) {
599  return false;
600  }
601  if (output_columnar_ != other.output_columnar_) {
602  return false;
603  }
604  if (col_slot_context_ != other.col_slot_context_) {
605  return false;
606  }
607  return true;
608 }
609 
610 std::unique_ptr<QueryExecutionContext> QueryMemoryDescriptor::getQueryExecutionContext(
611  const RelAlgExecutionUnit& ra_exe_unit,
612  const Executor* executor,
613  const ExecutorDeviceType device_type,
614  const ExecutorDispatchMode dispatch_mode,
615  const int device_id,
616  const int64_t num_rows,
617  const std::vector<std::vector<const int8_t*>>& col_buffers,
618  const std::vector<std::vector<uint64_t>>& frag_offsets,
619  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
620  const bool output_columnar,
621  const bool sort_on_gpu,
622  RenderInfo* render_info) const {
623  auto timer = DEBUG_TIMER(__func__);
624  if (frag_offsets.empty()) {
625  return nullptr;
626  }
627  return std::unique_ptr<QueryExecutionContext>(
628  new QueryExecutionContext(ra_exe_unit,
629  *this,
630  executor,
631  device_type,
632  dispatch_mode,
633  device_id,
634  num_rows,
635  col_buffers,
636  frag_offsets,
637  row_set_mem_owner,
638  output_columnar,
639  sort_on_gpu,
640  render_info));
641 }
642 
643 int8_t QueryMemoryDescriptor::pick_target_compact_width(
644  const RelAlgExecutionUnit& ra_exe_unit,
645  const std::vector<InputTableInfo>& query_infos,
646  const int8_t crt_min_byte_width) {
647  if (g_bigint_count) {
648  return sizeof(int64_t);
649  }
650  int8_t compact_width{0};
651  auto col_it = ra_exe_unit.input_col_descs.begin();
652  int unnest_array_col_id{std::numeric_limits<int>::min()};
653  for (const auto groupby_expr : ra_exe_unit.groupby_exprs) {
654  const auto uoper = dynamic_cast<Analyzer::UOper*>(groupby_expr.get());
655  if (uoper && uoper->get_optype() == kUNNEST) {
656  const auto& arg_ti = uoper->get_operand()->get_type_info();
657  CHECK(arg_ti.is_array());
658  const auto& elem_ti = arg_ti.get_elem_type();
659  if (elem_ti.is_string() && elem_ti.get_compression() == kENCODING_DICT) {
660  unnest_array_col_id = (*col_it)->getColId();
661  } else {
662  compact_width = crt_min_byte_width;
663  break;
664  }
665  }
666  ++col_it;
667  }
668  if (!compact_width &&
669  (ra_exe_unit.groupby_exprs.size() != 1 || !ra_exe_unit.groupby_exprs.front())) {
670  compact_width = crt_min_byte_width;
671  }
672  if (!compact_width) {
673  col_it = ra_exe_unit.input_col_descs.begin();
674  std::advance(col_it, ra_exe_unit.groupby_exprs.size());
675  for (const auto target : ra_exe_unit.target_exprs) {
676  const auto& ti = target->get_type_info();
677  const auto agg = dynamic_cast<const Analyzer::AggExpr*>(target);
678  if (agg && agg->get_arg()) {
679  compact_width = crt_min_byte_width;
680  break;
681  }
682 
683  if (agg) {
684  CHECK_EQ(kCOUNT, agg->get_aggtype());
685  CHECK(!agg->get_is_distinct());
686  ++col_it;
687  continue;
688  }
689 
690  if (is_int_and_no_bigger_than(ti, 4) ||
691  (ti.is_string() && ti.get_compression() == kENCODING_DICT)) {
692  ++col_it;
693  continue;
694  }
695 
696  const auto uoper = dynamic_cast<Analyzer::UOper*>(target);
697  if (uoper && uoper->get_optype() == kUNNEST &&
698  (*col_it)->getColId() == unnest_array_col_id) {
699  const auto arg_ti = uoper->get_operand()->get_type_info();
700  CHECK(arg_ti.is_array());
701  const auto& elem_ti = arg_ti.get_elem_type();
702  if (elem_ti.is_string() && elem_ti.get_compression() == kENCODING_DICT) {
703  ++col_it;
704  continue;
705  }
706  }
707 
708  compact_width = crt_min_byte_width;
709  break;
710  }
711  }
712  if (!compact_width) {
713  size_t total_tuples{0};
714  for (const auto& qi : query_infos) {
715  total_tuples += qi.info.getNumTuples();
716  }
717  return total_tuples <= static_cast<size_t>(std::numeric_limits<uint32_t>::max()) ||
718  unnest_array_col_id != std::numeric_limits<int>::min()
719  ? 4
720  : crt_min_byte_width;
721  } else {
722  // TODO(miyu): relax this condition to allow more cases just w/o padding
723  for (auto wid : get_col_byte_widths(ra_exe_unit.target_exprs, {})) {
724  compact_width = std::max(compact_width, wid);
725  }
726  return compact_width;
727  }
728 }
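A minimal standalone sketch (not part of the OmniSciDB sources) of the final fallback in pick_target_compact_width above: when no target expression forces a wider slot, 4-byte slots are kept only while the combined input tuple count still fits in 32 bits; fallback_width and tuples_per_table are illustrative names.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

int8_t fallback_width(const std::vector<size_t>& tuples_per_table,
                      const int8_t crt_min_byte_width) {
  size_t total_tuples = 0;
  for (const auto n : tuples_per_table) {
    total_tuples += n;  // mirrors summing getNumTuples() over query_infos
  }
  return total_tuples <= static_cast<size_t>(std::numeric_limits<uint32_t>::max())
             ? 4
             : crt_min_byte_width;
}

int main() {
  std::printf("%d\n", fallback_width({1'000'000, 2'000'000}, 8));  // 4
  std::printf("%d\n", fallback_width({size_t{5} << 30}, 8));       // 8, exceeds 2^32 - 1
  return 0;
}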
729 
730 size_t QueryMemoryDescriptor::getColsSize() const {
731  return col_slot_context_.getAllSlotsAlignedPaddedSize();
732 }
733 
734 size_t QueryMemoryDescriptor::getRowSize() const {
735  CHECK(!output_columnar_);
736  size_t total_bytes{0};
737  if (keyless_hash_) {
738  CHECK_EQ(size_t(1), group_col_widths_.size());
739  } else {
740  total_bytes += group_col_widths_.size() * getEffectiveKeyWidth();
741  total_bytes = align_to_int64(total_bytes);
742  }
743  total_bytes += getColsSize();
744  return align_to_int64(total_bytes);
745 }
746 
747 size_t QueryMemoryDescriptor::getWarpCount() const {
748  return (interleaved_bins_on_gpu_ ? executor_->warpSize() : 1);
749 }
750 
751 size_t QueryMemoryDescriptor::getCompactByteWidth() const {
752  return col_slot_context_.getCompactByteWidth();
753 }
754 
760 size_t QueryMemoryDescriptor::getTotalBytesOfColumnarBuffers() const {
761  CHECK(output_columnar_);
762  return col_slot_context_.getTotalBytesOfColumnarBuffers(entry_count_);
763 }
764 
769 size_t QueryMemoryDescriptor::getTotalBytesOfColumnarBuffers(
770  const size_t num_entries_per_column) const {
771  return col_slot_context_.getTotalBytesOfColumnarBuffers(num_entries_per_column);
772 }
773 
783 size_t QueryMemoryDescriptor::getTotalBytesOfColumnarProjections(
784  const size_t projection_count) const {
785  constexpr size_t row_index_width = sizeof(int64_t);
786  return getTotalBytesOfColumnarBuffers(projection_count) +
787  row_index_width * projection_count;
788 }
789 
790 size_t QueryMemoryDescriptor::getColOnlyOffInBytes(const size_t col_idx) const {
791  return col_slot_context_.getColOnlyOffInBytes(col_idx);
792 }
793 
794 /*
795  * Returns the memory offset in bytes for a specific agg column in the output
796  * memory buffer. Depending on the query type, there may be some extra portion
797  * of memory prepended at the beginning of the buffer. A brief description of
798  * the memory layout is as follows:
799  * 1. projections: index column (64bit) + all target columns
800  * 2. group by: all group columns (64-bit each) + all agg columns
801  * 2a. if keyless, there is no prepending group column stored at the beginning
802  */
803 size_t QueryMemoryDescriptor::getColOffInBytes(const size_t col_idx) const {
804  const auto warp_count = getWarpCount();
805  if (output_columnar_) {
806  CHECK_EQ(size_t(1), warp_count);
807  size_t offset{0};
808  if (!keyless_hash_) {
809  offset += getPrependedGroupBufferSizeInBytes();
810  }
811  for (size_t index = 0; index < col_idx; ++index) {
812  offset += align_to_int64(getPaddedSlotWidthBytes(index) * entry_count_);
813  }
814  return offset;
815  }
816 
817  size_t offset{0};
818  if (keyless_hash_) {
819  CHECK_EQ(size_t(1), group_col_widths_.size());
820  } else {
821  offset += group_col_widths_.size() * getEffectiveKeyWidth();
822  offset = align_to_int64(offset);
823  }
824  offset += getColOnlyOffInBytes(col_idx);
825  return offset;
826 }
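A minimal standalone sketch (not part of the OmniSciDB sources) of the row-wise arithmetic described in the layout comment above: the group-key prefix is rounded up to an int64_t boundary before the aggregate slots start; the widths used here are made-up inputs.

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Round a byte count up to the next multiple of sizeof(int64_t).
size_t align_to_int64_bytes(const size_t n) {
  return (n + 7) & ~size_t{7};
}

int main() {
  const size_t group_col_count = 2;
  const size_t effective_key_width = 4;                   // compacted 4-byte keys
  const std::vector<size_t> padded_slot_widths = {8, 4};  // two aggregate slots
  // Offset of aggregate slot 1 within one row: aligned key prefix + preceding slots.
  size_t offset = align_to_int64_bytes(group_col_count * effective_key_width);
  offset += padded_slot_widths[0];
  std::printf("offset of slot 1: %zu bytes\n", offset);  // 8 + 8 = 16
  return 0;
}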
827 
828 /*
829  * Returns the memory offset for a particular group column in the prepended group
830  * columns portion of the memory.
831  */
832 size_t QueryMemoryDescriptor::getPrependedGroupColOffInBytes(
833  const size_t group_idx) const {
834  CHECK(output_columnar_);
835  CHECK(group_idx < getGroupbyColCount());
836  size_t offset{0};
837  for (size_t col_idx = 0; col_idx < group_idx; col_idx++) {
838  // TODO(Saman): relax that int64_bit part immediately
839  offset += align_to_int64(
840  std::max(groupColWidth(col_idx), static_cast<int8_t>(sizeof(int64_t))) *
841  getEntryCount());
842  }
843  return offset;
844 }
845 
846 /*
847  * Returns total amount of memory prepended at the beginning of the output memory
848  * buffer.
849  */
850 size_t QueryMemoryDescriptor::getPrependedGroupBufferSizeInBytes() const {
851  CHECK(output_columnar_);
852  size_t buffer_size{0};
853  for (size_t group_idx = 0; group_idx < getGroupbyColCount(); group_idx++) {
854  buffer_size += align_to_int64(
855  std::max(groupColWidth(group_idx), static_cast<int8_t>(sizeof(int64_t))) *
856  getEntryCount());
857  }
858  return buffer_size;
859 }
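A minimal standalone sketch (not part of the OmniSciDB sources) of the columnar prepended-group-buffer sizing above: each group column currently occupies max(column width, 8) bytes per entry and each column block is rounded up to an int64_t boundary; the entry count and widths are made-up inputs.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

size_t align_to_int64_bytes(const size_t n) {
  return (n + 7) & ~size_t{7};
}

int main() {
  const size_t entry_count = 1000;
  const std::vector<int8_t> group_col_widths = {4, 8};
  size_t buffer_size = 0;
  for (const auto w : group_col_widths) {
    buffer_size += align_to_int64_bytes(
        std::max(static_cast<size_t>(w), sizeof(int64_t)) * entry_count);
  }
  std::printf("prepended group buffer: %zu bytes\n", buffer_size);  // 2 * 8000 = 16000
  return 0;
}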
860 
861 size_t QueryMemoryDescriptor::getColOffInBytesInNextBin(const size_t col_idx) const {
862  auto warp_count = getWarpCount();
863  if (output_columnar_) {
864  CHECK_EQ(size_t(1), group_col_widths_.size());
865  CHECK_EQ(size_t(1), warp_count);
866  return getPaddedSlotWidthBytes(col_idx);
867  }
868 
869  return warp_count * getRowSize();
870 }
871 
872 size_t QueryMemoryDescriptor::getNextColOffInBytes(const int8_t* col_ptr,
873  const size_t bin,
874  const size_t col_idx) const {
875  CHECK(!output_columnar_ || bin < entry_count_);
876  size_t offset{0};
877  auto warp_count = getWarpCount();
878  const auto chosen_bytes = getPaddedSlotWidthBytes(col_idx);
879  const auto total_slot_count = getSlotCount();
880  if (col_idx + 1 == total_slot_count) {
881  if (output_columnar_) {
882  return (entry_count_ - bin) * chosen_bytes;
883  } else {
884  return static_cast<size_t>(align_to_int64(col_ptr + chosen_bytes) - col_ptr);
885  }
886  }
887 
888  const auto next_chosen_bytes = getPaddedSlotWidthBytes(col_idx + 1);
889  if (output_columnar_) {
890  CHECK_EQ(size_t(1), group_col_widths_.size());
891  CHECK_EQ(size_t(1), warp_count);
892 
893  offset = align_to_int64(entry_count_ * chosen_bytes);
894 
895  offset += bin * (next_chosen_bytes - chosen_bytes);
896  return offset;
897  }
898 
899  if (next_chosen_bytes == sizeof(int64_t)) {
900  return static_cast<size_t>(align_to_int64(col_ptr + chosen_bytes) - col_ptr);
901  } else {
902  return chosen_bytes;
903  }
904 }
905 
906 size_t QueryMemoryDescriptor::getBufferSizeBytes(
907  const RelAlgExecutionUnit& ra_exe_unit,
908  const unsigned thread_count,
909  const ExecutorDeviceType device_type) const {
910  if (use_streaming_top_n(ra_exe_unit, output_columnar_)) {
911  const size_t n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
912  return streaming_top_n::get_heap_size(getRowSize(), n, thread_count);
913  }
914  return getBufferSizeBytes(device_type, entry_count_);
915 }
916 
926 // Returns the total output-buffer size in bytes for the given device and entry count,
927 // covering both the row-wise and the columnar layouts.
928 size_t QueryMemoryDescriptor::getBufferSizeBytes(const ExecutorDeviceType device_type,
929  const size_t entry_count) const {
930  if (keyless_hash_ && !output_columnar_) {
931  CHECK_GE(group_col_widths_.size(), size_t(1));
932  auto row_bytes = align_to_int64(getColsSize());
933 
934  return (interleavedBins(device_type) ? executor_->warpSize() : 1) * entry_count *
935  row_bytes;
936  }
937 
938  constexpr size_t row_index_width = sizeof(int64_t);
939  size_t total_bytes{0};
940  if (output_columnar_) {
941  total_bytes = (query_desc_type_ == QueryDescriptionType::Projection
942  ? row_index_width * entry_count
943  : sizeof(int64_t) * group_col_widths_.size() * entry_count) +
944  getTotalBytesOfColumnarBuffers(entry_count);
945  } else {
946  total_bytes = getRowSize() * entry_count;
947  }
948 
949  return total_bytes;
950 }
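A minimal standalone sketch (not part of the OmniSciDB sources) of the row-wise branch of getBufferSizeBytes above: the buffer is the padded row size times the entry count, scaled by the warp count when bins are interleaved on GPU; the numbers are made-up inputs.

#include <cstddef>
#include <cstdio>

int main() {
  const size_t row_size = 24;          // aligned group-key prefix + aggregate slots
  const size_t entry_count = 1 << 16;
  const size_t warp_count = 1;         // > 1 only for interleaved bins on GPU
  std::printf("output buffer: %zu bytes\n", warp_count * entry_count * row_size);  // 1572864
  return 0;
}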
951 
952 size_t QueryMemoryDescriptor::getBufferSizeBytes(
953  const ExecutorDeviceType device_type) const {
954  return getBufferSizeBytes(device_type, entry_count_);
955 }
956 
957 void QueryMemoryDescriptor::setOutputColumnar(const bool val) {
958  output_columnar_ = val;
959  if (isLogicalSizedColumnsAllowed()) {
960  col_slot_context_.setAllSlotsPaddedSizeToLogicalSize();
961  }
962 }
963 
964 /*
965  * Indicates the query types that are currently allowed to use the logical
966  * sized columns instead of padded sized ones.
967  */
968 bool QueryMemoryDescriptor::isLogicalSizedColumnsAllowed() const {
969  // In distributed mode, result sets are serialized using rowwise iterators, so we use
970  // consistent slot widths for now
971  return output_columnar_ && !g_cluster &&
972  (query_desc_type_ == QueryDescriptionType::Projection);
973 }
974 
975 size_t QueryMemoryDescriptor::getBufferColSlotCount() const {
976  size_t total_slot_count = col_slot_context_.getSlotCount();
977 
978  if (target_groupby_indices_.empty()) {
979  return total_slot_count;
980  }
981  return total_slot_count - std::count_if(target_groupby_indices_.begin(),
982  target_groupby_indices_.end(),
983  [](const ssize_t i) { return i >= 0; });
984 }
985 
986 bool QueryMemoryDescriptor::usesGetGroupValueFast() const {
987  return (query_desc_type_ == QueryDescriptionType::GroupByPerfectHash &&
988  getGroupbyColCount() == 1);
989 }
990 
991 bool QueryMemoryDescriptor::threadsShareMemory() const {
992  return query_desc_type_ != QueryDescriptionType::NonGroupedAggregate;
993 }
994 
995 bool QueryMemoryDescriptor::blocksShareMemory() const {
996  if (g_cluster || is_table_function_) {
997  return true;
998  }
999  if (!countDescriptorsLogicallyEmpty(count_distinct_descriptors_)) {
1000  return true;
1001  }
1002  if (executor_->isCPUOnly() || render_output_ ||
1003  query_desc_type_ == QueryDescriptionType::GroupByBaselineHash ||
1004  query_desc_type_ == QueryDescriptionType::Projection ||
1005  (query_desc_type_ == QueryDescriptionType::GroupByPerfectHash &&
1006  getGroupbyColCount() > 1)) {
1007  return true;
1008  }
1012 }
1013 
1014 bool QueryMemoryDescriptor::lazyInitGroups(const ExecutorDeviceType device_type) const {
1015  return device_type == ExecutorDeviceType::GPU && !render_output_ &&
1016  countDescriptorsLogicallyEmpty(count_distinct_descriptors_);
1017 }
1018 
1019 bool QueryMemoryDescriptor::interleavedBins(const ExecutorDeviceType device_type) const {
1020  return interleaved_bins_on_gpu_ && device_type == ExecutorDeviceType::GPU;
1021 }
1022 
1023 size_t QueryMemoryDescriptor::sharedMemBytes(const ExecutorDeviceType device_type) const {
1024  CHECK(device_type == ExecutorDeviceType::CPU || device_type == ExecutorDeviceType::GPU);
1025  if (device_type == ExecutorDeviceType::CPU) {
1026  return 0;
1027  }
1028  // if performing keyless aggregate query with a single column group-by:
1029  if (sharing_ == GroupByMemSharing::SharedForKeylessOneColumnKnownRange) {
1030  CHECK_EQ(getRowSize(),
1031  sizeof(int64_t)); // Currently just designed for this scenario
1032  size_t shared_mem_size =
1033  (/*bin_count=*/entry_count_ + 1) * sizeof(int64_t); // one extra for NULL values
1034  CHECK(shared_mem_size <=
1035  executor_->getCatalog()->getDataMgr().getCudaMgr()->getMaxSharedMemoryForAll());
1036  return shared_mem_size;
1037  }
1038  return 0;
1039 }
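A minimal standalone sketch (not part of the OmniSciDB sources) of the shared-memory sizing above for the keyless, single-column perfect-hash case: one 8-byte bin per entry plus one extra bin reserved for NULL keys; the entry count is a made-up input.

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  const size_t entry_count = 2047;  // must fit the per-SM shared memory budget
  const size_t shared_mem_bytes = (entry_count + 1) * sizeof(int64_t);
  std::printf("%zu bytes of GPU shared memory\n", shared_mem_bytes);  // 16384
  return 0;
}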
1040 
1041 bool QueryMemoryDescriptor::isWarpSyncRequired(
1042  const ExecutorDeviceType device_type) const {
1043  if (device_type != ExecutorDeviceType::GPU) {
1044  return false;
1045  } else {
1046  auto cuda_mgr = executor_->getCatalog()->getDataMgr().getCudaMgr();
1047  CHECK(cuda_mgr);
1048  return cuda_mgr->isArchVoltaForAll();
1049  }
1050 }
1051 
1052 size_t QueryMemoryDescriptor::getColCount() const {
1053  return col_slot_context_.getColCount();
1054 }
1055 
1056 size_t QueryMemoryDescriptor::getSlotCount() const {
1057  return col_slot_context_.getSlotCount();
1058 }
1059 
1060 const int8_t QueryMemoryDescriptor::getPaddedSlotWidthBytes(const size_t slot_idx) const {
1061  return col_slot_context_.getSlotInfo(slot_idx).padded_size;
1062 }
1063 
1064 const int8_t QueryMemoryDescriptor::getLogicalSlotWidthBytes(
1065  const size_t slot_idx) const {
1066  return col_slot_context_.getSlotInfo(slot_idx).logical_size;
1067 }
1068 
1069 const int8_t QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(
1070  const size_t col_idx) const {
1071  const auto& col_slots = col_slot_context_.getSlotsForCol(col_idx);
1072  CHECK_EQ(col_slots.size(), size_t(1));
1073  return col_slots.front();
1074 }
1075 
1076 void QueryMemoryDescriptor::useConsistentSlotWidthSize(const int8_t slot_width_size) {
1077  col_slot_context_.setAllSlotsSize(slot_width_size);
1078 }
1079 
1080 size_t QueryMemoryDescriptor::getRowWidth() const {
1081  // Note: Actual row size may include padding (see ResultSetBufferAccessors.h)
1082  return col_slot_context_.getAllSlotsPaddedSize();
1083 }
1084 
1085 int8_t QueryMemoryDescriptor::updateActualMinByteWidth(
1086  const int8_t actual_min_byte_width) const {
1087  return col_slot_context_.getMinPaddedByteSize(actual_min_byte_width);
1088 }
1089 
1090 void QueryMemoryDescriptor::addColSlotInfo(
1091  const std::vector<std::tuple<int8_t, int8_t>>& slots_for_col) {
1092  col_slot_context_.addColumn(slots_for_col);
1093 }
1094 
1097 }
1098 
1101 }
1102 
1107 }
1108 
1109 namespace {
1110 
1111 inline std::string boolToString(const bool val) {
1112  return val ? "True" : "False";
1113 }
1114 
1115 inline std::string queryDescTypeToString(const QueryDescriptionType val) {
1116  switch (val) {
1117  case QueryDescriptionType::GroupByPerfectHash:
1118  return "Perfect Hash";
1119  case QueryDescriptionType::GroupByBaselineHash:
1120  return "Baseline Hash";
1121  case QueryDescriptionType::Projection:
1122  return "Projection";
1123  case QueryDescriptionType::NonGroupedAggregate:
1124  return "Non-grouped Aggregate";
1125  case QueryDescriptionType::Estimator:
1126  return "Estimator";
1127  default:
1128  UNREACHABLE();
1129  }
1130  return "";
1131 }
1132 
1133 } // namespace
1134 
1135 std::string QueryMemoryDescriptor::toString() const {
1136  auto str = reductionKey();
1137  str += "\tAllow Multifrag: " + boolToString(allow_multifrag_) + "\n";
1138  str += "\tInterleaved Bins on GPU: " + boolToString(interleaved_bins_on_gpu_) + "\n";
1139  str += "\tBlocks Share Memory: " + boolToString(blocksShareMemory()) + "\n";
1140  str += "\tThreads Share Memory: " + boolToString(threadsShareMemory()) + "\n";
1141  str += "\tUses Fast Group Values: " + boolToString(usesGetGroupValueFast()) + "\n";
1142  str += "\tLazy Init Groups (GPU): " +
1143  boolToString(lazyInitGroups(ExecutorDeviceType::GPU)) + "\n";
1144  str += "\tEntry Count: " + std::to_string(entry_count_) + "\n";
1145  str += "\tMin Val (perfect hash only): " + std::to_string(min_val_) + "\n";
1146  str += "\tMax Val (perfect hash only): " + std::to_string(max_val_) + "\n";
1147  str += "\tBucket Val (perfect hash only): " + std::to_string(bucket_) + "\n";
1148  str += "\tSort on GPU: " + boolToString(sort_on_gpu_) + "\n";
1149  str += "\tOutput Columnar: " + boolToString(output_columnar_) + "\n";
1150  str += "\tRender Output: " + boolToString(render_output_) + "\n";
1151  str += "\tUse Baseline Sort: " + boolToString(must_use_baseline_sort_) + "\n";
1152  return str;
1153 }
1154 
1155 std::string QueryMemoryDescriptor::reductionKey() const {
1156  std::string str;
1157  str += "Query Memory Descriptor State\n";
1158  str += "\tQuery Type: " + queryDescTypeToString(query_desc_type_) + "\n";
1159  str +=
1160  "\tKeyless Hash: " + boolToString(keyless_hash_) +
1161  (keyless_hash_ ? ", target index for key: " + std::to_string(getTargetIdxForKey())
1162  : "") +
1163  "\n";
1164  str += "\tEffective key width: " + std::to_string(getEffectiveKeyWidth()) + "\n";
1165  str += "\tNumber of group columns: " + std::to_string(getGroupbyColCount()) + "\n";
1166  const auto group_indices_size = targetGroupbyIndicesSize();
1167  if (group_indices_size) {
1168  std::vector<std::string> group_indices_strings;
1169  for (size_t target_idx = 0; target_idx < group_indices_size; ++target_idx) {
1170  group_indices_strings.push_back(std::to_string(getTargetGroupbyIndex(target_idx)));
1171  }
1172  str += "\tTarget group by indices: " +
1173  boost::algorithm::join(group_indices_strings, ",");
1174  }
1175  str += "\t" + col_slot_context_.toString();
1176  return str;
1177 }
1178 
1179 std::vector<TargetInfo> target_exprs_to_infos(
1180  const std::vector<Analyzer::Expr*>& targets,
1181  const QueryMemoryDescriptor& query_mem_desc) {
1182  std::vector<TargetInfo> target_infos;
1183  for (const auto target_expr : targets) {
1184  auto target = get_target_info(target_expr, g_bigint_count);
1185  if (query_mem_desc.getQueryDescriptionType() ==
1186  QueryDescriptionType::NonGroupedAggregate) {
1187  set_notnull(target, false);
1188  target.sql_type.set_notnull(false);
1189  }
1190  target_infos.push_back(target);
1191  }
1192  return target_infos;
1193 }