OmniSciDB  c1a53651b2
QueryMemoryInitializer.cpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "QueryMemoryInitializer.h"
18 #include "Execute.h"
19 #include "GpuInitGroups.h"
20 #include "Logger/Logger.h"
23 #include "Shared/checked_alloc.h"
24 #include "StreamingTopN.h"
25 #include "Utils/FlatBuffer.h"
26 
27 // 8 GB, the limit of perfect hash group by under normal conditions
28 int64_t g_bitmap_memory_limit{8LL * 1000 * 1000 * 1000};
29 
30 namespace {
31 
32 void check_total_bitmap_memory(const QueryMemoryDescriptor& query_mem_desc) {
33  const int32_t groups_buffer_entry_count = query_mem_desc.getEntryCount();
34  checked_int64_t total_bytes_per_group = 0;
35  const size_t num_count_distinct_descs =
36  query_mem_desc.getCountDistinctDescriptorsSize();
37  for (size_t i = 0; i < num_count_distinct_descs; i++) {
38  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
39  if (count_distinct_desc.impl_type_ != CountDistinctImplType::Bitmap) {
40  continue;
41  }
42  total_bytes_per_group += count_distinct_desc.bitmapPaddedSizeBytes();
43  }
44  int64_t total_bytes{0};
45  // Using OutOfHostMemory until we can verify that SlabTooBig would also be properly
46  // caught
47  try {
48  total_bytes = static_cast<int64_t>(total_bytes_per_group * groups_buffer_entry_count);
49  } catch (...) {
50  // Absurd amount of memory, merely computing the number of bits overflows int64_t.
51  // Don't bother to report the real amount, this is unlikely to ever happen.
52  throw OutOfHostMemory(std::numeric_limits<int64_t>::max() / 8);
53  }
54  if (total_bytes >= g_bitmap_memory_limit) {
55  throw OutOfHostMemory(total_bytes);
56  }
57 }
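// Illustrative arithmetic for the check above: with 1,000,000 group-by entries and
// two COUNT DISTINCT bitmaps of 4 KB (padded) each, total_bytes is
// 1,000,000 * 8,192 B ~= 8.2 GB, which exceeds g_bitmap_memory_limit (8 GB) and
// throws OutOfHostMemory before any buffer is actually allocated.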
58 
59 int64_t* alloc_group_by_buffer(const size_t numBytes,
60  RenderAllocatorMap* render_allocator_map,
61  const size_t thread_idx,
62  RowSetMemoryOwner* mem_owner) {
63  if (render_allocator_map) {
64  // NOTE(adb): If we got here, we are performing an in-situ rendering query and are not
65  // using CUDA buffers. Therefore we need to allocate result set storage using CPU
66  // memory.
67  const auto gpu_idx = 0; // Only 1 GPU supported in CUDA-disabled rendering mode
68  auto render_allocator_ptr = render_allocator_map->getRenderAllocator(gpu_idx);
69  return reinterpret_cast<int64_t*>(render_allocator_ptr->alloc(numBytes));
70  } else {
71  return reinterpret_cast<int64_t*>(mem_owner->allocate(numBytes, thread_idx));
72  }
73 }
74 
75 inline int64_t get_consistent_frag_size(const std::vector<uint64_t>& frag_offsets) {
76  if (frag_offsets.size() < 2) {
77  return int64_t(-1);
78  }
79  const auto frag_size = frag_offsets[1] - frag_offsets[0];
80  for (size_t i = 2; i < frag_offsets.size(); ++i) {
81  const auto curr_size = frag_offsets[i] - frag_offsets[i - 1];
82  if (curr_size != frag_size) {
83  return int64_t(-1);
84  }
85  }
86  return !frag_size ? std::numeric_limits<int64_t>::max()
87  : static_cast<int64_t>(frag_size);
88 }
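// Examples: frag_offsets {0, 100, 200, 300} -> 100 (consistent fragment size);
// {0, 100, 250} -> -1 (inconsistent); fewer than two offsets -> -1; and a
// consistent size of 0 is reported as std::numeric_limits<int64_t>::max().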
89 
90 inline std::vector<int64_t> get_consistent_frags_sizes(
91  const std::vector<std::vector<uint64_t>>& frag_offsets) {
92  if (frag_offsets.empty()) {
93  return {};
94  }
95  std::vector<int64_t> frag_sizes;
96  for (size_t tab_idx = 0; tab_idx < frag_offsets[0].size(); ++tab_idx) {
97  std::vector<uint64_t> tab_offs;
98  for (auto& offsets : frag_offsets) {
99  tab_offs.push_back(offsets[tab_idx]);
100  }
101  frag_sizes.push_back(get_consistent_frag_size(tab_offs));
102  }
103  return frag_sizes;
104 }
105 
106 inline std::vector<int64_t> get_consistent_frags_sizes(
107  const std::vector<Analyzer::Expr*>& target_exprs,
108  const std::vector<int64_t>& table_frag_sizes) {
109  std::vector<int64_t> col_frag_sizes;
110  for (auto expr : target_exprs) {
111  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
112  if (col_var->get_rte_idx() < 0) {
113  CHECK_EQ(-1, col_var->get_rte_idx());
114  col_frag_sizes.push_back(int64_t(-1));
115  } else {
116  col_frag_sizes.push_back(table_frag_sizes[col_var->get_rte_idx()]);
117  }
118  } else {
119  col_frag_sizes.push_back(int64_t(-1));
120  }
121  }
122  return col_frag_sizes;
123 }
124 
125 inline std::vector<std::vector<int64_t>> get_col_frag_offsets(
126  const std::vector<Analyzer::Expr*>& target_exprs,
127  const std::vector<std::vector<uint64_t>>& table_frag_offsets) {
128  std::vector<std::vector<int64_t>> col_frag_offsets;
129  for (auto& table_offsets : table_frag_offsets) {
130  std::vector<int64_t> col_offsets;
131  for (auto expr : target_exprs) {
132  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
133  if (col_var->get_rte_idx() < 0) {
134  CHECK_EQ(-1, col_var->get_rte_idx());
135  col_offsets.push_back(int64_t(-1));
136  } else {
137  CHECK_LT(static_cast<size_t>(col_var->get_rte_idx()), table_offsets.size());
138  col_offsets.push_back(
139  static_cast<int64_t>(table_offsets[col_var->get_rte_idx()]));
140  }
141  } else {
142  col_offsets.push_back(int64_t(-1));
143  }
144  }
145  col_frag_offsets.push_back(col_offsets);
146  }
147  return col_frag_offsets;
148 }
149 
150 // Return the RelAlg input index of outer_table_id based on ra_exe_unit.input_descs.
151 // Used by UNION queries to get the target_exprs corresponding to the current subquery.
152 int get_input_idx(RelAlgExecutionUnit const& ra_exe_unit,
153  const shared::TableKey& outer_table_key) {
154  auto match_table_key = [=](auto& desc) {
155  return outer_table_key == desc.getTableKey();
156  };
157  auto& input_descs = ra_exe_unit.input_descs;
158  auto itr = std::find_if(input_descs.begin(), input_descs.end(), match_table_key);
159  return itr == input_descs.end() ? 0 : itr->getNestLevel();
160 }
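// Example: for a UNION ALL over two tables, input_descs holds one descriptor per
// input; if outer_table_key matches the second descriptor, its nest level (1) is
// returned and the constructor below switches to target_exprs_union, while an
// unmatched key falls back to input index 0.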
161 
162 } // namespace
163 
164 // Row-based execution constructor
165 QueryMemoryInitializer::QueryMemoryInitializer(
166  const RelAlgExecutionUnit& ra_exe_unit,
167  const QueryMemoryDescriptor& query_mem_desc,
168  const int device_id,
169  const ExecutorDeviceType device_type,
170  const ExecutorDispatchMode dispatch_mode,
171  const bool output_columnar,
172  const bool sort_on_gpu,
173  const shared::TableKey& outer_table_key,
174  const int64_t num_rows,
175  const std::vector<std::vector<const int8_t*>>& col_buffers,
176  const std::vector<std::vector<uint64_t>>& frag_offsets,
177  RenderAllocatorMap* render_allocator_map,
178  RenderInfo* render_info,
179  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
180  DeviceAllocator* device_allocator,
181  const size_t thread_idx,
182  const Executor* executor)
183  : num_rows_(num_rows)
184  , row_set_mem_owner_(row_set_mem_owner)
185  , init_agg_vals_(executor->plan_state_->init_agg_vals_)
186  , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
187  , varlen_output_buffer_(0)
188  , varlen_output_buffer_host_ptr_(nullptr)
189  , count_distinct_bitmap_mem_(0)
190  , count_distinct_bitmap_mem_bytes_(0)
191  , count_distinct_bitmap_crt_ptr_(nullptr)
192  , count_distinct_bitmap_host_mem_(nullptr)
193  , device_allocator_(device_allocator)
194  , thread_idx_(thread_idx) {
195  CHECK(!sort_on_gpu || output_columnar);
196 
197  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
198  if (consistent_frag_sizes.empty()) {
199  // No fragments in the input, no underlying buffers will be needed.
200  return;
201  }
202  if (!ra_exe_unit.use_bump_allocator) {
203  check_total_bitmap_memory(query_mem_desc);
204  }
205  if (device_type == ExecutorDeviceType::GPU) {
206  allocateCountDistinctGpuMem(query_mem_desc);
207  }
208 
209  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
210  allocateCountDistinctBuffers(query_mem_desc, false, executor);
211  allocateModes(query_mem_desc, false, executor);
212  allocateTDigests(query_mem_desc, false, executor);
213  if (render_info && render_info->useCudaBuffers()) {
214  return;
215  }
216  }
217 
218  if (ra_exe_unit.estimator) {
219  return;
220  }
221 
222  const auto thread_count = device_type == ExecutorDeviceType::GPU
223  ? executor->blockSize() * executor->gridSize()
224  : 1;
225 
226  size_t group_buffer_size{0};
227  if (ra_exe_unit.use_bump_allocator) {
228  // For kernel per fragment execution, just allocate a buffer equivalent to the size of
229  // the fragment
230  if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
231  group_buffer_size = num_rows * query_mem_desc.getRowSize();
232  } else {
233  // otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size
234  group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
235  }
236  } else {
237  group_buffer_size =
238  query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
239  }
240  CHECK_GE(group_buffer_size, size_t(0));
241 
242  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
243  int64_t* group_by_buffer_template{nullptr};
244  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
245  group_by_buffer_template = reinterpret_cast<int64_t*>(
246  row_set_mem_owner_->allocate(group_buffer_size, thread_idx_));
247  initGroupByBuffer(group_by_buffer_template,
248  ra_exe_unit,
249  query_mem_desc,
250  device_type,
251  output_columnar,
252  executor);
253  }
254 
255  if (query_mem_desc.interleavedBins(device_type)) {
256  CHECK(query_mem_desc.hasKeylessHash());
257  }
258 
259  const auto step = device_type == ExecutorDeviceType::GPU &&
260  query_mem_desc.threadsShareMemory() &&
261  query_mem_desc.isGroupBy()
262  ? executor->blockSize()
263  : size_t(1);
264  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
265  query_mem_desc.hasKeylessHash()
266  ? query_mem_desc.getEntryCount()
267  : size_t(0);
268  const auto actual_group_buffer_size =
269  group_buffer_size + index_buffer_qw * sizeof(int64_t);
270  CHECK_GE(actual_group_buffer_size, group_buffer_size);
271 
272  if (query_mem_desc.hasVarlenOutput()) {
273  const auto varlen_buffer_elem_size_opt = query_mem_desc.varlenOutputBufferElemSize();
274  CHECK(varlen_buffer_elem_size_opt); // TODO(adb): relax
275  auto varlen_output_buffer = reinterpret_cast<int64_t*>(row_set_mem_owner_->allocate(
276  query_mem_desc.getEntryCount() * varlen_buffer_elem_size_opt.value()));
277  num_buffers_ += 1;
278  group_by_buffers_.push_back(varlen_output_buffer);
279  }
280 
281  for (size_t i = 0; i < group_buffers_count; i += step) {
282  auto group_by_buffer = alloc_group_by_buffer(actual_group_buffer_size,
283  render_allocator_map,
284  thread_idx_,
285  row_set_mem_owner_.get());
286  if (!query_mem_desc.lazyInitGroups(device_type)) {
287  if (group_by_buffer_template) {
288  memcpy(group_by_buffer + index_buffer_qw,
289  group_by_buffer_template,
290  group_buffer_size);
291  } else {
292  initGroupByBuffer(group_by_buffer + index_buffer_qw,
293  ra_exe_unit,
294  query_mem_desc,
295  device_type,
296  output_columnar,
297  executor);
298  }
299  }
300  group_by_buffers_.push_back(group_by_buffer);
301  for (size_t j = 1; j < step; ++j) {
302  group_by_buffers_.push_back(nullptr);
303  }
304  const bool use_target_exprs_union =
305  ra_exe_unit.union_all && get_input_idx(ra_exe_unit, outer_table_key);
306  const auto& target_exprs = use_target_exprs_union ? ra_exe_unit.target_exprs_union
307  : ra_exe_unit.target_exprs;
308  const auto column_frag_offsets = get_col_frag_offsets(target_exprs, frag_offsets);
309  const auto column_frag_sizes =
310  get_consistent_frags_sizes(target_exprs, consistent_frag_sizes);
311 
312  result_sets_.emplace_back(
313  new ResultSet(target_exprs_to_infos(target_exprs, query_mem_desc),
314  executor->getColLazyFetchInfo(target_exprs),
315  col_buffers,
316  column_frag_offsets,
317  column_frag_sizes,
318  device_type,
319  device_id,
320  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc),
321  row_set_mem_owner_,
322  executor->blockSize(),
323  executor->gridSize()));
324  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
325  executor->plan_state_->init_agg_vals_,
326  getVarlenOutputInfo());
327  for (size_t j = 1; j < step; ++j) {
328  result_sets_.emplace_back(nullptr);
329  }
330  }
331 }
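// Each buffer allocated in the loop above is laid out as
//   [ index_buffer_qw * sizeof(int64_t) ]  present only for GPU sort with keyless hash
//   [ group_buffer_size bytes           ]  keys plus aggregate slots
// and, unless group initialization is deferred (lazyInitGroups), is either copied
// from the pre-initialized template or initialized in place. With GPU shared-memory
// execution (threadsShareMemory()), only every executor->blockSize()-th slot of
// group_by_buffers_ points at real storage; the intervening slots and their
// ResultSets are nullptr placeholders.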
332 
333 // Table functions execution constructor
334 QueryMemoryInitializer::QueryMemoryInitializer(
335  const TableFunctionExecutionUnit& exe_unit,
336  const QueryMemoryDescriptor& query_mem_desc,
337  const int device_id,
338  const ExecutorDeviceType device_type,
339  const int64_t num_rows,
340  const std::vector<std::vector<const int8_t*>>& col_buffers,
341  const std::vector<std::vector<uint64_t>>& frag_offsets,
342  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
343  DeviceAllocator* device_allocator,
344  const Executor* executor)
345  : num_rows_(num_rows)
346  , row_set_mem_owner_(row_set_mem_owner)
347  , init_agg_vals_(init_agg_val_vec(exe_unit.target_exprs, {}, query_mem_desc))
348  , num_buffers_(1)
349  , varlen_output_buffer_(0)
350  , varlen_output_buffer_host_ptr_(nullptr)
351  , count_distinct_bitmap_mem_(0)
352  , count_distinct_bitmap_mem_bytes_(0)
353  , count_distinct_bitmap_crt_ptr_(nullptr)
354  , count_distinct_bitmap_host_mem_(nullptr)
355  , device_allocator_(device_allocator)
356  , thread_idx_(0) {
357  // Table functions output columnar, basically treat this as a projection
358  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
359  if (consistent_frag_sizes.empty()) {
360  // No fragments in the input, no underlying buffers will be needed.
361  return;
362  }
363 
364  const size_t num_columns =
365  query_mem_desc.getBufferColSlotCount(); // shouldn't we use getColCount() ???
366  size_t total_group_by_buffer_size{0};
367  for (size_t i = 0; i < num_columns; ++i) {
368  auto ti = exe_unit.target_exprs[i]->get_type_info();
369  if (ti.is_array()) {
370  // See TableFunctionManager.h for info regarding flatbuffer
371  // memory managment.
372  auto slot_idx = query_mem_desc.getSlotIndexForSingleSlotCol(i);
373  int64_t flatbuffer_size = query_mem_desc.getFlatBufferSize(slot_idx);
374  total_group_by_buffer_size =
375  align_to_int64(total_group_by_buffer_size + flatbuffer_size);
376  } else {
377  const size_t col_width = ti.get_size();
378  const size_t group_buffer_size = num_rows_ * col_width;
379  total_group_by_buffer_size =
380  align_to_int64(total_group_by_buffer_size + group_buffer_size);
381  }
382  }
383 
384  CHECK_EQ(num_buffers_, size_t(1));
385  auto group_by_buffer = alloc_group_by_buffer(
386  total_group_by_buffer_size, nullptr, thread_idx_, row_set_mem_owner.get());
387  group_by_buffers_.push_back(group_by_buffer);
388 
389  const auto column_frag_offsets =
390  get_col_frag_offsets(exe_unit.target_exprs, frag_offsets);
391  const auto column_frag_sizes =
392  get_consistent_frags_sizes(exe_unit.target_exprs, consistent_frag_sizes);
393  result_sets_.emplace_back(
394  new ResultSet(target_exprs_to_infos(exe_unit.target_exprs, query_mem_desc),
395  /*col_lazy_fetch_info=*/{},
396  col_buffers,
397  column_frag_offsets,
398  column_frag_sizes,
399  device_type,
400  device_id,
401  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc),
402  row_set_mem_owner_,
403  executor->blockSize(),
404  executor->gridSize()));
405  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
406  init_agg_vals_);
407 }
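// The single table-function output buffer is laid out column by column: each
// non-array column occupies num_rows_ * col_width bytes rounded up to an int64_t
// boundary, while array columns occupy their FlatBuffer size instead. For example,
// 1,000 rows with an int32 and a double column need
// align_to_int64(1000 * 4) + align_to_int64(1000 * 8) = 4,000 + 8,000 bytes.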
408 
409 void QueryMemoryInitializer::initGroupByBuffer(
410  int64_t* buffer,
411  const RelAlgExecutionUnit& ra_exe_unit,
412  const QueryMemoryDescriptor& query_mem_desc,
413  const ExecutorDeviceType device_type,
414  const bool output_columnar,
415  const Executor* executor) {
416  if (output_columnar) {
417  initColumnarGroups(query_mem_desc, buffer, init_agg_vals_, executor);
418  } else {
419  auto rows_ptr = buffer;
420  auto actual_entry_count = query_mem_desc.getEntryCount();
421  const auto thread_count = device_type == ExecutorDeviceType::GPU
422  ? executor->blockSize() * executor->gridSize()
423  : 1;
424  auto warp_size =
425  query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
426  if (query_mem_desc.useStreamingTopN()) {
427  const auto node_count_size = thread_count * sizeof(int64_t);
428  memset(rows_ptr, 0, node_count_size);
429  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
430  const auto rows_offset = streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
431  memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
432  rows_ptr += rows_offset / sizeof(int64_t);
433  actual_entry_count = n * thread_count;
434  warp_size = 1;
435  }
436  initRowGroups(query_mem_desc,
437  rows_ptr,
438  init_agg_vals_,
439  actual_entry_count,
440  warp_size,
441  executor);
442  }
443 }
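// For streaming top-N the row buffer prepared above is laid out as
//   [ thread_count node counters ][ per-thread heaps ][ row storage ]
// The counters are zeroed, the heap region is filled with 0xFF bytes, and row
// initialization resumes at rows_offset with n * thread_count entries, where
// n = sort offset + limit.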
444 
445 void QueryMemoryInitializer::initRowGroups(const QueryMemoryDescriptor& query_mem_desc,
446  int64_t* groups_buffer,
447  const std::vector<int64_t>& init_vals,
448  const int32_t groups_buffer_entry_count,
449  const size_t warp_size,
450  const Executor* executor) {
451  const size_t key_count{query_mem_desc.getGroupbyColCount()};
452  const size_t row_size{query_mem_desc.getRowSize()};
453  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};
454 
455  auto agg_bitmap_size = allocateCountDistinctBuffers(query_mem_desc, true, executor);
456  auto mode_index_set = allocateModes(query_mem_desc, true, executor);
457  auto quantile_params = allocateTDigests(query_mem_desc, true, executor);
458  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
459 
460  const auto query_mem_desc_fixedup =
461  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);
462 
463  auto const is_true = [](auto const& x) { return static_cast<bool>(x); };
464  // not COUNT DISTINCT / APPROX_COUNT_DISTINCT / APPROX_QUANTILE / MODE:
465  // we fall back to the default implementation in those cases
466  if (!std::any_of(agg_bitmap_size.begin(), agg_bitmap_size.end(), is_true) &&
467  !std::any_of(quantile_params.begin(), quantile_params.end(), is_true) &&
468  mode_index_set.empty() && g_optimize_row_initialization) {
469  std::vector<int8_t> sample_row(row_size - col_base_off);
470 
471  initColumnsPerRow(query_mem_desc_fixedup,
472  sample_row.data(),
473  init_vals,
474  agg_bitmap_size,
475  mode_index_set,
476  quantile_params);
477 
478  if (query_mem_desc.hasKeylessHash()) {
479  CHECK(warp_size >= 1);
480  CHECK(key_count == 1 || warp_size == 1);
481  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
482  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
483  ++bin, buffer_ptr += row_size) {
484  memcpy(buffer_ptr + col_base_off, sample_row.data(), sample_row.size());
485  }
486  }
487  return;
488  }
489 
490  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
491  ++bin, buffer_ptr += row_size) {
492  memcpy(buffer_ptr + col_base_off, sample_row.data(), sample_row.size());
493  fill_empty_key(
494  buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
495  }
496  } else {
497  if (query_mem_desc.hasKeylessHash()) {
498  CHECK(warp_size >= 1);
499  CHECK(key_count == 1 || warp_size == 1);
500  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
501  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
502  ++bin, buffer_ptr += row_size) {
503  initColumnsPerRow(query_mem_desc_fixedup,
504  &buffer_ptr[col_base_off],
505  init_vals,
506  agg_bitmap_size,
507  mode_index_set,
508  quantile_params);
509  }
510  }
511  return;
512  }
513 
514  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
515  ++bin, buffer_ptr += row_size) {
516  fill_empty_key(
517  buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
518  initColumnsPerRow(query_mem_desc_fixedup,
519  &buffer_ptr[col_base_off],
520  init_vals,
521  agg_bitmap_size,
522  mode_index_set,
523  quantile_params);
524  }
525  }
526 }
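// Fast path above: when no slot needs a per-group allocation (COUNT DISTINCT
// bitmaps/sets, APPROX_QUANTILE TDigests, MODE state) and
// g_optimize_row_initialization is enabled, a single sample row is initialized once
// and memcpy'd into every bin; otherwise each bin is initialized individually so the
// per-group buffers can be allocated slot by slot.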
527 
528 namespace {
529 
530 template <typename T>
531 int8_t* initColumnarBuffer(T* buffer_ptr, const T init_val, const uint32_t entry_count) {
532  static_assert(sizeof(T) <= sizeof(int64_t), "Unsupported template type");
533  for (uint32_t i = 0; i < entry_count; ++i) {
534  buffer_ptr[i] = init_val;
535  }
536  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
537 }
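// Usage sketch: fill one columnar slot and advance to the next, e.g.
//   buffer_ptr = initColumnarBuffer<int32_t>(
//       reinterpret_cast<int32_t*>(buffer_ptr), init_val, entry_count);
// The returned pointer is just past the written column; callers then re-align it to
// an int64_t boundary before starting the next slot.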
538 
539 } // namespace
540 
541 void QueryMemoryInitializer::initColumnarGroups(
542  const QueryMemoryDescriptor& query_mem_desc,
543  int64_t* groups_buffer,
544  const std::vector<int64_t>& init_vals,
545  const Executor* executor) {
546  CHECK(groups_buffer);
547 
548  for (const auto target_expr : executor->plan_state_->target_exprs_) {
549  const auto agg_info = get_target_info(target_expr, g_bigint_count);
550  CHECK(!is_distinct_target(agg_info));
551  }
552  const int32_t agg_col_count = query_mem_desc.getSlotCount();
553  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
554 
555  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
556  if (!query_mem_desc.hasKeylessHash()) {
557  const size_t key_count{query_mem_desc.getGroupbyColCount()};
558  for (size_t i = 0; i < key_count; ++i) {
559  buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
560  EMPTY_KEY_64,
561  groups_buffer_entry_count);
562  }
563  }
564 
565  if (query_mem_desc.getQueryDescriptionType() != QueryDescriptionType::Projection) {
566  // initializing all aggregate columns:
567  int32_t init_val_idx = 0;
568  for (int32_t i = 0; i < agg_col_count; ++i) {
569  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
570  CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
571  switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
572  case 1:
573  buffer_ptr = initColumnarBuffer<int8_t>(
574  buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
575  break;
576  case 2:
577  buffer_ptr =
578  initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
579  init_vals[init_val_idx++],
580  groups_buffer_entry_count);
581  break;
582  case 4:
583  buffer_ptr =
584  initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
585  init_vals[init_val_idx++],
586  groups_buffer_entry_count);
587  break;
588  case 8:
589  buffer_ptr =
590  initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
591  init_vals[init_val_idx++],
592  groups_buffer_entry_count);
593  break;
594  case 0:
595  break;
596  default:
597  CHECK(false);
598  }
599 
600  buffer_ptr = align_to_int64(buffer_ptr);
601  }
602  }
603  }
604 }
605 
606 void QueryMemoryInitializer::initColumnsPerRow(
607  const QueryMemoryDescriptor& query_mem_desc,
608  int8_t* row_ptr,
609  const std::vector<int64_t>& init_vals,
610  const std::vector<int64_t>& bitmap_sizes,
611  const ModeIndexSet& mode_index_set,
612  const std::vector<QuantileParam>& quantile_params) {
613  int8_t* col_ptr = row_ptr;
614  size_t init_vec_idx = 0;
615  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
616  col_ptr += query_mem_desc.getNextColOffInBytesRowOnly(col_ptr, col_idx++)) {
617  const int64_t bm_sz{bitmap_sizes[col_idx]};
618  int64_t init_val{0};
619  if (bm_sz && query_mem_desc.isGroupBy()) {
620  // COUNT DISTINCT / APPROX_COUNT_DISTINCT
621  CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
622  sizeof(int64_t));
623  init_val =
624  bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
625  ++init_vec_idx;
626  } else if (query_mem_desc.isGroupBy() && quantile_params[col_idx]) {
627  auto const q = *quantile_params[col_idx];
628  // allocate for APPROX_QUANTILE only when slot is used
629  init_val = reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest(q));
630  ++init_vec_idx;
631  } else if (query_mem_desc.isGroupBy() && mode_index_set.count(col_idx)) {
632  init_val = reinterpret_cast<int64_t>(row_set_mem_owner_->allocateMode());
633  ++init_vec_idx;
634  } else {
635  if (query_mem_desc.getPaddedSlotWidthBytes(col_idx) > 0) {
636  CHECK_LT(init_vec_idx, init_vals.size());
637  init_val = init_vals[init_vec_idx++];
638  }
639  }
640  switch (query_mem_desc.getPaddedSlotWidthBytes(col_idx)) {
641  case 1:
642  *col_ptr = static_cast<int8_t>(init_val);
643  break;
644  case 2:
645  *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
646  break;
647  case 4:
648  *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
649  break;
650  case 8:
651  *reinterpret_cast<int64_t*>(col_ptr) = init_val;
652  break;
653  case 0:
654  continue;
655  default:
656  CHECK(false);
657  }
658  }
659 }
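// Note that for COUNT DISTINCT, APPROX_QUANTILE and MODE slots the 8-byte slot does
// not hold a numeric initial value: it stores a pointer (to a bitmap or unordered
// set, a TDigest, or an AggMode) reinterpreted as int64_t, which the aggregate
// functions dereference at execution time.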
660 
661 void QueryMemoryInitializer::allocateCountDistinctGpuMem(
662  const QueryMemoryDescriptor& query_mem_desc) {
663  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
664  return;
665  }
666  CHECK(device_allocator_);
667 
668  size_t total_bytes_per_entry{0};
669  const size_t num_count_distinct_descs =
670  query_mem_desc.getCountDistinctDescriptorsSize();
671  for (size_t i = 0; i < num_count_distinct_descs; i++) {
672  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
673  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
674  continue;
675  }
676  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
677  total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
678  }
679 
680  count_distinct_bitmap_mem_bytes_ =
681  total_bytes_per_entry * query_mem_desc.getEntryCount();
682  count_distinct_bitmap_mem_ = reinterpret_cast<CUdeviceptr>(
683  device_allocator_->alloc(count_distinct_bitmap_mem_bytes_));
684  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(count_distinct_bitmap_mem_),
685  count_distinct_bitmap_mem_bytes_);
686 
687  count_distinct_bitmap_crt_ptr_ = count_distinct_bitmap_host_mem_ =
688  row_set_mem_owner_->allocateCountDistinctBuffer(count_distinct_bitmap_mem_bytes_, thread_idx_);
689 }
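// A single contiguous device allocation of getEntryCount() * total_bytes_per_entry
// bytes backs all count-distinct bitmaps, zero-initialized above, with a host-side
// mirror tracked by count_distinct_bitmap_host_mem_; allocateCountDistinctBitmap
// below hands out per-group slices of the host copy by bumping
// count_distinct_bitmap_crt_ptr_.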
690 
691 // deferred is true for group by queries; initGroups will allocate a bitmap
692 // for each group slot
693 std::vector<int64_t> QueryMemoryInitializer::allocateCountDistinctBuffers(
694  const QueryMemoryDescriptor& query_mem_desc,
695  const bool deferred,
696  const Executor* executor) {
697  const size_t agg_col_count{query_mem_desc.getSlotCount()};
698  std::vector<int64_t> agg_bitmap_size(deferred ? agg_col_count : 0);
699 
700  CHECK_GE(agg_col_count, executor->plan_state_->target_exprs_.size());
701  for (size_t target_idx = 0; target_idx < executor->plan_state_->target_exprs_.size();
702  ++target_idx) {
703  const auto target_expr = executor->plan_state_->target_exprs_[target_idx];
704  const auto agg_info = get_target_info(target_expr, g_bigint_count);
705  if (is_distinct_target(agg_info)) {
706  CHECK(agg_info.is_agg &&
707  (agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kCOUNT_IF ||
708  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT));
709  CHECK(!agg_info.sql_type.is_varlen());
710 
711  const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
712  CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);
713 
714  CHECK_EQ(static_cast<size_t>(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)),
715  sizeof(int64_t));
716  const auto& count_distinct_desc =
717  query_mem_desc.getCountDistinctDescriptor(target_idx);
718  CHECK(count_distinct_desc.impl_type_ != CountDistinctImplType::Invalid);
719  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
720  const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
721  if (deferred) {
722  agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
723  } else {
724  init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
725  }
726  } else {
727  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::UnorderedSet);
728  if (deferred) {
729  agg_bitmap_size[agg_col_idx] = -1;
730  } else {
731  init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
732  }
733  }
734  }
735  }
736 
737  return agg_bitmap_size;
738 }
739 
740 int64_t QueryMemoryInitializer::allocateCountDistinctBitmap(const size_t bitmap_byte_sz) {
741  if (count_distinct_bitmap_host_mem_) {
742  CHECK(count_distinct_bitmap_crt_ptr_);
743  auto ptr = count_distinct_bitmap_crt_ptr_;
744  count_distinct_bitmap_crt_ptr_ += bitmap_byte_sz;
745  row_set_mem_owner_->addCountDistinctBuffer(
746  ptr, bitmap_byte_sz, /*physical_buffer=*/false);
747  return reinterpret_cast<int64_t>(ptr);
748  }
749  return reinterpret_cast<int64_t>(
750  row_set_mem_owner_->allocateCountDistinctBuffer(bitmap_byte_sz, thread_idx_));
751 }
752 
753 int64_t QueryMemoryInitializer::allocateCountDistinctSet() {
754  auto count_distinct_set = new CountDistinctSet();
755  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
756  return reinterpret_cast<int64_t>(count_distinct_set);
757 }
758 
759 namespace {
760 
761 void eachAggregateTargetIdxOfType(
762  std::vector<Analyzer::Expr*> const& target_exprs,
763  SQLAgg const agg_type,
764  std::function<void(Analyzer::AggExpr const*, size_t)> lambda) {
765  for (size_t target_idx = 0; target_idx < target_exprs.size(); ++target_idx) {
766  auto const target_expr = target_exprs[target_idx];
767  if (auto const* agg_expr = dynamic_cast<Analyzer::AggExpr const*>(target_expr)) {
768  if (agg_expr->get_aggtype() == agg_type) {
769  lambda(agg_expr, target_idx);
770  }
771  }
772  }
773 }
774 
775 } // namespace
776 
777 ModeIndexSet QueryMemoryInitializer::allocateModes(
778  const QueryMemoryDescriptor& query_mem_desc,
779  const bool deferred,
780  const Executor* executor) {
781  size_t const slot_count = query_mem_desc.getSlotCount();
782  CHECK_LE(executor->plan_state_->target_exprs_.size(), slot_count);
783  ModeIndexSet mode_index_set;
784 
785  eachAggregateTargetIdxOfType(
786  executor->plan_state_->target_exprs_,
787  kMODE,
788  [&](Analyzer::AggExpr const*, size_t const target_idx) {
789  size_t const agg_col_idx =
790  query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
791  CHECK_LT(agg_col_idx, slot_count);
792  if (deferred) {
793  mode_index_set.emplace(agg_col_idx);
794  } else {
795  AggMode* agg_mode = row_set_mem_owner_->allocateMode();
796  init_agg_vals_[agg_col_idx] = reinterpret_cast<int64_t>(agg_mode);
797  }
798  });
799  return mode_index_set;
800 }
801 
802 std::vector<QueryMemoryInitializer::QuantileParam>
803 QueryMemoryInitializer::allocateTDigests(const QueryMemoryDescriptor& query_mem_desc,
804  const bool deferred,
805  const Executor* executor) {
806  size_t const slot_count = query_mem_desc.getSlotCount();
807  CHECK_LE(executor->plan_state_->target_exprs_.size(), slot_count);
808  std::vector<QuantileParam> quantile_params(deferred ? slot_count : 0);
809 
810  eachAggregateTargetIdxOfType(
811  executor->plan_state_->target_exprs_,
812  kAPPROX_QUANTILE,
813  [&](Analyzer::AggExpr const* const agg_expr, size_t const target_idx) {
814  size_t const agg_col_idx =
815  query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
816  CHECK_LT(agg_col_idx, slot_count);
817  CHECK_EQ(static_cast<int8_t>(sizeof(int64_t)),
818  query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx));
819  auto const q_expr =
820  dynamic_cast<Analyzer::Constant const*>(agg_expr->get_arg1().get());
821  CHECK(q_expr);
822  auto const q = q_expr->get_constval().doubleval;
823  if (deferred) {
824  quantile_params[agg_col_idx] = q;
825  } else {
826  // allocate for APPROX_QUANTILE only when slot is used
827  init_agg_vals_[agg_col_idx] =
828  reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest(q));
829  }
830  });
831  return quantile_params;
832 }
833 
834 GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer(
835  const QueryMemoryDescriptor& query_mem_desc,
836  const int8_t* init_agg_vals_dev_ptr,
837  const size_t n,
838  const int device_id,
839  const unsigned block_size_x,
840  const unsigned grid_size_x) {
841 #ifdef HAVE_CUDA
843  const auto thread_count = block_size_x * grid_size_x;
844  const auto total_buff_size =
845  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
846  int8_t* dev_buffer = device_allocator_->alloc(total_buff_size);
847 
848  std::vector<int8_t*> dev_buffers(thread_count);
849 
850  for (size_t i = 0; i < thread_count; ++i) {
851  dev_buffers[i] = dev_buffer;
852  }
853 
854  auto dev_ptr = device_allocator_->alloc(thread_count * sizeof(int8_t*));
855  device_allocator_->copyToDevice(
856  dev_ptr, dev_buffers.data(), thread_count * sizeof(int8_t*));
857 
859 
860  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(dev_buffer),
861  thread_count * sizeof(int64_t));
862 
863  device_allocator_->setDeviceMem(
864  reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
865  (unsigned char)-1,
866  thread_count * n * sizeof(int64_t));
867 
868  init_group_by_buffer_on_device(
869  reinterpret_cast<int64_t*>(
870  dev_buffer + streaming_top_n::get_rows_offset_of_heaps(n, thread_count)),
871  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
872  n * thread_count,
873  query_mem_desc.getGroupbyColCount(),
874  query_mem_desc.getEffectiveKeyWidth(),
875  query_mem_desc.getRowSize() / sizeof(int64_t),
876  query_mem_desc.hasKeylessHash(),
877  1,
878  block_size_x,
879  grid_size_x);
880 
881  return {dev_ptr, dev_buffer};
882 #else
883  UNREACHABLE();
884  return {};
885 #endif
886 }
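// Device-side heap layout produced above, one heap per GPU thread:
//   [ thread_count node counters, zeroed ]
//   [ thread_count * n heap keys, set to 0xFF bytes ]
//   [ row storage initialized by init_group_by_buffer_on_device ]
// Every entry of dev_buffers points at the same base allocation; dev_ptr is the
// device-resident copy of that pointer table returned alongside dev_buffer.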
887 
888 GpuGroupByBuffers QueryMemoryInitializer::createAndInitializeGroupByBufferGpu(
889  const RelAlgExecutionUnit& ra_exe_unit,
890  const QueryMemoryDescriptor& query_mem_desc,
891  const int8_t* init_agg_vals_dev_ptr,
892  const int device_id,
893  const ExecutorDispatchMode dispatch_mode,
894  const unsigned block_size_x,
895  const unsigned grid_size_x,
896  const int8_t warp_size,
897  const bool can_sort_on_gpu,
898  const bool output_columnar,
899  RenderAllocator* render_allocator) {
900 #ifdef HAVE_CUDA
901  if (query_mem_desc.useStreamingTopN()) {
902  if (render_allocator) {
904  }
905  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
906  CHECK(!output_columnar);
907 
908  return prepareTopNHeapsDevBuffer(
909  query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
910  }
911 
912  auto dev_group_by_buffers =
913  create_dev_group_by_buffers(device_allocator_,
914  group_by_buffers_,
915  query_mem_desc,
916  block_size_x,
917  grid_size_x,
918  device_id,
919  dispatch_mode,
920  num_rows_,
921  can_sort_on_gpu,
922  false,
923  ra_exe_unit.use_bump_allocator,
924  query_mem_desc.hasVarlenOutput(),
925  render_allocator);
926  if (query_mem_desc.hasVarlenOutput()) {
927  CHECK(dev_group_by_buffers.varlen_output_buffer);
928  varlen_output_buffer_ =
929  reinterpret_cast<CUdeviceptr>(dev_group_by_buffers.varlen_output_buffer);
930  CHECK(query_mem_desc.varlenOutputBufferElemSize());
931  const size_t varlen_output_buf_bytes =
932  query_mem_desc.getEntryCount() *
933  query_mem_desc.varlenOutputBufferElemSize().value();
934  varlen_output_buffer_host_ptr_ =
935  row_set_mem_owner_->allocate(varlen_output_buf_bytes, thread_idx_);
936  CHECK(varlen_output_info_);
937  varlen_output_info_->gpu_start_address = static_cast<int64_t>(varlen_output_buffer_);
938  varlen_output_info_->cpu_buffer_ptr = varlen_output_buffer_host_ptr_;
939  }
940  if (render_allocator) {
941  CHECK_EQ(size_t(0), render_allocator->getAllocatedSize() % 8);
942  }
943  if (query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU)) {
944  CHECK(!render_allocator);
945 
946  const size_t step{query_mem_desc.threadsShareMemory() ? block_size_x : 1};
947  size_t groups_buffer_size{query_mem_desc.getBufferSizeBytes(
948  ExecutorDeviceType::GPU, dev_group_by_buffers.entry_count)};
949  auto group_by_dev_buffer = dev_group_by_buffers.data;
950  const size_t col_count = query_mem_desc.getSlotCount();
951  int8_t* col_widths_dev_ptr{nullptr};
952  if (output_columnar) {
953  std::vector<int8_t> compact_col_widths(col_count);
954  for (size_t idx = 0; idx < col_count; ++idx) {
955  compact_col_widths[idx] = query_mem_desc.getPaddedSlotWidthBytes(idx);
956  }
957  col_widths_dev_ptr = device_allocator_->alloc(col_count * sizeof(int8_t));
958  device_allocator_->copyToDevice(
959  col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
960  }
961  const int8_t warp_count =
962  query_mem_desc.interleavedBins(ExecutorDeviceType::GPU) ? warp_size : 1;
963  const auto num_group_by_buffers =
964  getGroupByBuffersSize() - (query_mem_desc.hasVarlenOutput() ? 1 : 0);
965  for (size_t i = 0; i < num_group_by_buffers; i += step) {
966  if (output_columnar) {
967  init_columnar_group_by_buffer_on_device(
968  reinterpret_cast<int64_t*>(group_by_dev_buffer),
969  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
970  dev_group_by_buffers.entry_count,
971  query_mem_desc.getGroupbyColCount(),
972  col_count,
973  col_widths_dev_ptr,
974  /*need_padding = */ true,
975  query_mem_desc.hasKeylessHash(),
976  sizeof(int64_t),
977  block_size_x,
978  grid_size_x);
979  } else {
980  init_group_by_buffer_on_device(
981  reinterpret_cast<int64_t*>(group_by_dev_buffer),
982  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
983  dev_group_by_buffers.entry_count,
984  query_mem_desc.getGroupbyColCount(),
985  query_mem_desc.getEffectiveKeyWidth(),
986  query_mem_desc.getRowSize() / sizeof(int64_t),
987  query_mem_desc.hasKeylessHash(),
988  warp_count,
989  block_size_x,
990  grid_size_x);
991  }
992  group_by_dev_buffer += groups_buffer_size;
993  }
994  }
995  return dev_group_by_buffers;
996 #else
997  UNREACHABLE();
998  return {};
999 #endif
1000 }
1001 
1002 GpuGroupByBuffers QueryMemoryInitializer::setupTableFunctionGpuBuffers(
1003  const QueryMemoryDescriptor& query_mem_desc,
1004  const int device_id,
1005  const unsigned block_size_x,
1006  const unsigned grid_size_x,
1007  const bool zero_initialize_buffers) {
1008  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
1009  CHECK_GT(num_columns, size_t(0));
1010  size_t total_group_by_buffer_size{0};
1011  const auto col_slot_context = query_mem_desc.getColSlotContext();
1012 
1013  std::vector<size_t> col_byte_offsets;
1014  col_byte_offsets.reserve(num_columns);
1015 
1016  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
1017  const size_t col_width = col_slot_context.getSlotInfo(col_idx).logical_size;
1018  size_t group_buffer_size = num_rows_ * col_width;
1019  col_byte_offsets.emplace_back(total_group_by_buffer_size);
1020  total_group_by_buffer_size =
1021  align_to_int64(total_group_by_buffer_size + group_buffer_size);
1022  }
1023 
1024  int8_t* dev_buffers_allocation{nullptr};
1025  dev_buffers_allocation = device_allocator_->alloc(total_group_by_buffer_size);
1026  CHECK(dev_buffers_allocation);
1027  if (zero_initialize_buffers) {
1028  device_allocator_->zeroDeviceMem(dev_buffers_allocation, total_group_by_buffer_size);
1029  }
1030 
1031  auto dev_buffers_mem = dev_buffers_allocation;
1032  std::vector<int8_t*> dev_buffers(num_columns);
1033  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
1034  dev_buffers[col_idx] = dev_buffers_allocation + col_byte_offsets[col_idx];
1035  }
1036  auto dev_ptrs = device_allocator_->alloc(num_columns * sizeof(CUdeviceptr));
1037  device_allocator_->copyToDevice(
1038  dev_ptrs, dev_buffers.data(), num_columns * sizeof(CUdeviceptr));
1039 
1040  return {dev_ptrs, dev_buffers_mem, (size_t)num_rows_};
1041 }
1042 
1043 void QueryMemoryInitializer::copyFromTableFunctionGpuBuffers(
1044  Data_Namespace::DataMgr* data_mgr,
1045  const QueryMemoryDescriptor& query_mem_desc,
1046  const size_t entry_count,
1047  const GpuGroupByBuffers& gpu_group_by_buffers,
1048  const int device_id,
1049  const unsigned block_size_x,
1050  const unsigned grid_size_x) {
1051  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
1052 
1053  int8_t* dev_buffer = gpu_group_by_buffers.data;
1054  int8_t* host_buffer = reinterpret_cast<int8_t*>(group_by_buffers_[0]);
1055 
1056  const size_t original_entry_count = gpu_group_by_buffers.entry_count;
1057  CHECK_LE(entry_count, original_entry_count);
1058  size_t output_device_col_offset{0};
1059  size_t output_host_col_offset{0};
1060 
1061  const auto col_slot_context = query_mem_desc.getColSlotContext();
1062 
1063  auto allocator = std::make_unique<CudaAllocator>(
1064  data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));
1065 
1066  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
1067  const size_t col_width = col_slot_context.getSlotInfo(col_idx).logical_size;
1068  const size_t output_device_col_size = original_entry_count * col_width;
1069  const size_t output_host_col_size = entry_count * col_width;
1070  allocator->copyFromDevice(host_buffer + output_host_col_offset,
1071  dev_buffer + output_device_col_offset,
1072  output_host_col_size);
1073  output_device_col_offset =
1074  align_to_int64(output_device_col_offset + output_device_col_size);
1075  output_host_col_offset =
1076  align_to_int64(output_host_col_offset + output_host_col_size);
1077  }
1078 }
1079 
1080 size_t QueryMemoryInitializer::computeNumberOfBuffers(
1081  const QueryMemoryDescriptor& query_mem_desc,
1082  const ExecutorDeviceType device_type,
1083  const Executor* executor) const {
1084  return device_type == ExecutorDeviceType::CPU
1085  ? 1
1086  : executor->blockSize() *
1087  (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
1088 }
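// Example: CPU execution always uses a single buffer; a GPU with blockSize() == 1024
// and gridSize() == 16 uses 1,024 buffers when blocks share memory and
// 16,384 (1024 * 16) when they do not.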
1089 
1090 namespace {
1091 
1092 // in-place compaction of output buffer
1093 void compact_projection_buffer_for_cpu_columnar(
1094  const QueryMemoryDescriptor& query_mem_desc,
1095  int8_t* projection_buffer,
1096  const size_t projection_count) {
1097  // the first column (row indices) remains unchanged.
1098  CHECK(projection_count <= query_mem_desc.getEntryCount());
1099  constexpr size_t row_index_width = sizeof(int64_t);
1100  size_t buffer_offset1{projection_count * row_index_width};
1101  // other columns are actual non-lazy columns for the projection:
1102  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
1103  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
1104  auto column_proj_size =
1105  projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
1106  auto buffer_offset2 = query_mem_desc.getColOffInBytes(i);
1107  if (buffer_offset1 + column_proj_size >= buffer_offset2) {
1108  // overlapping
1109  std::memmove(projection_buffer + buffer_offset1,
1110  projection_buffer + buffer_offset2,
1111  column_proj_size);
1112  } else {
1113  std::memcpy(projection_buffer + buffer_offset1,
1114  projection_buffer + buffer_offset2,
1115  column_proj_size);
1116  }
1117  buffer_offset1 += align_to_int64(column_proj_size);
1118  }
1119  }
1120 }
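// Example: with an entry count of 1M but only 1,000 projected rows, each column's
// first 1,000 values are slid down so they follow the unchanged row-index column
// contiguously; std::memmove is used whenever the packed destination still overlaps
// the column's original location.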
1121 
1122 } // namespace
1123 
1124 void QueryMemoryInitializer::compactProjectionBuffersCpu(
1125  const QueryMemoryDescriptor& query_mem_desc,
1126  const size_t projection_count) {
1127  const auto num_allocated_rows =
1128  std::min(projection_count, query_mem_desc.getEntryCount());
1129  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1130 
1131  // copy the results from the main buffer into projection_buffer
1132  compact_projection_buffer_for_cpu_columnar(
1133  query_mem_desc,
1134  reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
1135  num_allocated_rows);
1136 
1137  // update the entry count for the result set, and its underlying storage
1138  CHECK(!result_sets_.empty());
1139  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
1140 }
1141 
1142 void QueryMemoryInitializer::compactProjectionBuffersGpu(
1143  const QueryMemoryDescriptor& query_mem_desc,
1144  Data_Namespace::DataMgr* data_mgr,
1145  const GpuGroupByBuffers& gpu_group_by_buffers,
1146  const size_t projection_count,
1147  const int device_id) {
1148  // store total number of allocated rows:
1149  const auto num_allocated_rows =
1150  std::min(projection_count, query_mem_desc.getEntryCount());
1151 
1152  // copy the results from the main buffer into projection_buffer
1153  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1154  copy_projection_buffer_from_gpu_columnar(
1155  data_mgr,
1156  gpu_group_by_buffers,
1157  query_mem_desc,
1158  reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
1159  num_allocated_rows,
1160  device_id);
1161 
1162  // update the entry count for the result set, and its underlying storage
1163  CHECK(!result_sets_.empty());
1164  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
1165 }
1166 
1167 void QueryMemoryInitializer::copyGroupByBuffersFromGpu(
1168  DeviceAllocator& device_allocator,
1169  const QueryMemoryDescriptor& query_mem_desc,
1170  const size_t entry_count,
1171  const GpuGroupByBuffers& gpu_group_by_buffers,
1172  const RelAlgExecutionUnit* ra_exe_unit,
1173  const unsigned block_size_x,
1174  const unsigned grid_size_x,
1175  const int device_id,
1176  const bool prepend_index_buffer) const {
1177  const auto thread_count = block_size_x * grid_size_x;
1178 
1179  size_t total_buff_size{0};
1180  if (ra_exe_unit && query_mem_desc.useStreamingTopN()) {
1181  const size_t n = ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit;
1182  total_buff_size =
1183  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
1184  } else {
1185  total_buff_size =
1186  query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
1187  }
1188  copy_group_by_buffers_from_gpu(device_allocator,
1189  group_by_buffers_,
1190  total_buff_size,
1191  gpu_group_by_buffers.data,
1192  query_mem_desc,
1193  block_size_x,
1194  grid_size_x,
1195  device_id,
1196  prepend_index_buffer,
1197  query_mem_desc.hasVarlenOutput());
1198 }
1199 
1200 void QueryMemoryInitializer::applyStreamingTopNOffsetCpu(
1201  const QueryMemoryDescriptor& query_mem_desc,
1202  const RelAlgExecutionUnit& ra_exe_unit) {
1203  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1204  CHECK_EQ(group_by_buffers_.size(), buffer_start_idx + 1);
1205 
1206  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
1207  group_by_buffers_[buffer_start_idx],
1208  query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
1209  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit,
1210  1);
1211  CHECK_EQ(rows_copy.size(),
1212  query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
1213  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
1214 }
1215 
1216 void QueryMemoryInitializer::applyStreamingTopNOffsetGpu(
1217  Data_Namespace::DataMgr* data_mgr,
1218  const QueryMemoryDescriptor& query_mem_desc,
1219  const GpuGroupByBuffers& gpu_group_by_buffers,
1220  const RelAlgExecutionUnit& ra_exe_unit,
1221  const unsigned total_thread_count,
1222  const int device_id) {
1223 #ifdef HAVE_CUDA
1225  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1226 
1227  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
1228  data_mgr,
1229  reinterpret_cast<int64_t*>(gpu_group_by_buffers.data),
1230  ra_exe_unit,
1231  query_mem_desc,
1232  total_thread_count,
1233  device_id);
1234  CHECK_EQ(
1235  rows_copy.size(),
1236  static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
1237  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
1238 #else
1239  UNREACHABLE();
1240 #endif
1241 }
1242 
1243 std::shared_ptr<VarlenOutputInfo> QueryMemoryInitializer::getVarlenOutputInfo() {
1244  if (varlen_output_info_) {
1245  return varlen_output_info_;
1246  }
1247 
1248  // shared_ptr so that both the ResultSet and QMI can hold on to the varlen info object
1249  // and update it as needed
1250  varlen_output_info_ = std::make_shared<VarlenOutputInfo>(VarlenOutputInfo{
1251  static_cast<int64_t>(varlen_output_buffer_), varlen_output_buffer_host_ptr_});
1252  return varlen_output_info_;
1253 }