QueryMemoryInitializer.cpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "QueryMemoryInitializer.h"
18 
20 #include "Execute.h"
21 #include "GpuInitGroups.h"
22 #include "GpuMemUtils.h"
23 #include "Logger/Logger.h"
26 #include "ResultSet.h"
27 #include "StreamingTopN.h"
28 #include "Utils/FlatBuffer.h"
29 
30 #include <Shared/checked_alloc.h>
31 
32 // 8 GB, the limit of perfect hash group by under normal conditions
33 int64_t g_bitmap_memory_limit{8LL * 1000 * 1000 * 1000};
34 
35 namespace {
36 
37 void check_total_bitmap_memory(const QueryMemoryDescriptor& query_mem_desc) {
38  const int32_t groups_buffer_entry_count = query_mem_desc.getEntryCount();
39  checked_int64_t total_bytes_per_group = 0;
40  const size_t num_count_distinct_descs =
41  query_mem_desc.getCountDistinctDescriptorsSize();
42  for (size_t i = 0; i < num_count_distinct_descs; i++) {
43  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
44  if (count_distinct_desc.impl_type_ != CountDistinctImplType::Bitmap) {
45  continue;
46  }
47  total_bytes_per_group += count_distinct_desc.bitmapPaddedSizeBytes();
48  }
49  int64_t total_bytes{0};
50  // Using OutOfHostMemory until we can verify that SlabTooBig would also be properly
51  // caught
52  try {
53  total_bytes = static_cast<int64_t>(total_bytes_per_group * groups_buffer_entry_count);
54  } catch (...) {
55  // Absurd amount of memory, merely computing the number of bits overflows int64_t.
56  // Don't bother to report the real amount, this is unlikely to ever happen.
57  throw OutOfHostMemory(std::numeric_limits<int64_t>::max() / 8);
58  }
59  if (total_bytes >= g_bitmap_memory_limit) {
60  throw OutOfHostMemory(total_bytes);
61  }
62 }
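// Illustrative worked example for check_total_bitmap_memory() above, under
// assumed sizes (not taken from any real query): with
// query_mem_desc.getEntryCount() == 2'000'000 groups and one
// APPROX_COUNT_DISTINCT target whose bitmapPaddedSizeBytes() == 8192,
//   total_bytes = 8192 * 2'000'000 = 16'384'000'000 bytes (~16.4 GB),
// which exceeds g_bitmap_memory_limit (8'000'000'000 bytes), so OutOfHostMemory
// is thrown before any group-by buffers are allocated.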
63 
64 int64_t* alloc_group_by_buffer(const size_t numBytes,
65  RenderAllocatorMap* render_allocator_map,
66  const size_t thread_idx,
67  RowSetMemoryOwner* mem_owner) {
68  if (render_allocator_map) {
69  // NOTE(adb): If we got here, we are performing an in-situ rendering query and are not
70  // using CUDA buffers. Therefore we need to allocate result set storage using CPU
71  // memory.
72  const auto gpu_idx = 0; // Only 1 GPU supported in CUDA-disabled rendering mode
73  auto render_allocator_ptr = render_allocator_map->getRenderAllocator(gpu_idx);
74  return reinterpret_cast<int64_t*>(render_allocator_ptr->alloc(numBytes));
75  } else {
76  return reinterpret_cast<int64_t*>(mem_owner->allocate(numBytes, thread_idx));
77  }
78 }
79 
80 inline int64_t get_consistent_frag_size(const std::vector<uint64_t>& frag_offsets) {
81  if (frag_offsets.size() < 2) {
82  return int64_t(-1);
83  }
84  const auto frag_size = frag_offsets[1] - frag_offsets[0];
85  for (size_t i = 2; i < frag_offsets.size(); ++i) {
86  const auto curr_size = frag_offsets[i] - frag_offsets[i - 1];
87  if (curr_size != frag_size) {
88  return int64_t(-1);
89  }
90  }
91  return !frag_size ? std::numeric_limits<int64_t>::max()
92  : static_cast<int64_t>(frag_size);
93 }
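// Illustrative examples of get_consistent_frag_size(), assuming small
// hand-picked offset vectors:
//   {0, 100, 200, 300} -> 100   (every fragment holds 100 rows)
//   {0, 100, 250}      -> -1    (sizes 100 and 150 differ)
//   {0}                -> -1    (fewer than two offsets)
//   {0, 0, 0}          -> std::numeric_limits<int64_t>::max()
//                               (a common size of 0 maps to the max sentinel)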
94 
95 inline std::vector<int64_t> get_consistent_frags_sizes(
96  const std::vector<std::vector<uint64_t>>& frag_offsets) {
97  if (frag_offsets.empty()) {
98  return {};
99  }
100  std::vector<int64_t> frag_sizes;
101  for (size_t tab_idx = 0; tab_idx < frag_offsets[0].size(); ++tab_idx) {
102  std::vector<uint64_t> tab_offs;
103  for (auto& offsets : frag_offsets) {
104  tab_offs.push_back(offsets[tab_idx]);
105  }
106  frag_sizes.push_back(get_consistent_frag_size(tab_offs));
107  }
108  return frag_sizes;
109 }
110 
111 inline std::vector<int64_t> get_consistent_frags_sizes(
112  const std::vector<Analyzer::Expr*>& target_exprs,
113  const std::vector<int64_t>& table_frag_sizes) {
114  std::vector<int64_t> col_frag_sizes;
115  for (auto expr : target_exprs) {
116  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
117  if (col_var->get_rte_idx() < 0) {
118  CHECK_EQ(-1, col_var->get_rte_idx());
119  col_frag_sizes.push_back(int64_t(-1));
120  } else {
121  col_frag_sizes.push_back(table_frag_sizes[col_var->get_rte_idx()]);
122  }
123  } else {
124  col_frag_sizes.push_back(int64_t(-1));
125  }
126  }
127  return col_frag_sizes;
128 }
129 
130 inline std::vector<std::vector<int64_t>> get_col_frag_offsets(
131  const std::vector<Analyzer::Expr*>& target_exprs,
132  const std::vector<std::vector<uint64_t>>& table_frag_offsets) {
133  std::vector<std::vector<int64_t>> col_frag_offsets;
134  for (auto& table_offsets : table_frag_offsets) {
135  std::vector<int64_t> col_offsets;
136  for (auto expr : target_exprs) {
137  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
138  if (col_var->get_rte_idx() < 0) {
139  CHECK_EQ(-1, col_var->get_rte_idx());
140  col_offsets.push_back(int64_t(-1));
141  } else {
142  CHECK_LT(static_cast<size_t>(col_var->get_rte_idx()), table_offsets.size());
143  col_offsets.push_back(
144  static_cast<int64_t>(table_offsets[col_var->get_rte_idx()]));
145  }
146  } else {
147  col_offsets.push_back(int64_t(-1));
148  }
149  }
150  col_frag_offsets.push_back(col_offsets);
151  }
152  return col_frag_offsets;
153 }
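// Illustrative example of get_col_frag_offsets(), assuming two fragments over
// two input tables, table_frag_offsets == {{0, 0}, {100, 50}}, and two targets
// {a ColumnVar with rte_idx == 1, a non-column expression}:
//   result == {{0, -1}, {50, -1}}
// i.e. column targets pick the offset of the table they read from, per
// fragment, and non-column targets get -1.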
154 
155 // Return the RelAlg input index of outer_table_id based on ra_exe_unit.input_descs.
156 // Used by UNION queries to get the target_exprs corresponding to the current subquery.
157 int get_input_idx(RelAlgExecutionUnit const& ra_exe_unit, int const outer_table_id) {
158  auto match_table_id = [=](auto& desc) { return outer_table_id == desc.getTableId(); };
159  auto& input_descs = ra_exe_unit.input_descs;
160  auto itr = std::find_if(input_descs.begin(), input_descs.end(), match_table_id);
161  return itr == input_descs.end() ? 0 : itr->getNestLevel();
162 }
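// Illustrative example of get_input_idx(), assuming a UNION ALL unit whose
// input_descs describe table 10 at nest level 0 and table 20 at nest level 1:
// get_input_idx(ra_exe_unit, 20) == 1, while an outer_table_id that matches no
// input descriptor falls back to 0. The row-based constructor below uses a
// non-zero result to switch from target_exprs to target_exprs_union.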
163 
164 } // namespace
165 
166 // Row-based execution constructor
167 QueryMemoryInitializer::QueryMemoryInitializer(
168  const RelAlgExecutionUnit& ra_exe_unit,
169  const QueryMemoryDescriptor& query_mem_desc,
170  const int device_id,
171  const ExecutorDeviceType device_type,
172  const ExecutorDispatchMode dispatch_mode,
173  const bool output_columnar,
174  const bool sort_on_gpu,
175  const int outer_table_id,
176  const int64_t num_rows,
177  const std::vector<std::vector<const int8_t*>>& col_buffers,
178  const std::vector<std::vector<uint64_t>>& frag_offsets,
179  RenderAllocatorMap* render_allocator_map,
180  RenderInfo* render_info,
181  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
182  DeviceAllocator* device_allocator,
183  const size_t thread_idx,
184  const Executor* executor)
185  : num_rows_(num_rows)
186  , row_set_mem_owner_(row_set_mem_owner)
187  , init_agg_vals_(executor->plan_state_->init_agg_vals_)
188  , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
189  , varlen_output_buffer_(0)
190  , varlen_output_buffer_host_ptr_(nullptr)
191  , count_distinct_bitmap_mem_(0)
192  , count_distinct_bitmap_mem_bytes_(0)
193  , count_distinct_bitmap_crt_ptr_(nullptr)
194  , count_distinct_bitmap_host_mem_(nullptr)
195  , device_allocator_(device_allocator)
196  , thread_idx_(thread_idx) {
197  CHECK(!sort_on_gpu || output_columnar);
198 
199  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
200  if (consistent_frag_sizes.empty()) {
201  // No fragments in the input, no underlying buffers will be needed.
202  return;
203  }
204  if (!ra_exe_unit.use_bump_allocator) {
205  check_total_bitmap_memory(query_mem_desc);
206  }
207  if (device_type == ExecutorDeviceType::GPU) {
208  allocateCountDistinctGpuMem(query_mem_desc);
209  }
210 
211  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
212  allocateCountDistinctBuffers(query_mem_desc, false, executor);
213  allocateTDigests(query_mem_desc, false, executor);
214  if (render_info && render_info->useCudaBuffers()) {
215  return;
216  }
217  }
218 
219  if (ra_exe_unit.estimator) {
220  return;
221  }
222 
223  const auto thread_count = device_type == ExecutorDeviceType::GPU
224  ? executor->blockSize() * executor->gridSize()
225  : 1;
226 
227  size_t group_buffer_size{0};
228  if (ra_exe_unit.use_bump_allocator) {
229  // For kernel per fragment execution, just allocate a buffer equivalent to the size of
230  // the fragment
231  if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
232  group_buffer_size = num_rows * query_mem_desc.getRowSize();
233  } else {
234  // otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size
235  group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
236  }
237  } else {
238  group_buffer_size =
239  query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
240  }
241  CHECK_GE(group_buffer_size, size_t(0));
242 
243  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
244  int64_t* group_by_buffer_template{nullptr};
245  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
246  group_by_buffer_template = reinterpret_cast<int64_t*>(
247  row_set_mem_owner_->allocate(group_buffer_size, thread_idx_));
248  initGroupByBuffer(group_by_buffer_template,
249  ra_exe_unit,
250  query_mem_desc,
251  device_type,
252  output_columnar,
253  executor);
254  }
255 
256  if (query_mem_desc.interleavedBins(device_type)) {
257  CHECK(query_mem_desc.hasKeylessHash());
258  }
259 
260  const auto step = device_type == ExecutorDeviceType::GPU &&
261  query_mem_desc.threadsShareMemory() &&
262  query_mem_desc.isGroupBy()
263  ? executor->blockSize()
264  : size_t(1);
265  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
266  query_mem_desc.hasKeylessHash()
267  ? query_mem_desc.getEntryCount()
268  : size_t(0);
269  const auto actual_group_buffer_size =
270  group_buffer_size + index_buffer_qw * sizeof(int64_t);
271  CHECK_GE(actual_group_buffer_size, group_buffer_size);
272 
273  if (query_mem_desc.hasVarlenOutput()) {
274  const auto varlen_buffer_elem_size_opt = query_mem_desc.varlenOutputBufferElemSize();
275  CHECK(varlen_buffer_elem_size_opt); // TODO(adb): relax
276  auto varlen_output_buffer = reinterpret_cast<int64_t*>(row_set_mem_owner_->allocate(
277  query_mem_desc.getEntryCount() * varlen_buffer_elem_size_opt.value()));
278  num_buffers_ += 1;
279  group_by_buffers_.push_back(varlen_output_buffer);
280  }
281 
282  for (size_t i = 0; i < group_buffers_count; i += step) {
283  auto group_by_buffer = alloc_group_by_buffer(actual_group_buffer_size,
284  render_allocator_map,
285  thread_idx_,
286  row_set_mem_owner_.get());
287  if (!query_mem_desc.lazyInitGroups(device_type)) {
288  if (group_by_buffer_template) {
289  memcpy(group_by_buffer + index_buffer_qw,
290  group_by_buffer_template,
291  group_buffer_size);
292  } else {
293  initGroupByBuffer(group_by_buffer + index_buffer_qw,
294  ra_exe_unit,
295  query_mem_desc,
296  device_type,
297  output_columnar,
298  executor);
299  }
300  }
301  group_by_buffers_.push_back(group_by_buffer);
302  for (size_t j = 1; j < step; ++j) {
303  group_by_buffers_.push_back(nullptr);
304  }
305  const bool use_target_exprs_union =
306  ra_exe_unit.union_all && get_input_idx(ra_exe_unit, outer_table_id);
307  const auto& target_exprs = use_target_exprs_union ? ra_exe_unit.target_exprs_union
308  : ra_exe_unit.target_exprs;
309  const auto column_frag_offsets = get_col_frag_offsets(target_exprs, frag_offsets);
310  const auto column_frag_sizes =
311  get_consistent_frags_sizes(target_exprs, consistent_frag_sizes);
312 
313  result_sets_.emplace_back(
314  new ResultSet(target_exprs_to_infos(target_exprs, query_mem_desc),
315  executor->getColLazyFetchInfo(target_exprs),
316  col_buffers,
317  column_frag_offsets,
318  column_frag_sizes,
319  device_type,
320  device_id,
321  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc),
322  row_set_mem_owner_,
323  executor->getCatalog(),
324  executor->blockSize(),
325  executor->gridSize()));
326  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
327  executor->plan_state_->init_agg_vals_,
328  getVarlenOutputInfo());
329  for (size_t j = 1; j < step; ++j) {
330  result_sets_.emplace_back(nullptr);
331  }
332  }
333 }
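// Illustrative sketch of the buffer layout built by the loop above, assuming a
// GPU group-by query with blockSize() == 2, gridSize() == 2,
// threadsShareMemory() == true, blocksShareMemory() == false and no varlen
// output: num_buffers_ == 4 and step == 2, so one buffer is allocated per
// block with nullptr placeholders for the remaining threads:
//   group_by_buffers_ == {buf0, nullptr, buf1, nullptr}
// result_sets_ is filled in the same pattern, one ResultSet per allocated
// buffer with nullptr entries in between.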
334 
335 // Table functions execution constructor
336 QueryMemoryInitializer::QueryMemoryInitializer(
337  const TableFunctionExecutionUnit& exe_unit,
338  const QueryMemoryDescriptor& query_mem_desc,
339  const int device_id,
340  const ExecutorDeviceType device_type,
341  const int64_t num_rows,
342  const std::vector<std::vector<const int8_t*>>& col_buffers,
343  const std::vector<std::vector<uint64_t>>& frag_offsets,
344  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
345  DeviceAllocator* device_allocator,
346  const Executor* executor)
347  : num_rows_(num_rows)
348  , row_set_mem_owner_(row_set_mem_owner)
349  , init_agg_vals_(init_agg_val_vec(exe_unit.target_exprs, {}, query_mem_desc))
350  , num_buffers_(1)
351  , varlen_output_buffer_(0)
352  , varlen_output_buffer_host_ptr_(nullptr)
353  , count_distinct_bitmap_mem_(0)
354  , count_distinct_bitmap_mem_bytes_(0)
355  , count_distinct_bitmap_crt_ptr_(nullptr)
356  , count_distinct_bitmap_host_mem_(nullptr)
357  , device_allocator_(device_allocator)
358  , thread_idx_(0) {
359  // Table functions output columnar, basically treat this as a projection
360  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
361  if (consistent_frag_sizes.empty()) {
362  // No fragments in the input, no underlying buffers will be needed.
363  return;
364  }
365 
366  const size_t num_columns =
367  query_mem_desc.getBufferColSlotCount(); // shouldn't we use getColCount() ???
368  size_t total_group_by_buffer_size{0};
369  for (size_t i = 0; i < num_columns; ++i) {
370  auto ti = exe_unit.target_exprs[i]->get_type_info();
371  if (ti.is_array()) {
372  // See TableFunctionManager.h for info regarding flatbuffer
373  // memory management.
374  auto slot_idx = query_mem_desc.getSlotIndexForSingleSlotCol(i);
375  int64_t flatbuffer_size = query_mem_desc.getFlatBufferSize(slot_idx);
376  total_group_by_buffer_size =
377  align_to_int64(total_group_by_buffer_size + flatbuffer_size);
378  } else {
379  const size_t col_width = ti.get_size();
380  const size_t group_buffer_size = num_rows_ * col_width;
381  total_group_by_buffer_size =
382  align_to_int64(total_group_by_buffer_size + group_buffer_size);
383  }
384  }
385 
386  CHECK_EQ(num_buffers_, size_t(1));
387  auto group_by_buffer = alloc_group_by_buffer(
388  total_group_by_buffer_size, nullptr, thread_idx_, row_set_mem_owner.get());
389  group_by_buffers_.push_back(group_by_buffer);
390 
391  const auto column_frag_offsets =
392  get_col_frag_offsets(exe_unit.target_exprs, frag_offsets);
393  const auto column_frag_sizes =
394  get_consistent_frags_sizes(exe_unit.target_exprs, consistent_frag_sizes);
395  result_sets_.emplace_back(
396  new ResultSet(target_exprs_to_infos(exe_unit.target_exprs, query_mem_desc),
397  /*col_lazy_fetch_info=*/{},
398  col_buffers,
399  column_frag_offsets,
400  column_frag_sizes,
401  device_type,
402  device_id,
403  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc),
404  row_set_mem_owner_,
405  executor->getCatalog(),
406  executor->blockSize(),
407  executor->gridSize()));
408  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
409  init_agg_vals_);
410 }
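// Illustrative sizing example for the table-function buffer above, assuming
// num_rows_ == 10 and three non-array output columns of widths 4, 8 and 2:
//   col0: 10 * 4 = 40  -> total = align_to_int64(0 + 40)   = 40
//   col1: 10 * 8 = 80  -> total = align_to_int64(40 + 80)  = 120
//   col2: 10 * 2 = 20  -> total = align_to_int64(120 + 20) = 144
// so a single 144-byte group-by buffer backs all three output columns.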
411 
412 void QueryMemoryInitializer::initGroupByBuffer(
413  int64_t* buffer,
414  const RelAlgExecutionUnit& ra_exe_unit,
415  const QueryMemoryDescriptor& query_mem_desc,
416  const ExecutorDeviceType device_type,
417  const bool output_columnar,
418  const Executor* executor) {
419  if (output_columnar) {
420  initColumnarGroups(query_mem_desc, buffer, init_agg_vals_, executor);
421  } else {
422  auto rows_ptr = buffer;
423  auto actual_entry_count = query_mem_desc.getEntryCount();
424  const auto thread_count = device_type == ExecutorDeviceType::GPU
425  ? executor->blockSize() * executor->gridSize()
426  : 1;
427  auto warp_size =
428  query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
429  if (query_mem_desc.useStreamingTopN()) {
430  const auto node_count_size = thread_count * sizeof(int64_t);
431  memset(rows_ptr, 0, node_count_size);
432  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
433  const auto rows_offset = streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
434  memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
435  rows_ptr += rows_offset / sizeof(int64_t);
436  actual_entry_count = n * thread_count;
437  warp_size = 1;
438  }
439  initRowGroups(query_mem_desc,
440  rows_ptr,
441  init_agg_vals_,
442  actual_entry_count,
443  warp_size,
444  executor);
445  }
446 }
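// Reading the streaming top-N branch above: the row-wise buffer begins with
// one zeroed int64 node count per thread, followed by the heap index area
// (set to 0xFF bytes) up to
// streaming_top_n::get_rows_offset_of_heaps(n, thread_count); only the space
// after that offset is initialized as ordinary rows, with an effective entry
// count of n * thread_count and interleaving (warp_size) disabled.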
447 
448 void QueryMemoryInitializer::initRowGroups(const QueryMemoryDescriptor& query_mem_desc,
449  int64_t* groups_buffer,
450  const std::vector<int64_t>& init_vals,
451  const int32_t groups_buffer_entry_count,
452  const size_t warp_size,
453  const Executor* executor) {
454  const size_t key_count{query_mem_desc.getGroupbyColCount()};
455  const size_t row_size{query_mem_desc.getRowSize()};
456  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};
457 
458  auto agg_bitmap_size = allocateCountDistinctBuffers(query_mem_desc, true, executor);
459  auto quantile_params = allocateTDigests(query_mem_desc, true, executor);
460  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
461 
462  const auto query_mem_desc_fixedup =
463  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);
464 
465  auto const is_true = [](auto const& x) { return static_cast<bool>(x); };
466  // Take the optimized path when there is no COUNT DISTINCT / APPROX_COUNT_DISTINCT /
467  // APPROX_QUANTILE target; otherwise fall back to the default per-row initialization.
468  if (!std::any_of(agg_bitmap_size.begin(), agg_bitmap_size.end(), is_true) &&
469  !std::any_of(quantile_params.begin(), quantile_params.end(), is_true) &&
470  g_optimize_row_initialization) {
471  std::vector<int8_t> sample_row(row_size - col_base_off);
472 
473  initColumnsPerRow(query_mem_desc_fixedup,
474  sample_row.data(),
475  init_vals,
476  agg_bitmap_size,
477  quantile_params);
478 
479  if (query_mem_desc.hasKeylessHash()) {
480  CHECK(warp_size >= 1);
481  CHECK(key_count == 1 || warp_size == 1);
482  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
483  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
484  ++bin, buffer_ptr += row_size) {
485  memcpy(buffer_ptr + col_base_off, sample_row.data(), sample_row.size());
486  }
487  }
488  return;
489  }
490 
491  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
492  ++bin, buffer_ptr += row_size) {
493  memcpy(buffer_ptr + col_base_off, sample_row.data(), sample_row.size());
494  fill_empty_key(
495  buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
496  }
497  } else {
498  if (query_mem_desc.hasKeylessHash()) {
499  CHECK(warp_size >= 1);
500  CHECK(key_count == 1 || warp_size == 1);
501  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
502  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
503  ++bin, buffer_ptr += row_size) {
504  initColumnsPerRow(query_mem_desc_fixedup,
505  &buffer_ptr[col_base_off],
506  init_vals,
507  agg_bitmap_size,
508  quantile_params);
509  }
510  }
511  return;
512  }
513 
514  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
515  ++bin, buffer_ptr += row_size) {
516  fill_empty_key(
517  buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
518  initColumnsPerRow(query_mem_desc_fixedup,
519  &buffer_ptr[col_base_off],
520  init_vals,
521  agg_bitmap_size,
522  quantile_params);
523  }
524  }
525 }
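// The g_optimize_row_initialization fast path above builds one prototype row
// and memcpy()s it into every bin instead of re-deriving each slot's init
// value per row. A minimal sketch of the same idea, with hypothetical names
// (row_payload_size, init_payload, etc. are illustrative only):
//
//   std::vector<int8_t> sample(row_payload_size);
//   init_payload(sample.data());                      // done once
//   for (int8_t* row = first_row; row != buf_end; row += row_size) {
//     std::memcpy(row + payload_offset, sample.data(), sample.size());
//   }
//
// The fast path is skipped when any slot needs a per-row allocation
// (COUNT DISTINCT bitmaps/sets, APPROX_QUANTILE t-digests), since a single
// shared prototype cannot hold per-row pointers.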
526 
527 namespace {
528 
529 template <typename T>
530 int8_t* initColumnarBuffer(T* buffer_ptr, const T init_val, const uint32_t entry_count) {
531  static_assert(sizeof(T) <= sizeof(int64_t), "Unsupported template type");
532  for (uint32_t i = 0; i < entry_count; ++i) {
533  buffer_ptr[i] = init_val;
534  }
535  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
536 }
537 
538 } // namespace
539 
540 void QueryMemoryInitializer::initColumnarGroups(
541  const QueryMemoryDescriptor& query_mem_desc,
542  int64_t* groups_buffer,
543  const std::vector<int64_t>& init_vals,
544  const Executor* executor) {
545  CHECK(groups_buffer);
546 
547  for (const auto target_expr : executor->plan_state_->target_exprs_) {
548  const auto agg_info = get_target_info(target_expr, g_bigint_count);
549  CHECK(!is_distinct_target(agg_info));
550  }
551  const int32_t agg_col_count = query_mem_desc.getSlotCount();
552  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
553 
554  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
555  if (!query_mem_desc.hasKeylessHash()) {
556  const size_t key_count{query_mem_desc.getGroupbyColCount()};
557  for (size_t i = 0; i < key_count; ++i) {
558  buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
559  EMPTY_KEY_64,
560  groups_buffer_entry_count);
561  }
562  }
563 
564  if (query_mem_desc.getQueryDescriptionType() != QueryDescriptionType::Projection) {
565  // initializing all aggregate columns:
566  int32_t init_val_idx = 0;
567  for (int32_t i = 0; i < agg_col_count; ++i) {
568  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
569  CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
570  switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
571  case 1:
572  buffer_ptr = initColumnarBuffer<int8_t>(
573  buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
574  break;
575  case 2:
576  buffer_ptr =
577  initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
578  init_vals[init_val_idx++],
579  groups_buffer_entry_count);
580  break;
581  case 4:
582  buffer_ptr =
583  initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
584  init_vals[init_val_idx++],
585  groups_buffer_entry_count);
586  break;
587  case 8:
588  buffer_ptr =
589  initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
590  init_vals[init_val_idx++],
591  groups_buffer_entry_count);
592  break;
593  case 0:
594  break;
595  default:
596  CHECK(false);
597  }
598 
599  buffer_ptr = align_to_int64(buffer_ptr);
600  }
601  }
602  }
603 }
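// Illustrative layout for initColumnarGroups() above, assuming
// getEntryCount() == 3, one group-by key column and two aggregate slots of
// padded widths 8 and 4:
//   [ EMPTY_KEY_64 x3 ][ init_vals[0] as int64 x3 ][ init_vals[1] as int32 x3 ][4B pad]
// with each column start rounded up to an int64 boundary via align_to_int64().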
604 
605 void QueryMemoryInitializer::initColumnsPerRow(
606  const QueryMemoryDescriptor& query_mem_desc,
607  int8_t* row_ptr,
608  const std::vector<int64_t>& init_vals,
609  const std::vector<int64_t>& bitmap_sizes,
610  const std::vector<QuantileParam>& quantile_params) {
611  int8_t* col_ptr = row_ptr;
612  size_t init_vec_idx = 0;
613  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
614  col_ptr += query_mem_desc.getNextColOffInBytesRowOnly(col_ptr, col_idx++)) {
615  const int64_t bm_sz{bitmap_sizes[col_idx]};
616  int64_t init_val{0};
617  if (bm_sz && query_mem_desc.isGroupBy()) {
618  // COUNT DISTINCT / APPROX_COUNT_DISTINCT
619  CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
620  sizeof(int64_t));
621  init_val =
622  bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
623  ++init_vec_idx;
624  } else if (query_mem_desc.isGroupBy() && quantile_params[col_idx]) {
625  auto const q = *quantile_params[col_idx];
626  // allocate for APPROX_QUANTILE only when slot is used
627  init_val = reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest(q));
628  ++init_vec_idx;
629  } else {
630  if (query_mem_desc.getPaddedSlotWidthBytes(col_idx) > 0) {
631  CHECK_LT(init_vec_idx, init_vals.size());
632  init_val = init_vals[init_vec_idx++];
633  }
634  }
635  switch (query_mem_desc.getPaddedSlotWidthBytes(col_idx)) {
636  case 1:
637  *col_ptr = static_cast<int8_t>(init_val);
638  break;
639  case 2:
640  *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
641  break;
642  case 4:
643  *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
644  break;
645  case 8:
646  *reinterpret_cast<int64_t*>(col_ptr) = init_val;
647  break;
648  case 0:
649  continue;
650  default:
651  CHECK(false);
652  }
653  }
654 }
655 
656 void QueryMemoryInitializer::allocateCountDistinctGpuMem(
657  const QueryMemoryDescriptor& query_mem_desc) {
658  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
659  return;
660  }
662 
663  size_t total_bytes_per_entry{0};
664  const size_t num_count_distinct_descs =
665  query_mem_desc.getCountDistinctDescriptorsSize();
666  for (size_t i = 0; i < num_count_distinct_descs; i++) {
667  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
668  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
669  continue;
670  }
671  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
672  total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
673  }
674 
675  count_distinct_bitmap_mem_bytes_ =
676  total_bytes_per_entry * query_mem_desc.getEntryCount();
677  count_distinct_bitmap_mem_ = reinterpret_cast<CUdeviceptr>(
678  device_allocator_->alloc(count_distinct_bitmap_mem_bytes_));
679  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(count_distinct_bitmap_mem_),
680  count_distinct_bitmap_mem_bytes_);
681 
684 }
685 
686 // deferred is true for group by queries; initGroups will allocate a bitmap
687 // for each group slot
688 std::vector<int64_t> QueryMemoryInitializer::allocateCountDistinctBuffers(
689  const QueryMemoryDescriptor& query_mem_desc,
690  const bool deferred,
691  const Executor* executor) {
692  const size_t agg_col_count{query_mem_desc.getSlotCount()};
693  std::vector<int64_t> agg_bitmap_size(deferred ? agg_col_count : 0);
694 
695  CHECK_GE(agg_col_count, executor->plan_state_->target_exprs_.size());
696  for (size_t target_idx = 0; target_idx < executor->plan_state_->target_exprs_.size();
697  ++target_idx) {
698  const auto target_expr = executor->plan_state_->target_exprs_[target_idx];
699  const auto agg_info = get_target_info(target_expr, g_bigint_count);
700  if (is_distinct_target(agg_info)) {
701  CHECK(agg_info.is_agg &&
702  (agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT));
703  CHECK(!agg_info.sql_type.is_varlen());
704 
705  const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
706  CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);
707 
708  CHECK_EQ(static_cast<size_t>(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)),
709  sizeof(int64_t));
710  const auto& count_distinct_desc =
711  query_mem_desc.getCountDistinctDescriptor(target_idx);
712  CHECK(count_distinct_desc.impl_type_ != CountDistinctImplType::Invalid);
713  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
714  const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
715  if (deferred) {
716  agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
717  } else {
718  init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
719  }
720  } else {
721  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::UnorderedSet);
722  if (deferred) {
723  agg_bitmap_size[agg_col_idx] = -1;
724  } else {
725  init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
726  }
727  }
728  }
729  }
730 
731  return agg_bitmap_size;
732 }
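// Illustrative contrast of the two modes of allocateCountDistinctBuffers()
// above, assuming slot 0 is a bitmap-based COUNT DISTINCT (padded size 1024
// bytes) and slot 1 is an UnorderedSet-based COUNT DISTINCT:
//   deferred == true  -> returns {1024, -1}; the bitmap/set itself is
//                        allocated later, per row, by initColumnsPerRow().
//   deferred == false -> returns an empty vector and instead stores freshly
//                        allocated bitmap/set pointers in init_agg_vals_[0]
//                        and init_agg_vals_[1].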
733 
734 int64_t QueryMemoryInitializer::allocateCountDistinctBitmap(const size_t bitmap_byte_sz) {
735  if (count_distinct_bitmap_host_mem_) {
736  CHECK(count_distinct_bitmap_crt_ptr_);
737  auto ptr = count_distinct_bitmap_crt_ptr_;
738  count_distinct_bitmap_crt_ptr_ += bitmap_byte_sz;
739  row_set_mem_owner_->addCountDistinctBuffer(
740  ptr, bitmap_byte_sz, /*physical_buffer=*/false);
741  return reinterpret_cast<int64_t>(ptr);
742  }
743  return reinterpret_cast<int64_t>(
744  row_set_mem_owner_->allocateCountDistinctBuffer(bitmap_byte_sz, thread_idx_));
745 }
746 
747 int64_t QueryMemoryInitializer::allocateCountDistinctSet() {
748  auto count_distinct_set = new CountDistinctSet();
749  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
750  return reinterpret_cast<int64_t>(count_distinct_set);
751 }
752 
753 std::vector<QueryMemoryInitializer::QuantileParam>
754 QueryMemoryInitializer::allocateTDigests(const QueryMemoryDescriptor& query_mem_desc,
755  const bool deferred,
756  const Executor* executor) {
757  size_t const slot_count = query_mem_desc.getSlotCount();
758  size_t const ntargets = executor->plan_state_->target_exprs_.size();
759  CHECK_GE(slot_count, ntargets);
760  std::vector<QuantileParam> quantile_params(deferred ? slot_count : 0);
761 
762  for (size_t target_idx = 0; target_idx < ntargets; ++target_idx) {
763  auto const target_expr = executor->plan_state_->target_exprs_[target_idx];
764  if (auto const agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr)) {
765  if (agg_expr->get_aggtype() == kAPPROX_QUANTILE) {
766  size_t const agg_col_idx =
767  query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
768  CHECK_LT(agg_col_idx, slot_count);
769  CHECK_EQ(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx),
770  static_cast<int8_t>(sizeof(int64_t)));
771  auto const q = agg_expr->get_arg1()->get_constval().doubleval;
772  if (deferred) {
773  quantile_params[agg_col_idx] = q;
774  } else {
775  // allocate for APPROX_QUANTILE only when slot is used
776  init_agg_vals_[agg_col_idx] =
777  reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest(q));
778  }
779  }
780  }
781  }
782  return quantile_params;
783 }
784 
785 GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer(
786  const QueryMemoryDescriptor& query_mem_desc,
787  const int8_t* init_agg_vals_dev_ptr,
788  const size_t n,
789  const int device_id,
790  const unsigned block_size_x,
791  const unsigned grid_size_x) {
792 #ifdef HAVE_CUDA
794  const auto thread_count = block_size_x * grid_size_x;
795  const auto total_buff_size =
796  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
797  int8_t* dev_buffer = device_allocator_->alloc(total_buff_size);
798 
799  std::vector<int8_t*> dev_buffers(thread_count);
800 
801  for (size_t i = 0; i < thread_count; ++i) {
802  dev_buffers[i] = dev_buffer;
803  }
804 
805  auto dev_ptr = device_allocator_->alloc(thread_count * sizeof(int8_t*));
806  device_allocator_->copyToDevice(
807  dev_ptr, dev_buffers.data(), thread_count * sizeof(int8_t*));
808 
810 
811  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(dev_buffer),
812  thread_count * sizeof(int64_t));
813 
814  device_allocator_->setDeviceMem(
815  reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
816  (unsigned char)-1,
817  thread_count * n * sizeof(int64_t));
818 
819  init_group_by_buffer_on_device(
820  reinterpret_cast<int64_t*>(
821  dev_buffer + streaming_top_n::get_rows_offset_of_heaps(n, thread_count)),
822  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
823  n * thread_count,
824  query_mem_desc.getGroupbyColCount(),
825  query_mem_desc.getEffectiveKeyWidth(),
826  query_mem_desc.getRowSize() / sizeof(int64_t),
827  query_mem_desc.hasKeylessHash(),
828  1,
829  block_size_x,
830  grid_size_x);
831 
832  return {dev_ptr, dev_buffer};
833 #else
834  UNREACHABLE();
835  return {};
836 #endif
837 }
838 
839 GpuGroupByBuffers QueryMemoryInitializer::createAndInitializeGroupByBufferGpu(
840  const RelAlgExecutionUnit& ra_exe_unit,
841  const QueryMemoryDescriptor& query_mem_desc,
842  const int8_t* init_agg_vals_dev_ptr,
843  const int device_id,
844  const ExecutorDispatchMode dispatch_mode,
845  const unsigned block_size_x,
846  const unsigned grid_size_x,
847  const int8_t warp_size,
848  const bool can_sort_on_gpu,
849  const bool output_columnar,
850  RenderAllocator* render_allocator) {
851 #ifdef HAVE_CUDA
852  if (query_mem_desc.useStreamingTopN()) {
853  if (render_allocator) {
855  }
856  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
857  CHECK(!output_columnar);
858 
859  return prepareTopNHeapsDevBuffer(
860  query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
861  }
862 
863  auto dev_group_by_buffers =
864  create_dev_group_by_buffers(device_allocator_,
865  group_by_buffers_,
866  query_mem_desc,
867  block_size_x,
868  grid_size_x,
869  device_id,
870  dispatch_mode,
871  num_rows_,
872  can_sort_on_gpu,
873  false,
874  ra_exe_unit.use_bump_allocator,
875  query_mem_desc.hasVarlenOutput(),
876  render_allocator);
877  if (query_mem_desc.hasVarlenOutput()) {
878  CHECK(dev_group_by_buffers.varlen_output_buffer);
879  varlen_output_buffer_ =
880  reinterpret_cast<CUdeviceptr>(dev_group_by_buffers.varlen_output_buffer);
881  CHECK(query_mem_desc.varlenOutputBufferElemSize());
882  const size_t varlen_output_buf_bytes =
883  query_mem_desc.getEntryCount() *
884  query_mem_desc.varlenOutputBufferElemSize().value();
885  varlen_output_buffer_host_ptr_ =
886  row_set_mem_owner_->allocate(varlen_output_buf_bytes, thread_idx_);
888  varlen_output_info_->gpu_start_address = static_cast<int64_t>(varlen_output_buffer_);
890  }
891  if (render_allocator) {
892  CHECK_EQ(size_t(0), render_allocator->getAllocatedSize() % 8);
893  }
894  if (query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU)) {
895  CHECK(!render_allocator);
896 
897  const size_t step{query_mem_desc.threadsShareMemory() ? block_size_x : 1};
898  size_t groups_buffer_size{query_mem_desc.getBufferSizeBytes(
899  ExecutorDeviceType::GPU, dev_group_by_buffers.entry_count)};
900  auto group_by_dev_buffer = dev_group_by_buffers.data;
901  const size_t col_count = query_mem_desc.getSlotCount();
902  int8_t* col_widths_dev_ptr{nullptr};
903  if (output_columnar) {
904  std::vector<int8_t> compact_col_widths(col_count);
905  for (size_t idx = 0; idx < col_count; ++idx) {
906  compact_col_widths[idx] = query_mem_desc.getPaddedSlotWidthBytes(idx);
907  }
908  col_widths_dev_ptr = device_allocator_->alloc(col_count * sizeof(int8_t));
909  device_allocator_->copyToDevice(
910  col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
911  }
912  const int8_t warp_count =
913  query_mem_desc.interleavedBins(ExecutorDeviceType::GPU) ? warp_size : 1;
914  const auto num_group_by_buffers =
915  getGroupByBuffersSize() - (query_mem_desc.hasVarlenOutput() ? 1 : 0);
916  for (size_t i = 0; i < num_group_by_buffers; i += step) {
917  if (output_columnar) {
918  init_columnar_group_by_buffer_on_device(
919  reinterpret_cast<int64_t*>(group_by_dev_buffer),
920  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
921  dev_group_by_buffers.entry_count,
922  query_mem_desc.getGroupbyColCount(),
923  col_count,
924  col_widths_dev_ptr,
925  /*need_padding = */ true,
926  query_mem_desc.hasKeylessHash(),
927  sizeof(int64_t),
928  block_size_x,
929  grid_size_x);
930  } else {
931  init_group_by_buffer_on_device(
932  reinterpret_cast<int64_t*>(group_by_dev_buffer),
933  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
934  dev_group_by_buffers.entry_count,
935  query_mem_desc.getGroupbyColCount(),
936  query_mem_desc.getEffectiveKeyWidth(),
937  query_mem_desc.getRowSize() / sizeof(int64_t),
938  query_mem_desc.hasKeylessHash(),
939  warp_count,
940  block_size_x,
941  grid_size_x);
942  }
943  group_by_dev_buffer += groups_buffer_size;
944  }
945  }
946  return dev_group_by_buffers;
947 #else
948  UNREACHABLE();
949  return {};
950 #endif
951 }
952 
953 GpuGroupByBuffers QueryMemoryInitializer::setupTableFunctionGpuBuffers(
954  const QueryMemoryDescriptor& query_mem_desc,
955  const int device_id,
956  const unsigned block_size_x,
957  const unsigned grid_size_x,
958  const bool zero_initialize_buffers) {
959  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
960  CHECK_GT(num_columns, size_t(0));
961  size_t total_group_by_buffer_size{0};
962  const auto col_slot_context = query_mem_desc.getColSlotContext();
963 
964  std::vector<size_t> col_byte_offsets;
965  col_byte_offsets.reserve(num_columns);
966 
967  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
968  const size_t col_width = col_slot_context.getSlotInfo(col_idx).logical_size;
969  size_t group_buffer_size = num_rows_ * col_width;
970  col_byte_offsets.emplace_back(total_group_by_buffer_size);
971  total_group_by_buffer_size =
972  align_to_int64(total_group_by_buffer_size + group_buffer_size);
973  }
974 
975  int8_t* dev_buffers_allocation{nullptr};
976  dev_buffers_allocation = device_allocator_->alloc(total_group_by_buffer_size);
977  CHECK(dev_buffers_allocation);
978  if (zero_initialize_buffers) {
979  device_allocator_->zeroDeviceMem(dev_buffers_allocation, total_group_by_buffer_size);
980  }
981 
982  auto dev_buffers_mem = dev_buffers_allocation;
983  std::vector<int8_t*> dev_buffers(num_columns);
984  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
985  dev_buffers[col_idx] = dev_buffers_allocation + col_byte_offsets[col_idx];
986  }
987  auto dev_ptrs = device_allocator_->alloc(num_columns * sizeof(CUdeviceptr));
988  device_allocator_->copyToDevice(
989  dev_ptrs, dev_buffers.data(), num_columns * sizeof(CUdeviceptr));
990 
991  return {dev_ptrs, dev_buffers_mem, (size_t)num_rows_};
992 }
993 
994 void QueryMemoryInitializer::copyFromTableFunctionGpuBuffers(
995  Data_Namespace::DataMgr* data_mgr,
996  const QueryMemoryDescriptor& query_mem_desc,
997  const size_t entry_count,
998  const GpuGroupByBuffers& gpu_group_by_buffers,
999  const int device_id,
1000  const unsigned block_size_x,
1001  const unsigned grid_size_x) {
1002  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
1003 
1004  int8_t* dev_buffer = gpu_group_by_buffers.data;
1005  int8_t* host_buffer = reinterpret_cast<int8_t*>(group_by_buffers_[0]);
1006 
1007  const size_t original_entry_count = gpu_group_by_buffers.entry_count;
1008  CHECK_LE(entry_count, original_entry_count);
1009  size_t output_device_col_offset{0};
1010  size_t output_host_col_offset{0};
1011 
1012  const auto col_slot_context = query_mem_desc.getColSlotContext();
1013 
1014  auto allocator = std::make_unique<CudaAllocator>(
1015  data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));
1016 
1017  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
1018  const size_t col_width = col_slot_context.getSlotInfo(col_idx).logical_size;
1019  const size_t output_device_col_size = original_entry_count * col_width;
1020  const size_t output_host_col_size = entry_count * col_width;
1021  allocator->copyFromDevice(host_buffer + output_host_col_offset,
1022  dev_buffer + output_device_col_offset,
1023  output_host_col_size);
1024  output_device_col_offset =
1025  align_to_int64(output_device_col_offset + output_device_col_size);
1026  output_host_col_offset =
1027  align_to_int64(output_host_col_offset + output_host_col_size);
1028  }
1029 }
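// Illustrative offset math for copyFromTableFunctionGpuBuffers() above,
// assuming original_entry_count == 100 rows reserved on the device,
// entry_count == 10 rows actually produced, and two columns of logical widths
// 4 and 8:
//   device offsets: col0 at 0, col1 at align_to_int64(100 * 4) = 400
//   host offsets:   col0 at 0, col1 at align_to_int64(10 * 4)  = 40
// and only entry_count * col_width bytes are copied per column, compacting
// the result while it is brought back to the host.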
1030 
1031 size_t QueryMemoryInitializer::computeNumberOfBuffers(
1032  const QueryMemoryDescriptor& query_mem_desc,
1033  const ExecutorDeviceType device_type,
1034  const Executor* executor) const {
1035  return device_type == ExecutorDeviceType::CPU
1036  ? 1
1037  : executor->blockSize() *
1038  (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
1039 }
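// Illustrative values for computeNumberOfBuffers() above, assuming
// blockSize() == 256 and gridSize() == 16: 1 buffer on CPU, 256 on GPU when
// blocksShareMemory() is true, and 256 * 16 == 4096 otherwise.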
1040 
1041 namespace {
1042 
1043 // in-place compaction of output buffer
1044 void compact_projection_buffer_for_cpu_columnar(
1045  const QueryMemoryDescriptor& query_mem_desc,
1046  int8_t* projection_buffer,
1047  const size_t projection_count) {
1048  // the first column (row indices) remains unchanged.
1049  CHECK(projection_count <= query_mem_desc.getEntryCount());
1050  constexpr size_t row_index_width = sizeof(int64_t);
1051  size_t buffer_offset1{projection_count * row_index_width};
1052  // other columns are actual non-lazy columns for the projection:
1053  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
1054  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
1055  auto column_proj_size =
1056  projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
1057  auto buffer_offset2 = query_mem_desc.getColOffInBytes(i);
1058  if (buffer_offset1 + column_proj_size >= buffer_offset2) {
1059  // overlapping
1060  std::memmove(projection_buffer + buffer_offset1,
1061  projection_buffer + buffer_offset2,
1062  column_proj_size);
1063  } else {
1064  std::memcpy(projection_buffer + buffer_offset1,
1065  projection_buffer + buffer_offset2,
1066  column_proj_size);
1067  }
1068  buffer_offset1 += align_to_int64(column_proj_size);
1069  }
1070  }
1071 }
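// Illustrative worked example for the compaction above, assuming
// getEntryCount() == 1000, projection_count == 10 and two non-lazy slots of
// padded widths 8 and 4: the first 10 * 8 == 80 bytes of row indices stay in
// place; slot 0 is moved from getColOffInBytes(0) down to offset 80
// (10 * 8 == 80 bytes), then slot 1 from getColOffInBytes(1) down to offset
// 80 + align_to_int64(80) == 160 (10 * 4 == 40 bytes). memmove() is used
// whenever the source and destination ranges may overlap.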
1072 
1073 } // namespace
1074 
1075 void QueryMemoryInitializer::compactProjectionBuffersCpu(
1076  const QueryMemoryDescriptor& query_mem_desc,
1077  const size_t projection_count) {
1078  const auto num_allocated_rows =
1079  std::min(projection_count, query_mem_desc.getEntryCount());
1080  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1081 
1082  // copy the results from the main buffer into projection_buffer
1083  compact_projection_buffer_for_cpu_columnar(
1084  query_mem_desc,
1085  reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
1086  num_allocated_rows);
1087 
1088  // update the entry count for the result set, and its underlying storage
1089  CHECK(!result_sets_.empty());
1090  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
1091 }
1092 
1093 void QueryMemoryInitializer::compactProjectionBuffersGpu(
1094  const QueryMemoryDescriptor& query_mem_desc,
1095  Data_Namespace::DataMgr* data_mgr,
1096  const GpuGroupByBuffers& gpu_group_by_buffers,
1097  const size_t projection_count,
1098  const int device_id) {
1099  // store total number of allocated rows:
1100  const auto num_allocated_rows =
1101  std::min(projection_count, query_mem_desc.getEntryCount());
1102 
1103  // copy the results from the main buffer into projection_buffer
1104  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1105  copy_projection_buffer_from_gpu_columnar(
1106  data_mgr,
1107  gpu_group_by_buffers,
1108  query_mem_desc,
1109  reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
1110  num_allocated_rows,
1111  device_id);
1112 
1113  // update the entry count for the result set, and its underlying storage
1114  CHECK(!result_sets_.empty());
1115  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
1116 }
1117 
1118 void QueryMemoryInitializer::copyGroupByBuffersFromGpu(
1119  DeviceAllocator& device_allocator,
1120  const QueryMemoryDescriptor& query_mem_desc,
1121  const size_t entry_count,
1122  const GpuGroupByBuffers& gpu_group_by_buffers,
1123  const RelAlgExecutionUnit* ra_exe_unit,
1124  const unsigned block_size_x,
1125  const unsigned grid_size_x,
1126  const int device_id,
1127  const bool prepend_index_buffer) const {
1128  const auto thread_count = block_size_x * grid_size_x;
1129 
1130  size_t total_buff_size{0};
1131  if (ra_exe_unit && query_mem_desc.useStreamingTopN()) {
1132  const size_t n = ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit;
1133  total_buff_size =
1134  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
1135  } else {
1136  total_buff_size =
1137  query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
1138  }
1139  copy_group_by_buffers_from_gpu(device_allocator,
1140  group_by_buffers_,
1141  total_buff_size,
1142  gpu_group_by_buffers.data,
1143  query_mem_desc,
1144  block_size_x,
1145  grid_size_x,
1146  device_id,
1147  prepend_index_buffer,
1148  query_mem_desc.hasVarlenOutput());
1149 }
1150 
1151 void QueryMemoryInitializer::applyStreamingTopNOffsetCpu(
1152  const QueryMemoryDescriptor& query_mem_desc,
1153  const RelAlgExecutionUnit& ra_exe_unit) {
1154  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1155  CHECK_EQ(group_by_buffers_.size(), buffer_start_idx + 1);
1156 
1157  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
1158  group_by_buffers_[buffer_start_idx],
1159  query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
1160  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit,
1161  1);
1162  CHECK_EQ(rows_copy.size(),
1163  query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
1164  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
1165 }
1166 
1167 void QueryMemoryInitializer::applyStreamingTopNOffsetGpu(
1168  Data_Namespace::DataMgr* data_mgr,
1169  const QueryMemoryDescriptor& query_mem_desc,
1170  const GpuGroupByBuffers& gpu_group_by_buffers,
1171  const RelAlgExecutionUnit& ra_exe_unit,
1172  const unsigned total_thread_count,
1173  const int device_id) {
1174 #ifdef HAVE_CUDA
1176  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1177 
1178  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
1179  data_mgr,
1180  reinterpret_cast<int64_t*>(gpu_group_by_buffers.data),
1181  ra_exe_unit,
1182  query_mem_desc,
1183  total_thread_count,
1184  device_id);
1185  CHECK_EQ(
1186  rows_copy.size(),
1187  static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
1188  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
1189 #else
1190  UNREACHABLE();
1191 #endif
1192 }
1193 
1194 std::shared_ptr<VarlenOutputInfo> QueryMemoryInitializer::getVarlenOutputInfo() {
1195  if (varlen_output_info_) {
1196  return varlen_output_info_;
1197  }
1198 
1199  // shared_ptr so that both the ResultSet and QMI can hold on to the varlen info object
1200  // and update it as needed
1201  varlen_output_info_ = std::make_shared<VarlenOutputInfo>(VarlenOutputInfo{
1202  static_cast<int64_t>(varlen_output_buffer_), varlen_output_buffer_host_ptr_});
1203  return varlen_output_info_;
1204 }