QueryMemoryInitializer.cpp
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "QueryMemoryInitializer.h"
18 
20 #include "Execute.h"
21 #include "GpuInitGroups.h"
22 #include "GpuMemUtils.h"
23 #include "Logger/Logger.h"
25 #include "ResultSet.h"
26 #include "StreamingTopN.h"
27 
28 #include <Shared/checked_alloc.h>
29 
30 // 8 GB, the limit of perfect hash group by under normal conditions
31 int64_t g_bitmap_memory_limit{8LL * 1000 * 1000 * 1000};
32 
33 namespace {
34 
35 void check_total_bitmap_memory(const QueryMemoryDescriptor& query_mem_desc) {
36  const int32_t groups_buffer_entry_count = query_mem_desc.getEntryCount();
37  checked_int64_t total_bytes_per_group = 0;
38  const size_t num_count_distinct_descs =
39  query_mem_desc.getCountDistinctDescriptorsSize();
40  for (size_t i = 0; i < num_count_distinct_descs; i++) {
41  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
42  if (count_distinct_desc.impl_type_ != CountDistinctImplType::Bitmap) {
43  continue;
44  }
45  total_bytes_per_group += count_distinct_desc.bitmapPaddedSizeBytes();
46  }
47  int64_t total_bytes{0};
48  // Using OutOfHostMemory until we can verify that SlabTooBig would also be properly
49  // caught
50  try {
51  total_bytes = static_cast<int64_t>(total_bytes_per_group * groups_buffer_entry_count);
52  } catch (...) {
53  // Absurd amount of memory, merely computing the number of bits overflows int64_t.
54  // Don't bother to report the real amount, this is unlikely to ever happen.
55  throw OutOfHostMemory(std::numeric_limits<int64_t>::max() / 8);
56  }
57  if (total_bytes >= g_bitmap_memory_limit) {
58  throw OutOfHostMemory(total_bytes);
59  }
60 }
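// Illustrative arithmetic (not part of the original source, values are hypothetical):
// if each group needs a 1 MB padded count-distinct bitmap and the query has 10,000
// entries, the check above computes roughly 1,000,000 B * 10,000 = 10 GB, which
// exceeds the 8 GB g_bitmap_memory_limit and throws OutOfHostMemory up front,
// before any buffer is actually allocated.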
61 
62 int64_t* alloc_group_by_buffer(const size_t numBytes,
63  RenderAllocatorMap* render_allocator_map,
64  const size_t thread_idx,
65  RowSetMemoryOwner* mem_owner) {
66  if (render_allocator_map) {
67  // NOTE(adb): If we got here, we are performing an in-situ rendering query and are not
68  // using CUDA buffers. Therefore we need to allocate result set storage using CPU
69  // memory.
70  const auto gpu_idx = 0; // Only 1 GPU supported in CUDA-disabled rendering mode
71  auto render_allocator_ptr = render_allocator_map->getRenderAllocator(gpu_idx);
72  return reinterpret_cast<int64_t*>(render_allocator_ptr->alloc(numBytes));
73  } else {
74  return reinterpret_cast<int64_t*>(mem_owner->allocate(numBytes, thread_idx));
75  }
76 }
77 
78 inline int64_t get_consistent_frag_size(const std::vector<uint64_t>& frag_offsets) {
79  if (frag_offsets.size() < 2) {
80  return int64_t(-1);
81  }
82  const auto frag_size = frag_offsets[1] - frag_offsets[0];
83  for (size_t i = 2; i < frag_offsets.size(); ++i) {
84  const auto curr_size = frag_offsets[i] - frag_offsets[i - 1];
85  if (curr_size != frag_size) {
86  return int64_t(-1);
87  }
88  }
89  return !frag_size ? std::numeric_limits<int64_t>::max()
90  : static_cast<int64_t>(frag_size);
91 }
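// Illustrative behavior of the helper above (offsets are hypothetical):
//   frag_offsets {0, 100, 200, 300} -> every fragment spans 100 rows, returns 100
//   frag_offsets {0, 100, 250}      -> inconsistent spans, returns -1
//   frag_offsets {0, 0, 0}          -> zero-sized fragments, returns INT64_MAX
//   fewer than two offsets          -> returns -1 (no consistent size can be inferred)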
92 
93 inline std::vector<int64_t> get_consistent_frags_sizes(
94  const std::vector<std::vector<uint64_t>>& frag_offsets) {
95  if (frag_offsets.empty()) {
96  return {};
97  }
98  std::vector<int64_t> frag_sizes;
99  for (size_t tab_idx = 0; tab_idx < frag_offsets[0].size(); ++tab_idx) {
100  std::vector<uint64_t> tab_offs;
101  for (auto& offsets : frag_offsets) {
102  tab_offs.push_back(offsets[tab_idx]);
103  }
104  frag_sizes.push_back(get_consistent_frag_size(tab_offs));
105  }
106  return frag_sizes;
107 }
108 
109 inline std::vector<int64_t> get_consistent_frags_sizes(
110  const std::vector<Analyzer::Expr*>& target_exprs,
111  const std::vector<int64_t>& table_frag_sizes) {
112  std::vector<int64_t> col_frag_sizes;
113  for (auto expr : target_exprs) {
114  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
115  if (col_var->get_rte_idx() < 0) {
116  CHECK_EQ(-1, col_var->get_rte_idx());
117  col_frag_sizes.push_back(int64_t(-1));
118  } else {
119  col_frag_sizes.push_back(table_frag_sizes[col_var->get_rte_idx()]);
120  }
121  } else {
122  col_frag_sizes.push_back(int64_t(-1));
123  }
124  }
125  return col_frag_sizes;
126 }
127 
128 inline std::vector<std::vector<int64_t>> get_col_frag_offsets(
129  const std::vector<Analyzer::Expr*>& target_exprs,
130  const std::vector<std::vector<uint64_t>>& table_frag_offsets) {
131  std::vector<std::vector<int64_t>> col_frag_offsets;
132  for (auto& table_offsets : table_frag_offsets) {
133  std::vector<int64_t> col_offsets;
134  for (auto expr : target_exprs) {
135  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
136  if (col_var->get_rte_idx() < 0) {
137  CHECK_EQ(-1, col_var->get_rte_idx());
138  col_offsets.push_back(int64_t(-1));
139  } else {
140  CHECK_LT(static_cast<size_t>(col_var->get_rte_idx()), table_offsets.size());
141  col_offsets.push_back(
142  static_cast<int64_t>(table_offsets[col_var->get_rte_idx()]));
143  }
144  } else {
145  col_offsets.push_back(int64_t(-1));
146  }
147  }
148  col_frag_offsets.push_back(col_offsets);
149  }
150  return col_frag_offsets;
151 }
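// Illustrative sketch for the helper above (hypothetical shapes): with two input
// fragments whose per-table offsets are {0} and {1000}, and target expressions
// {ColumnVar(rte 0), literal}, it produces {{0, -1}, {1000, -1}}: one row of
// per-target offsets per fragment, with -1 marking targets that are not column
// references.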
152 
153 } // namespace
154 
155 // Row-based execution constructor
156 QueryMemoryInitializer::QueryMemoryInitializer(
157  const RelAlgExecutionUnit& ra_exe_unit,
158  const QueryMemoryDescriptor& query_mem_desc,
159  const int device_id,
160  const ExecutorDeviceType device_type,
161  const ExecutorDispatchMode dispatch_mode,
162  const bool output_columnar,
163  const bool sort_on_gpu,
164  const int64_t num_rows,
165  const std::vector<std::vector<const int8_t*>>& col_buffers,
166  const std::vector<std::vector<uint64_t>>& frag_offsets,
167  RenderAllocatorMap* render_allocator_map,
168  RenderInfo* render_info,
169  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
170  DeviceAllocator* device_allocator,
171  const size_t thread_idx,
172  const Executor* executor)
173  : num_rows_(num_rows)
174  , row_set_mem_owner_(row_set_mem_owner)
175  , init_agg_vals_(executor->plan_state_->init_agg_vals_)
176  , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
177  , varlen_output_buffer_(0)
178  , varlen_output_buffer_host_ptr_(nullptr)
179  , count_distinct_bitmap_mem_(0)
180  , count_distinct_bitmap_mem_bytes_(0)
181  , count_distinct_bitmap_crt_ptr_(nullptr)
182  , count_distinct_bitmap_host_mem_(nullptr)
183  , device_allocator_(device_allocator)
184  , thread_idx_(thread_idx) {
185  CHECK(!sort_on_gpu || output_columnar);
186 
187  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
188  if (consistent_frag_sizes.empty()) {
189  // No fragments in the input, no underlying buffers will be needed.
190  return;
191  }
192  if (!ra_exe_unit.use_bump_allocator) {
193  check_total_bitmap_memory(query_mem_desc);
194  }
195  if (device_type == ExecutorDeviceType::GPU) {
196  allocateCountDistinctGpuMem(query_mem_desc);
197  }
198 
199  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
200  allocateCountDistinctBuffers(query_mem_desc, false, executor);
201  allocateTDigests(query_mem_desc, false, executor);
202  if (render_info && render_info->useCudaBuffers()) {
203  return;
204  }
205  }
206 
207  if (ra_exe_unit.estimator) {
208  return;
209  }
210 
211  const auto thread_count = device_type == ExecutorDeviceType::GPU
212  ? executor->blockSize() * executor->gridSize()
213  : 1;
214 
215  size_t group_buffer_size{0};
216  if (ra_exe_unit.use_bump_allocator) {
217  // For kernel per fragment execution, just allocate a buffer equivalent to the size of
218  // the fragment
219  if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
220  group_buffer_size = num_rows * query_mem_desc.getRowSize();
221  } else {
222  // otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size
223  group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
224  }
225  } else {
226  group_buffer_size =
227  query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
228  }
229  CHECK_GE(group_buffer_size, size_t(0));
230 
231  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
232  int64_t* group_by_buffer_template{nullptr};
233  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
234  group_by_buffer_template = reinterpret_cast<int64_t*>(
235  row_set_mem_owner_->allocate(group_buffer_size, thread_idx_));
236  initGroupByBuffer(group_by_buffer_template,
237  ra_exe_unit,
238  query_mem_desc,
239  device_type,
240  output_columnar,
241  executor);
242  }
243 
244  if (query_mem_desc.interleavedBins(device_type)) {
245  CHECK(query_mem_desc.hasKeylessHash());
246  }
247 
248  const auto step = device_type == ExecutorDeviceType::GPU &&
249  query_mem_desc.threadsShareMemory() &&
250  query_mem_desc.isGroupBy()
251  ? executor->blockSize()
252  : size_t(1);
253  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
254  query_mem_desc.hasKeylessHash()
255  ? query_mem_desc.getEntryCount()
256  : size_t(0);
257  const auto actual_group_buffer_size =
258  group_buffer_size + index_buffer_qw * sizeof(int64_t);
259  CHECK_GE(actual_group_buffer_size, group_buffer_size);
260 
261  if (query_mem_desc.hasVarlenOutput()) {
262  const auto varlen_buffer_elem_size_opt = query_mem_desc.varlenOutputBufferElemSize();
263  CHECK(varlen_buffer_elem_size_opt); // TODO(adb): relax
264  auto varlen_output_buffer = reinterpret_cast<int64_t*>(row_set_mem_owner_->allocate(
265  query_mem_desc.getEntryCount() * varlen_buffer_elem_size_opt.value()));
266  num_buffers_ += 1;
267  group_by_buffers_.push_back(varlen_output_buffer);
268  }
269 
270  for (size_t i = 0; i < group_buffers_count; i += step) {
271  auto group_by_buffer = alloc_group_by_buffer(actual_group_buffer_size,
272  render_allocator_map,
273  thread_idx_,
274  row_set_mem_owner_.get());
275  if (!query_mem_desc.lazyInitGroups(device_type)) {
276  if (group_by_buffer_template) {
277  memcpy(group_by_buffer + index_buffer_qw,
278  group_by_buffer_template,
279  group_buffer_size);
280  } else {
281  initGroupByBuffer(group_by_buffer + index_buffer_qw,
282  ra_exe_unit,
283  query_mem_desc,
284  device_type,
285  output_columnar,
286  executor);
287  }
288  }
289  group_by_buffers_.push_back(group_by_buffer);
290  for (size_t j = 1; j < step; ++j) {
291  group_by_buffers_.push_back(nullptr);
292  }
293  const auto column_frag_offsets =
294  get_col_frag_offsets(ra_exe_unit.target_exprs, frag_offsets);
295  const auto column_frag_sizes =
296  get_consistent_frags_sizes(ra_exe_unit.target_exprs, consistent_frag_sizes);
297  result_sets_.emplace_back(
298  new ResultSet(target_exprs_to_infos(ra_exe_unit.target_exprs, query_mem_desc),
299  executor->getColLazyFetchInfo(ra_exe_unit.target_exprs),
300  col_buffers,
301  column_frag_offsets,
302  column_frag_sizes,
303  device_type,
304  device_id,
305  query_mem_desc,
306  row_set_mem_owner_,
307  executor->getCatalog(),
308  executor->blockSize(),
309  executor->gridSize()));
310  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
311  executor->plan_state_->init_agg_vals_,
312  getVarlenOutputInfo());
313  for (size_t j = 1; j < step; ++j) {
314  result_sets_.emplace_back(nullptr);
315  }
316  }
317 }
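// Layout note (summary of the loop above, not part of the original source): each
// allocated buffer is an optional index prefix of index_buffer_qw int64 slots
// (non-zero only for keyless-hash GPU sort) followed by group_buffer_size bytes of
// group-by storage. When GPU threads share memory, only every blockSize()-th entry of
// group_by_buffers_ owns an allocation; the intervening slots are filled with nullptr
// so per-block indexing still lines up.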
318 
319 // Table functions execution constructor
320 QueryMemoryInitializer::QueryMemoryInitializer(
321  const TableFunctionExecutionUnit& exe_unit,
322  const QueryMemoryDescriptor& query_mem_desc,
323  const int device_id,
324  const ExecutorDeviceType device_type,
325  const int64_t num_rows,
326  const std::vector<std::vector<const int8_t*>>& col_buffers,
327  const std::vector<std::vector<uint64_t>>& frag_offsets,
328  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
329  DeviceAllocator* device_allocator,
330  const Executor* executor)
331  : num_rows_(num_rows)
332  , row_set_mem_owner_(row_set_mem_owner)
333  , init_agg_vals_(init_agg_val_vec(exe_unit.target_exprs, {}, query_mem_desc))
334  , num_buffers_(1)
335  , varlen_output_buffer_(0)
336  , varlen_output_buffer_host_ptr_(nullptr)
337  , count_distinct_bitmap_mem_(0)
338  , count_distinct_bitmap_mem_bytes_(0)
339  , count_distinct_bitmap_crt_ptr_(nullptr)
340  , count_distinct_bitmap_host_mem_(nullptr)
341  , device_allocator_(device_allocator)
342  , thread_idx_(0) {
343  // Table functions output columnar, basically treat this as a projection
344  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
345  if (consistent_frag_sizes.empty()) {
346  // No fragments in the input, no underlying buffers will be needed.
347  return;
348  }
349 
350  size_t group_buffer_size{0};
351  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
352  group_buffer_size = num_rows_ * num_columns * sizeof(int64_t);
353  CHECK_GE(group_buffer_size, size_t(0));
354 
355  const auto index_buffer_qw =
356  device_type == ExecutorDeviceType::GPU && query_mem_desc.hasKeylessHash()
357  ? query_mem_desc.getEntryCount()
358  : size_t(0);
359  const auto actual_group_buffer_size =
360  group_buffer_size + index_buffer_qw * sizeof(int64_t);
361  CHECK_GE(actual_group_buffer_size, group_buffer_size);
362 
363  CHECK_EQ(num_buffers_, size_t(1));
364  auto group_by_buffer = alloc_group_by_buffer(
365  actual_group_buffer_size, nullptr, thread_idx_, row_set_mem_owner.get());
366  if (!query_mem_desc.lazyInitGroups(device_type)) {
367  initColumnarGroups(
368  query_mem_desc, group_by_buffer + index_buffer_qw, init_agg_vals_, executor);
369  }
370  group_by_buffers_.push_back(group_by_buffer);
371 
372  const auto column_frag_offsets =
373  get_col_frag_offsets(exe_unit.target_exprs, frag_offsets);
374  const auto column_frag_sizes =
375  get_consistent_frags_sizes(exe_unit.target_exprs, consistent_frag_sizes);
376  result_sets_.emplace_back(
377  new ResultSet(target_exprs_to_infos(exe_unit.target_exprs, query_mem_desc),
378  /*col_lazy_fetch_info=*/{},
379  col_buffers,
380  column_frag_offsets,
381  column_frag_sizes,
382  device_type,
383  device_id,
384  query_mem_desc,
385  row_set_mem_owner_,
386  executor->getCatalog(),
387  executor->blockSize(),
388  executor->gridSize()));
389  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
390  init_agg_vals_);
391 }
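// Illustrative sizing for the constructor above (hypothetical values): a table
// function producing 3 output columns for num_rows_ = 1,000 reserves
// 3 * 1,000 * sizeof(int64_t) = 24,000 bytes in a single columnar group-by buffer,
// since every output slot is laid out as an int64_t here.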
392 
393 void QueryMemoryInitializer::initGroupByBuffer(
394  int64_t* buffer,
395  const RelAlgExecutionUnit& ra_exe_unit,
396  const QueryMemoryDescriptor& query_mem_desc,
397  const ExecutorDeviceType device_type,
398  const bool output_columnar,
399  const Executor* executor) {
400  if (output_columnar) {
401  initColumnarGroups(query_mem_desc, buffer, init_agg_vals_, executor);
402  } else {
403  auto rows_ptr = buffer;
404  auto actual_entry_count = query_mem_desc.getEntryCount();
405  const auto thread_count = device_type == ExecutorDeviceType::GPU
406  ? executor->blockSize() * executor->gridSize()
407  : 1;
408  auto warp_size =
409  query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
410  if (query_mem_desc.useStreamingTopN()) {
411  const auto node_count_size = thread_count * sizeof(int64_t);
412  memset(rows_ptr, 0, node_count_size);
413  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
414  const auto rows_offset = streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
415  memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
416  rows_ptr += rows_offset / sizeof(int64_t);
417  actual_entry_count = n * thread_count;
418  warp_size = 1;
419  }
420  initRowGroups(query_mem_desc,
421  rows_ptr,
422  init_agg_vals_,
423  actual_entry_count,
424  warp_size,
425  executor);
426  }
427 }
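// Streaming top-N layout handled above (summary, not part of the original source):
// the buffer starts with one zeroed int64_t node count per thread, followed by the
// per-thread heaps (filled with -1 bytes); only after rows_offset do the actual result
// rows begin, and n = sort offset + limit heap entries are reserved per thread.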
428 
429 void QueryMemoryInitializer::initRowGroups(const QueryMemoryDescriptor& query_mem_desc,
430  int64_t* groups_buffer,
431  const std::vector<int64_t>& init_vals,
432  const int32_t groups_buffer_entry_count,
433  const size_t warp_size,
434  const Executor* executor) {
435  const size_t key_count{query_mem_desc.getGroupbyColCount()};
436  const size_t row_size{query_mem_desc.getRowSize()};
437  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};
438 
439  auto agg_bitmap_size = allocateCountDistinctBuffers(query_mem_desc, true, executor);
440  auto quantile_params = allocateTDigests(query_mem_desc, true, executor);
441  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
442 
443  const auto query_mem_desc_fixedup =
444  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);
445 
446  auto const is_true = [](auto const& x) { return static_cast<bool>(x); };
447  // When no slot needs COUNT DISTINCT / APPROX_COUNT_DISTINCT / APPROX_QUANTILE state,
448  // initialize a single sample row and copy it; otherwise fall back to the default path.
449  if (!std::any_of(agg_bitmap_size.begin(), agg_bitmap_size.end(), is_true) &&
450  !std::any_of(quantile_params.begin(), quantile_params.end(), is_true) &&
451  g_optimize_row_initialization) {
452  std::vector<int8_t> sample_row(row_size - col_base_off);
453 
454  initColumnsPerRow(query_mem_desc_fixedup,
455  sample_row.data(),
456  init_vals,
457  agg_bitmap_size,
458  quantile_params);
459 
460  if (query_mem_desc.hasKeylessHash()) {
461  CHECK(warp_size >= 1);
462  CHECK(key_count == 1 || warp_size == 1);
463  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
464  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
465  ++bin, buffer_ptr += row_size) {
466  memcpy(buffer_ptr + col_base_off, sample_row.data(), sample_row.size());
467  }
468  }
469  return;
470  }
471 
472  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
473  ++bin, buffer_ptr += row_size) {
474  memcpy(buffer_ptr + col_base_off, sample_row.data(), sample_row.size());
475  fill_empty_key(
476  buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
477  }
478  } else {
479  if (query_mem_desc.hasKeylessHash()) {
480  CHECK(warp_size >= 1);
481  CHECK(key_count == 1 || warp_size == 1);
482  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
483  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
484  ++bin, buffer_ptr += row_size) {
485  initColumnsPerRow(query_mem_desc_fixedup,
486  &buffer_ptr[col_base_off],
487  init_vals,
488  agg_bitmap_size,
489  quantile_params);
490  }
491  }
492  return;
493  }
494 
495  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
496  ++bin, buffer_ptr += row_size) {
497  fill_empty_key(
498  buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
499  initColumnsPerRow(query_mem_desc_fixedup,
500  &buffer_ptr[col_base_off],
501  init_vals,
502  agg_bitmap_size,
503  quantile_params);
504  }
505  }
506 }
507 
508 namespace {
509 
510 template <typename T>
511 int8_t* initColumnarBuffer(T* buffer_ptr, const T init_val, const uint32_t entry_count) {
512  static_assert(sizeof(T) <= sizeof(int64_t), "Unsupported template type");
513  for (uint32_t i = 0; i < entry_count; ++i) {
514  buffer_ptr[i] = init_val;
515  }
516  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
517 }
518 
519 } // namespace
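// Usage note for the helper above (illustrative): initColumnarBuffer<int32_t>(ptr, 0, 100)
// writes 100 zero-initialized 4-byte slots and returns the address just past them,
// which is why initColumnarGroups below can chain calls while walking one column at a
// time.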
520 
521 void QueryMemoryInitializer::initColumnarGroups(
522  const QueryMemoryDescriptor& query_mem_desc,
523  int64_t* groups_buffer,
524  const std::vector<int64_t>& init_vals,
525  const Executor* executor) {
526  CHECK(groups_buffer);
527  for (const auto target_expr : executor->plan_state_->target_exprs_) {
528  const auto agg_info = get_target_info(target_expr, g_bigint_count);
529  CHECK(!is_distinct_target(agg_info));
530  }
531  const int32_t agg_col_count = query_mem_desc.getSlotCount();
532  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
533 
534  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
535  if (!query_mem_desc.hasKeylessHash()) {
536  const size_t key_count{query_mem_desc.getGroupbyColCount()};
537  for (size_t i = 0; i < key_count; ++i) {
538  buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
539  EMPTY_KEY_64,
540  groups_buffer_entry_count);
541  }
542  }
543 
544  if (query_mem_desc.getQueryDescriptionType() != QueryDescriptionType::Projection) {
545  // initializing all aggregate columns:
546  int32_t init_val_idx = 0;
547  for (int32_t i = 0; i < agg_col_count; ++i) {
548  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
549  CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
550  switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
551  case 1:
552  buffer_ptr = initColumnarBuffer<int8_t>(
553  buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
554  break;
555  case 2:
556  buffer_ptr =
557  initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
558  init_vals[init_val_idx++],
559  groups_buffer_entry_count);
560  break;
561  case 4:
562  buffer_ptr =
563  initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
564  init_vals[init_val_idx++],
565  groups_buffer_entry_count);
566  break;
567  case 8:
568  buffer_ptr =
569  initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
570  init_vals[init_val_idx++],
571  groups_buffer_entry_count);
572  break;
573  case 0:
574  break;
575  default:
576  CHECK(false);
577  }
578 
579  buffer_ptr = align_to_int64(buffer_ptr);
580  }
581  }
582  }
583 }
584 
587  int8_t* row_ptr,
588  const std::vector<int64_t>& init_vals,
589  const std::vector<int64_t>& bitmap_sizes,
590  const std::vector<QuantileParam>& quantile_params) {
591  int8_t* col_ptr = row_ptr;
592  size_t init_vec_idx = 0;
593  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
594  col_ptr += query_mem_desc.getNextColOffInBytesRowOnly(col_ptr, col_idx++)) {
595  const int64_t bm_sz{bitmap_sizes[col_idx]};
596  int64_t init_val{0};
597  if (bm_sz && query_mem_desc.isGroupBy()) {
598  // COUNT DISTINCT / APPROX_COUNT_DISTINCT
599  CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
600  sizeof(int64_t));
601  init_val =
602  bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
603  ++init_vec_idx;
604  } else if (query_mem_desc.isGroupBy() && quantile_params[col_idx]) {
605  auto const q = *quantile_params[col_idx];
606  // allocate for APPROX_QUANTILE only when slot is used
607  init_val = reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest(q));
608  ++init_vec_idx;
609  } else {
610  if (query_mem_desc.getPaddedSlotWidthBytes(col_idx) > 0) {
611  CHECK_LT(init_vec_idx, init_vals.size());
612  init_val = init_vals[init_vec_idx++];
613  }
614  }
615  switch (query_mem_desc.getPaddedSlotWidthBytes(col_idx)) {
616  case 1:
617  *col_ptr = static_cast<int8_t>(init_val);
618  break;
619  case 2:
620  *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
621  break;
622  case 4:
623  *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
624  break;
625  case 8:
626  *reinterpret_cast<int64_t*>(col_ptr) = init_val;
627  break;
628  case 0:
629  continue;
630  default:
631  CHECK(false);
632  }
633  }
634 }
635 
636 void QueryMemoryInitializer::allocateCountDistinctGpuMem(
637  const QueryMemoryDescriptor& query_mem_desc) {
638  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
639  return;
640  }
641  CHECK(device_allocator_);
642 
643  size_t total_bytes_per_entry{0};
644  const size_t num_count_distinct_descs =
645  query_mem_desc.getCountDistinctDescriptorsSize();
646  for (size_t i = 0; i < num_count_distinct_descs; i++) {
647  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
648  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
649  continue;
650  }
651  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
652  total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
653  }
654 
655  count_distinct_bitmap_mem_bytes_ =
656  total_bytes_per_entry * query_mem_desc.getEntryCount();
657  count_distinct_bitmap_mem_ = reinterpret_cast<CUdeviceptr>(
658  device_allocator_->alloc(count_distinct_bitmap_mem_bytes_));
659  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(count_distinct_bitmap_mem_),
660  count_distinct_bitmap_mem_bytes_);
661 
662  count_distinct_bitmap_crt_ptr_ = count_distinct_bitmap_host_mem_ =
663  row_set_mem_owner_->allocateCountDistinctBuffer(count_distinct_bitmap_mem_bytes_, thread_idx_);
664 }
665 
666 // deferred is true for group by queries; initGroups will allocate a bitmap
667 // for each group slot
668 std::vector<int64_t> QueryMemoryInitializer::allocateCountDistinctBuffers(
669  const QueryMemoryDescriptor& query_mem_desc,
670  const bool deferred,
671  const Executor* executor) {
672  const size_t agg_col_count{query_mem_desc.getSlotCount()};
673  std::vector<int64_t> agg_bitmap_size(deferred ? agg_col_count : 0);
674 
675  CHECK_GE(agg_col_count, executor->plan_state_->target_exprs_.size());
676  for (size_t target_idx = 0; target_idx < executor->plan_state_->target_exprs_.size();
677  ++target_idx) {
678  const auto target_expr = executor->plan_state_->target_exprs_[target_idx];
679  const auto agg_info = get_target_info(target_expr, g_bigint_count);
680  if (is_distinct_target(agg_info)) {
681  CHECK(agg_info.is_agg &&
682  (agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT));
683  CHECK(!agg_info.sql_type.is_varlen());
684 
685  const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
686  CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);
687 
688  CHECK_EQ(static_cast<size_t>(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)),
689  sizeof(int64_t));
690  const auto& count_distinct_desc =
691  query_mem_desc.getCountDistinctDescriptor(target_idx);
692  CHECK(count_distinct_desc.impl_type_ != CountDistinctImplType::Invalid);
693  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
694  const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
695  if (deferred) {
696  agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
697  } else {
698  init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
699  }
700  } else {
701  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::StdSet);
702  if (deferred) {
703  agg_bitmap_size[agg_col_idx] = -1;
704  } else {
705  init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
706  }
707  }
708  }
709  }
710 
711  return agg_bitmap_size;
712 }
713 
714 int64_t QueryMemoryInitializer::allocateCountDistinctBitmap(const size_t bitmap_byte_sz) {
715  if (count_distinct_bitmap_host_mem_) {
716  CHECK(count_distinct_bitmap_crt_ptr_);
717  auto ptr = count_distinct_bitmap_crt_ptr_;
718  count_distinct_bitmap_crt_ptr_ += bitmap_byte_sz;
719  row_set_mem_owner_->addCountDistinctBuffer(
720  ptr, bitmap_byte_sz, /*physical_buffer=*/false);
721  return reinterpret_cast<int64_t>(ptr);
722  }
723  return reinterpret_cast<int64_t>(
724  row_set_mem_owner_->allocateCountDistinctBuffer(bitmap_byte_sz, thread_idx_));
725 }
726 
727 int64_t QueryMemoryInitializer::allocateCountDistinctSet() {
728  auto count_distinct_set = new std::set<int64_t>();
729  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
730  return reinterpret_cast<int64_t>(count_distinct_set);
731 }
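// Note on the two helpers above (summary, not part of the original source): both
// return the storage address cast to int64_t so it can be stored directly as an
// aggregate init value or group slot, and both register ownership with
// row_set_mem_owner_ so the memory outlives this initializer.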
732 
733 std::vector<QueryMemoryInitializer::QuantileParam>
734 QueryMemoryInitializer::allocateTDigests(const QueryMemoryDescriptor& query_mem_desc,
735  const bool deferred,
736  const Executor* executor) {
737  size_t const slot_count = query_mem_desc.getSlotCount();
738  size_t const ntargets = executor->plan_state_->target_exprs_.size();
739  CHECK_GE(slot_count, ntargets);
740  std::vector<QuantileParam> quantile_params(deferred ? slot_count : 0);
741 
742  for (size_t target_idx = 0; target_idx < ntargets; ++target_idx) {
743  auto const target_expr = executor->plan_state_->target_exprs_[target_idx];
744  if (auto const agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr)) {
745  if (agg_expr->get_aggtype() == kAPPROX_QUANTILE) {
746  size_t const agg_col_idx =
747  query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
748  CHECK_LT(agg_col_idx, slot_count);
749  CHECK_EQ(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx),
750  static_cast<int8_t>(sizeof(int64_t)));
751  auto const q = agg_expr->get_arg1()->get_constval().doubleval;
752  if (deferred) {
753  quantile_params[agg_col_idx] = q;
754  } else {
755  // allocate for APPROX_QUANTILE only when slot is used
756  init_agg_vals_[agg_col_idx] =
757  reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest(q));
758  }
759  }
760  }
761  }
762  return quantile_params;
763 }
764 
765 GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer(
766  const QueryMemoryDescriptor& query_mem_desc,
767  const int8_t* init_agg_vals_dev_ptr,
768  const size_t n,
769  const int device_id,
770  const unsigned block_size_x,
771  const unsigned grid_size_x) {
772 #ifdef HAVE_CUDA
774  const auto thread_count = block_size_x * grid_size_x;
775  const auto total_buff_size =
776  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
777  int8_t* dev_buffer = device_allocator_->alloc(total_buff_size);
778 
779  std::vector<int8_t*> dev_buffers(thread_count);
780 
781  for (size_t i = 0; i < thread_count; ++i) {
782  dev_buffers[i] = dev_buffer;
783  }
784 
785  auto dev_ptr = device_allocator_->alloc(thread_count * sizeof(int8_t*));
786  device_allocator_->copyToDevice(
787  dev_ptr, dev_buffers.data(), thread_count * sizeof(int8_t*));
788 
790 
791  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(dev_buffer),
792  thread_count * sizeof(int64_t));
793 
794  device_allocator_->setDeviceMem(
795  reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
796  (unsigned char)-1,
797  thread_count * n * sizeof(int64_t));
798 
799  init_group_by_buffer_on_device(
800  reinterpret_cast<int64_t*>(
801  dev_buffer + streaming_top_n::get_rows_offset_of_heaps(n, thread_count)),
802  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
803  n * thread_count,
804  query_mem_desc.getGroupbyColCount(),
805  query_mem_desc.getEffectiveKeyWidth(),
806  query_mem_desc.getRowSize() / sizeof(int64_t),
807  query_mem_desc.hasKeylessHash(),
808  1,
809  block_size_x,
810  grid_size_x);
811 
812  return {dev_ptr, dev_buffer};
813 #else
814  UNREACHABLE();
815  return {};
816 #endif
817 }
818 
819 GpuGroupByBuffers QueryMemoryInitializer::createAndInitializeGroupByBufferGpu(
820  const RelAlgExecutionUnit& ra_exe_unit,
821  const QueryMemoryDescriptor& query_mem_desc,
822  const int8_t* init_agg_vals_dev_ptr,
823  const int device_id,
824  const ExecutorDispatchMode dispatch_mode,
825  const unsigned block_size_x,
826  const unsigned grid_size_x,
827  const int8_t warp_size,
828  const bool can_sort_on_gpu,
829  const bool output_columnar,
830  RenderAllocator* render_allocator) {
831 #ifdef HAVE_CUDA
832  if (query_mem_desc.useStreamingTopN()) {
833  if (render_allocator) {
835  }
836  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
837  CHECK(!output_columnar);
838 
839  return prepareTopNHeapsDevBuffer(
840  query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
841  }
842 
843  auto dev_group_by_buffers =
844  create_dev_group_by_buffers(device_allocator_,
845  group_by_buffers_,
846  query_mem_desc,
847  block_size_x,
848  grid_size_x,
849  device_id,
850  dispatch_mode,
851  num_rows_,
852  can_sort_on_gpu,
853  false,
854  ra_exe_unit.use_bump_allocator,
855  query_mem_desc.hasVarlenOutput(),
856  render_allocator);
857  if (query_mem_desc.hasVarlenOutput()) {
858  CHECK(dev_group_by_buffers.varlen_output_buffer);
859  varlen_output_buffer_ =
860  reinterpret_cast<CUdeviceptr>(dev_group_by_buffers.varlen_output_buffer);
861  CHECK(query_mem_desc.varlenOutputBufferElemSize());
862  const size_t varlen_output_buf_bytes =
863  query_mem_desc.getEntryCount() *
864  query_mem_desc.varlenOutputBufferElemSize().value();
865  varlen_output_buffer_host_ptr_ =
866  row_set_mem_owner_->allocate(varlen_output_buf_bytes, thread_idx_);
868  varlen_output_info_->gpu_start_address = static_cast<int64_t>(varlen_output_buffer_);
870  }
871  if (render_allocator) {
872  CHECK_EQ(size_t(0), render_allocator->getAllocatedSize() % 8);
873  }
874  if (query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU)) {
875  CHECK(!render_allocator);
876 
877  const size_t step{query_mem_desc.threadsShareMemory() ? block_size_x : 1};
878  size_t groups_buffer_size{query_mem_desc.getBufferSizeBytes(
879  ExecutorDeviceType::GPU, dev_group_by_buffers.entry_count)};
880  auto group_by_dev_buffer = dev_group_by_buffers.data;
881  const size_t col_count = query_mem_desc.getSlotCount();
882  int8_t* col_widths_dev_ptr{nullptr};
883  if (output_columnar) {
884  std::vector<int8_t> compact_col_widths(col_count);
885  for (size_t idx = 0; idx < col_count; ++idx) {
886  compact_col_widths[idx] = query_mem_desc.getPaddedSlotWidthBytes(idx);
887  }
888  col_widths_dev_ptr = device_allocator_->alloc(col_count * sizeof(int8_t));
889  device_allocator_->copyToDevice(
890  col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
891  }
892  const int8_t warp_count =
893  query_mem_desc.interleavedBins(ExecutorDeviceType::GPU) ? warp_size : 1;
894  const auto num_group_by_buffers =
895  getGroupByBuffersSize() - (query_mem_desc.hasVarlenOutput() ? 1 : 0);
896  for (size_t i = 0; i < num_group_by_buffers; i += step) {
897  if (output_columnar) {
898  init_columnar_group_by_buffer_on_device(
899  reinterpret_cast<int64_t*>(group_by_dev_buffer),
900  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
901  dev_group_by_buffers.entry_count,
902  query_mem_desc.getGroupbyColCount(),
903  col_count,
904  col_widths_dev_ptr,
905  /*need_padding = */ true,
906  query_mem_desc.hasKeylessHash(),
907  sizeof(int64_t),
908  block_size_x,
909  grid_size_x);
910  } else {
911  init_group_by_buffer_on_device(
912  reinterpret_cast<int64_t*>(group_by_dev_buffer),
913  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
914  dev_group_by_buffers.entry_count,
915  query_mem_desc.getGroupbyColCount(),
916  query_mem_desc.getEffectiveKeyWidth(),
917  query_mem_desc.getRowSize() / sizeof(int64_t),
918  query_mem_desc.hasKeylessHash(),
919  warp_count,
920  block_size_x,
921  grid_size_x);
922  }
923  group_by_dev_buffer += groups_buffer_size;
924  }
925  }
926  return dev_group_by_buffers;
927 #else
928  UNREACHABLE();
929  return {};
930 #endif
931 }
932 
933 GpuGroupByBuffers QueryMemoryInitializer::setupTableFunctionGpuBuffers(
934  const QueryMemoryDescriptor& query_mem_desc,
935  const int device_id,
936  const unsigned block_size_x,
937  const unsigned grid_size_x) {
938  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
939  CHECK_GT(num_columns, size_t(0));
940 
941  const size_t column_size = num_rows_ * sizeof(int64_t);
942  const size_t groups_buffer_size = num_columns * (column_size == 0 ? 1 : column_size);
943  const size_t mem_size =
944  groups_buffer_size * (query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
945 
946  int8_t* dev_buffers_allocation{nullptr};
947  dev_buffers_allocation = device_allocator_->alloc(mem_size);
948  CHECK(dev_buffers_allocation);
949 
950  auto dev_buffers_mem = dev_buffers_allocation;
951  const size_t step{block_size_x};
952  const size_t num_ptrs{block_size_x * grid_size_x};
953  std::vector<int8_t*> dev_buffers(num_columns * num_ptrs);
954  auto dev_buffer = dev_buffers_mem;
955  for (size_t i = 0; i < num_ptrs; i += step) {
956  for (size_t j = 0; j < step; j += 1) {
957  for (size_t k = 0; k < num_columns; k++) {
958  dev_buffers[(i + j) * num_columns + k] = dev_buffer + k * column_size;
959  }
960  }
961  if (!query_mem_desc.blocksShareMemory()) {
962  dev_buffer += groups_buffer_size;
963  }
964  }
965 
966  auto dev_ptr = device_allocator_->alloc(num_columns * num_ptrs * sizeof(CUdeviceptr));
967  device_allocator_->copyToDevice(
968  dev_ptr, dev_buffers.data(), num_columns * num_ptrs * sizeof(CUdeviceptr));
969 
970  return {dev_ptr, dev_buffers_mem, (size_t)num_rows_};
971 }
972 
973 void QueryMemoryInitializer::copyFromTableFunctionGpuBuffers(
974  Data_Namespace::DataMgr* data_mgr,
975  const QueryMemoryDescriptor& query_mem_desc,
976  const size_t entry_count,
977  const GpuGroupByBuffers& gpu_group_by_buffers,
978  const int device_id,
979  const unsigned block_size_x,
980  const unsigned grid_size_x) {
981  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
982  const size_t column_size = entry_count * sizeof(int64_t);
983  const size_t orig_column_size = gpu_group_by_buffers.entry_count * sizeof(int64_t);
984  int8_t* dev_buffer = gpu_group_by_buffers.data;
985  int8_t* host_buffer = reinterpret_cast<int8_t*>(group_by_buffers_[0]);
986  CHECK_LE(column_size, orig_column_size);
987 
988  auto allocator = data_mgr->createGpuAllocator(device_id);
989  if (orig_column_size == column_size) {
990  allocator->copyFromDevice(host_buffer, dev_buffer, column_size * num_columns);
991  } else {
992  for (size_t k = 0; k < num_columns; ++k) {
993  allocator->copyFromDevice(host_buffer, dev_buffer, column_size);
994  dev_buffer += orig_column_size;
995  host_buffer += column_size;
996  }
997  }
998 }
999 
999 
1000 size_t QueryMemoryInitializer::computeNumberOfBuffers(
1001  const QueryMemoryDescriptor& query_mem_desc,
1002  const ExecutorDeviceType device_type,
1003  const Executor* executor) const {
1004  return device_type == ExecutorDeviceType::CPU
1005  ? 1
1006  : executor->blockSize() *
1007  (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
1008 }
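// Illustrative arithmetic for the method above (hypothetical sizes): on CPU the count
// is always 1; on a GPU with blockSize() = 128 and gridSize() = 16 it is 128 when
// blocksShareMemory() is true, and 128 * 16 = 2048 when each block needs its own
// buffer.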
1009 
1010 namespace {
1011 
1012 // in-place compaction of output buffer
1013 void compact_projection_buffer_for_cpu_columnar(
1014  const QueryMemoryDescriptor& query_mem_desc,
1015  int8_t* projection_buffer,
1016  const size_t projection_count) {
1017  // the first column (row indices) remains unchanged.
1018  CHECK(projection_count <= query_mem_desc.getEntryCount());
1019  constexpr size_t row_index_width = sizeof(int64_t);
1020  size_t buffer_offset1{projection_count * row_index_width};
1021  // other columns are actual non-lazy columns for the projection:
1022  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
1023  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
1024  auto column_proj_size =
1025  projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
1026  auto buffer_offset2 = query_mem_desc.getColOffInBytes(i);
1027  if (buffer_offset1 + column_proj_size >= buffer_offset2) {
1028  // overlapping
1029  std::memmove(projection_buffer + buffer_offset1,
1030  projection_buffer + buffer_offset2,
1031  column_proj_size);
1032  } else {
1033  std::memcpy(projection_buffer + buffer_offset1,
1034  projection_buffer + buffer_offset2,
1035  column_proj_size);
1036  }
1037  buffer_offset1 += align_to_int64(column_proj_size);
1038  }
1039  }
1040 }
1041 
1042 } // namespace
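// Worked example for the compaction above (hypothetical sizes): with an entry count of
// 1024, a projection count of 10 and 8-byte slots, the first projected slot currently
// starts at byte offset 1024 * 8 = 8192 but only 10 * 8 = 80 of its bytes are live; it
// is copied down to offset 80, right after the compacted row-index column, and each
// later slot follows at the next int64-aligned offset.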
1043 
1044 void QueryMemoryInitializer::compactProjectionBuffersCpu(
1045  const QueryMemoryDescriptor& query_mem_desc,
1046  const size_t projection_count) {
1047  const auto num_allocated_rows =
1048  std::min(projection_count, query_mem_desc.getEntryCount());
1049  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1050 
1051  // copy the results from the main buffer into projection_buffer
1052  compact_projection_buffer_for_cpu_columnar(
1053  query_mem_desc,
1054  reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
1055  num_allocated_rows);
1056 
1057  // update the entry count for the result set, and its underlying storage
1058  CHECK(!result_sets_.empty());
1059  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
1060 }
1061 
1062 void QueryMemoryInitializer::compactProjectionBuffersGpu(
1063  const QueryMemoryDescriptor& query_mem_desc,
1064  Data_Namespace::DataMgr* data_mgr,
1065  const GpuGroupByBuffers& gpu_group_by_buffers,
1066  const size_t projection_count,
1067  const int device_id) {
1068  // store total number of allocated rows:
1069  const auto num_allocated_rows =
1070  std::min(projection_count, query_mem_desc.getEntryCount());
1071 
1072  // copy the results from the main buffer into projection_buffer
1073  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1074  copy_projection_buffer_from_gpu_columnar(
1075  data_mgr,
1076  gpu_group_by_buffers,
1077  query_mem_desc,
1078  reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
1079  num_allocated_rows,
1080  device_id);
1081 
1082  // update the entry count for the result set, and its underlying storage
1083  CHECK(!result_sets_.empty());
1084  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
1085 }
1086 
1087 void QueryMemoryInitializer::copyGroupByBuffersFromGpu(
1088  DeviceAllocator& device_allocator,
1089  const QueryMemoryDescriptor& query_mem_desc,
1090  const size_t entry_count,
1091  const GpuGroupByBuffers& gpu_group_by_buffers,
1092  const RelAlgExecutionUnit* ra_exe_unit,
1093  const unsigned block_size_x,
1094  const unsigned grid_size_x,
1095  const int device_id,
1096  const bool prepend_index_buffer) const {
1097  const auto thread_count = block_size_x * grid_size_x;
1098 
1099  size_t total_buff_size{0};
1100  if (ra_exe_unit && query_mem_desc.useStreamingTopN()) {
1101  const size_t n = ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit;
1102  total_buff_size =
1103  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
1104  } else {
1105  total_buff_size =
1106  query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
1107  }
1108  copy_group_by_buffers_from_gpu(device_allocator,
1109  group_by_buffers_,
1110  total_buff_size,
1111  gpu_group_by_buffers.data,
1112  query_mem_desc,
1113  block_size_x,
1114  grid_size_x,
1115  device_id,
1116  prepend_index_buffer,
1117  query_mem_desc.hasVarlenOutput());
1118 }
1119 
1121  const QueryMemoryDescriptor& query_mem_desc,
1122  const RelAlgExecutionUnit& ra_exe_unit) {
1123  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1124  CHECK_EQ(group_by_buffers_.size(), buffer_start_idx + 1);
1125 
1126  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
1127  group_by_buffers_[buffer_start_idx],
1128  query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
1129  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit,
1130  1);
1131  CHECK_EQ(rows_copy.size(),
1132  query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
1133  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
1134 }
1135 
1137  Data_Namespace::DataMgr* data_mgr,
1138  const QueryMemoryDescriptor& query_mem_desc,
1139  const GpuGroupByBuffers& gpu_group_by_buffers,
1140  const RelAlgExecutionUnit& ra_exe_unit,
1141  const unsigned total_thread_count,
1142  const int device_id) {
1143 #ifdef HAVE_CUDA
1145  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1146 
1147  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
1148  data_mgr,
1149  reinterpret_cast<int64_t*>(gpu_group_by_buffers.data),
1150  ra_exe_unit,
1151  query_mem_desc,
1152  total_thread_count,
1153  device_id);
1154  CHECK_EQ(
1155  rows_copy.size(),
1156  static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
1157  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
1158 #else
1159  UNREACHABLE();
1160 #endif
1161 }
1162 
1163 std::shared_ptr<VarlenOutputInfo> QueryMemoryInitializer::getVarlenOutputInfo() {
1164  if (varlen_output_info_) {
1165  return varlen_output_info_;
1166  }
1167 
1168  // shared_ptr so that both the ResultSet and QMI can hold on to the varlen info object
1169  // and update it as needed
1170  varlen_output_info_ = std::make_shared<VarlenOutputInfo>(VarlenOutputInfo{
1171  static_cast<int64_t>(varlen_output_buffer_), varlen_output_buffer_host_ptr_});
1172  return varlen_output_info_;
1173 }