OmniSciDB  21ac014ffc
QueryMemoryInitializer.cpp
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "QueryMemoryInitializer.h"
18 
19 #include "Execute.h"
20 #include "GpuInitGroups.h"
21 #include "GpuMemUtils.h"
22 #include "Logger/Logger.h"
24 #include "ResultSet.h"
25 #include "StreamingTopN.h"
26 
27 #include <Shared/checked_alloc.h>
28 
29 // 8 GB, the limit of perfect hash group by under normal conditions
30 int64_t g_bitmap_memory_limit{8LL * 1000 * 1000 * 1000};
31 
32 namespace {
33 
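// Estimates the worst-case COUNT DISTINCT bitmap memory: the padded bitmap bytes per
// group, summed over all bitmap-based descriptors, times the group-by entry count.
// Throws OutOfHostMemory if the total would exceed g_bitmap_memory_limit or overflow.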
34 void check_total_bitmap_memory(const QueryMemoryDescriptor& query_mem_desc) {
35  const int32_t groups_buffer_entry_count = query_mem_desc.getEntryCount();
36  checked_int64_t total_bytes_per_group = 0;
37  const size_t num_count_distinct_descs =
38  query_mem_desc.getCountDistinctDescriptorsSize();
39  for (size_t i = 0; i < num_count_distinct_descs; i++) {
40  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
41  if (count_distinct_desc.impl_type_ != CountDistinctImplType::Bitmap) {
42  continue;
43  }
44  total_bytes_per_group += count_distinct_desc.bitmapPaddedSizeBytes();
45  }
46  int64_t total_bytes{0};
47  // Using OutOfHostMemory until we can verify that SlabTooBig would also be properly
48  // caught
49  try {
50  total_bytes = static_cast<int64_t>(total_bytes_per_group * groups_buffer_entry_count);
51  } catch (...) {
52  // Absurd amount of memory, merely computing the number of bits overflows int64_t.
53  // Don't bother to report the real amount, this is unlikely to ever happen.
54  throw OutOfHostMemory(std::numeric_limits<int64_t>::max() / 8);
55  }
56  if (total_bytes >= g_bitmap_memory_limit) {
57  throw OutOfHostMemory(total_bytes);
58  }
59 }
60 
61 int64_t* alloc_group_by_buffer(const size_t numBytes,
62  RenderAllocatorMap* render_allocator_map,
63  const size_t thread_idx,
64  RowSetMemoryOwner* mem_owner) {
65  if (render_allocator_map) {
66  // NOTE(adb): If we got here, we are performing an in-situ rendering query and are not
67  // using CUDA buffers. Therefore we need to allocate result set storage using CPU
68  // memory.
69  const auto gpu_idx = 0; // Only 1 GPU supported in CUDA-disabled rendering mode
70  auto render_allocator_ptr = render_allocator_map->getRenderAllocator(gpu_idx);
71  return reinterpret_cast<int64_t*>(render_allocator_ptr->alloc(numBytes));
72  } else {
73  return reinterpret_cast<int64_t*>(mem_owner->allocate(numBytes, thread_idx));
74  }
75 }
76 
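// Returns the common fragment size when all fragments have the same size, -1 when the
// sizes differ, and INT64_MAX when the consistent size is zero (empty fragments).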
77 inline int64_t get_consistent_frag_size(const std::vector<uint64_t>& frag_offsets) {
78  if (frag_offsets.size() < 2) {
79  return int64_t(-1);
80  }
81  const auto frag_size = frag_offsets[1] - frag_offsets[0];
82  for (size_t i = 2; i < frag_offsets.size(); ++i) {
83  const auto curr_size = frag_offsets[i] - frag_offsets[i - 1];
84  if (curr_size != frag_size) {
85  return int64_t(-1);
86  }
87  }
88  return !frag_size ? std::numeric_limits<int64_t>::max()
89  : static_cast<int64_t>(frag_size);
90 }
91 
92 inline std::vector<int64_t> get_consistent_frags_sizes(
93  const std::vector<std::vector<uint64_t>>& frag_offsets) {
94  if (frag_offsets.empty()) {
95  return {};
96  }
97  std::vector<int64_t> frag_sizes;
98  for (size_t tab_idx = 0; tab_idx < frag_offsets[0].size(); ++tab_idx) {
99  std::vector<uint64_t> tab_offs;
100  for (auto& offsets : frag_offsets) {
101  tab_offs.push_back(offsets[tab_idx]);
102  }
103  frag_sizes.push_back(get_consistent_frag_size(tab_offs));
104  }
105  return frag_sizes;
106 }
107 
108 inline std::vector<int64_t> get_consistent_frags_sizes(
109  const std::vector<Analyzer::Expr*>& target_exprs,
110  const std::vector<int64_t>& table_frag_sizes) {
111  std::vector<int64_t> col_frag_sizes;
112  for (auto expr : target_exprs) {
113  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
114  if (col_var->get_rte_idx() < 0) {
115  CHECK_EQ(-1, col_var->get_rte_idx());
116  col_frag_sizes.push_back(int64_t(-1));
117  } else {
118  col_frag_sizes.push_back(table_frag_sizes[col_var->get_rte_idx()]);
119  }
120  } else {
121  col_frag_sizes.push_back(int64_t(-1));
122  }
123  }
124  return col_frag_sizes;
125 }
126 
127 inline std::vector<std::vector<int64_t>> get_col_frag_offsets(
128  const std::vector<Analyzer::Expr*>& target_exprs,
129  const std::vector<std::vector<uint64_t>>& table_frag_offsets) {
130  std::vector<std::vector<int64_t>> col_frag_offsets;
131  for (auto& table_offsets : table_frag_offsets) {
132  std::vector<int64_t> col_offsets;
133  for (auto expr : target_exprs) {
134  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
135  if (col_var->get_rte_idx() < 0) {
136  CHECK_EQ(-1, col_var->get_rte_idx());
137  col_offsets.push_back(int64_t(-1));
138  } else {
139  CHECK_LT(static_cast<size_t>(col_var->get_rte_idx()), table_offsets.size());
140  col_offsets.push_back(
141  static_cast<int64_t>(table_offsets[col_var->get_rte_idx()]));
142  }
143  } else {
144  col_offsets.push_back(int64_t(-1));
145  }
146  }
147  col_frag_offsets.push_back(col_offsets);
148  }
149  return col_frag_offsets;
150 }
151 
152 } // namespace
153 
154 // Row-based execution constructor
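// Allocates one group-by output buffer per kernel (or one per block when threads share
// memory), initializes it either from a pre-built template or in place, and wraps each
// buffer in a ResultSet.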
155 QueryMemoryInitializer::QueryMemoryInitializer(
156  const RelAlgExecutionUnit& ra_exe_unit,
157  const QueryMemoryDescriptor& query_mem_desc,
158  const int device_id,
159  const ExecutorDeviceType device_type,
160  const ExecutorDispatchMode dispatch_mode,
161  const bool output_columnar,
162  const bool sort_on_gpu,
163  const int64_t num_rows,
164  const std::vector<std::vector<const int8_t*>>& col_buffers,
165  const std::vector<std::vector<uint64_t>>& frag_offsets,
166  RenderAllocatorMap* render_allocator_map,
167  RenderInfo* render_info,
168  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
169  DeviceAllocator* device_allocator,
170  const size_t thread_idx,
171  const Executor* executor)
172  : num_rows_(num_rows)
173  , row_set_mem_owner_(row_set_mem_owner)
174  , init_agg_vals_(executor->plan_state_->init_agg_vals_)
175  , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
176  , varlen_output_buffer_(0)
177  , varlen_output_buffer_host_ptr_(nullptr)
178  , count_distinct_bitmap_mem_(0)
179  , count_distinct_bitmap_host_mem_(nullptr)
180  , count_distinct_bitmap_mem_bytes_(0)
181  , count_distinct_bitmap_crt_ptr_(nullptr)
182  , device_allocator_(device_allocator)
183  , thread_idx_(thread_idx) {
184  CHECK(!sort_on_gpu || output_columnar);
185 
186  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
187  if (consistent_frag_sizes.empty()) {
188  // No fragments in the input, no underlying buffers will be needed.
189  return;
190  }
191  if (!ra_exe_unit.use_bump_allocator) {
192  check_total_bitmap_memory(query_mem_desc);
193  }
194  if (device_type == ExecutorDeviceType::GPU) {
195  allocateCountDistinctGpuMem(query_mem_desc);
196  }
197 
198  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
199  allocateCountDistinctBuffers(query_mem_desc, false, executor);
200  allocateTDigests(query_mem_desc, false, executor);
201  if (render_info && render_info->useCudaBuffers()) {
202  return;
203  }
204  }
205 
206  if (ra_exe_unit.estimator) {
207  return;
208  }
209 
210  const auto thread_count = device_type == ExecutorDeviceType::GPU
211  ? executor->blockSize() * executor->gridSize()
212  : 1;
213 
214  size_t group_buffer_size{0};
215  if (ra_exe_unit.use_bump_allocator) {
216  // For kernel per fragment execution, just allocate a buffer equivalent to the size of
217  // the fragment
218  if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
219  group_buffer_size = num_rows * query_mem_desc.getRowSize();
220  } else {
221  // otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size
222  group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
223  }
224  } else {
225  group_buffer_size =
226  query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
227  }
228  CHECK_GE(group_buffer_size, size_t(0));
229 
230  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
231  int64_t* group_by_buffer_template{nullptr};
232  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
233  group_by_buffer_template = reinterpret_cast<int64_t*>(
234  row_set_mem_owner_->allocate(group_buffer_size, thread_idx_));
235  initGroupByBuffer(group_by_buffer_template,
236  ra_exe_unit,
237  query_mem_desc,
238  device_type,
239  output_columnar,
240  executor);
241  }
242 
243  if (query_mem_desc.interleavedBins(device_type)) {
244  CHECK(query_mem_desc.hasKeylessHash());
245  }
246 
247  const auto step = device_type == ExecutorDeviceType::GPU &&
248  query_mem_desc.threadsShareMemory() &&
249  query_mem_desc.isGroupBy()
250  ? executor->blockSize()
251  : size_t(1);
252  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
253  query_mem_desc.hasKeylessHash()
254  ? query_mem_desc.getEntryCount()
255  : size_t(0);
256  const auto actual_group_buffer_size =
257  group_buffer_size + index_buffer_qw * sizeof(int64_t);
258  CHECK_GE(actual_group_buffer_size, group_buffer_size);
259 
260  if (query_mem_desc.hasVarlenOutput()) {
261  const auto varlen_buffer_elem_size_opt = query_mem_desc.varlenOutputBufferElemSize();
262  CHECK(varlen_buffer_elem_size_opt); // TODO(adb): relax
263  auto varlen_output_buffer = reinterpret_cast<int64_t*>(row_set_mem_owner_->allocate(
264  query_mem_desc.getEntryCount() * varlen_buffer_elem_size_opt.value()));
265  num_buffers_ += 1;
266  group_by_buffers_.push_back(varlen_output_buffer);
267  }
268 
269  for (size_t i = 0; i < group_buffers_count; i += step) {
270  auto group_by_buffer = alloc_group_by_buffer(actual_group_buffer_size,
271  render_allocator_map,
272  thread_idx_,
273  row_set_mem_owner_.get());
274  if (!query_mem_desc.lazyInitGroups(device_type)) {
275  if (group_by_buffer_template) {
276  memcpy(group_by_buffer + index_buffer_qw,
277  group_by_buffer_template,
278  group_buffer_size);
279  } else {
280  initGroupByBuffer(group_by_buffer + index_buffer_qw,
281  ra_exe_unit,
282  query_mem_desc,
283  device_type,
284  output_columnar,
285  executor);
286  }
287  }
288  group_by_buffers_.push_back(group_by_buffer);
289  for (size_t j = 1; j < step; ++j) {
290  group_by_buffers_.push_back(nullptr);
291  }
292  const auto column_frag_offsets =
293  get_col_frag_offsets(ra_exe_unit.target_exprs, frag_offsets);
294  const auto column_frag_sizes =
295  get_consistent_frags_sizes(ra_exe_unit.target_exprs, consistent_frag_sizes);
296  result_sets_.emplace_back(
297  new ResultSet(target_exprs_to_infos(ra_exe_unit.target_exprs, query_mem_desc),
298  executor->getColLazyFetchInfo(ra_exe_unit.target_exprs),
299  col_buffers,
300  column_frag_offsets,
301  column_frag_sizes,
302  device_type,
303  device_id,
304  query_mem_desc,
305  row_set_mem_owner_,
306  executor->getCatalog(),
307  executor->blockSize(),
308  executor->gridSize()));
309  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
310  executor->plan_state_->init_agg_vals_,
311  getVarlenOutputInfo());
312  for (size_t j = 1; j < step; ++j) {
313  result_sets_.emplace_back(nullptr);
314  }
315  }
316 }
317 
318 // Table functions execution constructor
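// Table function kernels write columnar output directly; this constructor allocates a
// single buffer of num_rows * num_columns 64-bit slots and wraps it in one ResultSet.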
319 QueryMemoryInitializer::QueryMemoryInitializer(
320  const TableFunctionExecutionUnit& exe_unit,
321  const QueryMemoryDescriptor& query_mem_desc,
322  const int device_id,
323  const ExecutorDeviceType device_type,
324  const int64_t num_rows,
325  const std::vector<std::vector<const int8_t*>>& col_buffers,
326  const std::vector<std::vector<uint64_t>>& frag_offsets,
327  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
328  DeviceAllocator* device_allocator,
329  const Executor* executor)
330  : num_rows_(num_rows)
331  , row_set_mem_owner_(row_set_mem_owner)
332  , init_agg_vals_(init_agg_val_vec(exe_unit.target_exprs, {}, query_mem_desc))
333  , num_buffers_(1)
334  , varlen_output_buffer_(0)
335  , varlen_output_buffer_host_ptr_(nullptr)
336  , count_distinct_bitmap_mem_(0)
337  , count_distinct_bitmap_host_mem_(nullptr)
338  , count_distinct_bitmap_mem_bytes_(0)
339  , count_distinct_bitmap_crt_ptr_(nullptr)
340  , device_allocator_(device_allocator)
341  , thread_idx_(0) {
342  // Table functions output columnar results, so treat this as a projection
343  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
344  if (consistent_frag_sizes.empty()) {
345  // No fragments in the input, no underlying buffers will be needed.
346  return;
347  }
348 
349  size_t group_buffer_size{0};
350  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
351  group_buffer_size = num_rows_ * num_columns * sizeof(int64_t);
352  CHECK_GE(group_buffer_size, size_t(0));
353 
354  const auto index_buffer_qw =
355  device_type == ExecutorDeviceType::GPU && query_mem_desc.hasKeylessHash()
356  ? query_mem_desc.getEntryCount()
357  : size_t(0);
358  const auto actual_group_buffer_size =
359  group_buffer_size + index_buffer_qw * sizeof(int64_t);
360  CHECK_GE(actual_group_buffer_size, group_buffer_size);
361 
362  CHECK_EQ(num_buffers_, size_t(1));
363  auto group_by_buffer = alloc_group_by_buffer(
364  actual_group_buffer_size, nullptr, thread_idx_, row_set_mem_owner.get());
365  if (!query_mem_desc.lazyInitGroups(device_type)) {
366  initColumnarGroups(
367  query_mem_desc, group_by_buffer + index_buffer_qw, init_agg_vals_, executor);
368  }
369  group_by_buffers_.push_back(group_by_buffer);
370 
371  const auto column_frag_offsets =
372  get_col_frag_offsets(exe_unit.target_exprs, frag_offsets);
373  const auto column_frag_sizes =
374  get_consistent_frags_sizes(exe_unit.target_exprs, consistent_frag_sizes);
375  result_sets_.emplace_back(
376  new ResultSet(target_exprs_to_infos(exe_unit.target_exprs, query_mem_desc),
377  /*col_lazy_fetch_info=*/{},
378  col_buffers,
379  column_frag_offsets,
380  column_frag_sizes,
381  device_type,
382  device_id,
383  query_mem_desc,
384  row_set_mem_owner_,
385  executor->getCatalog(),
386  executor->blockSize(),
387  executor->gridSize()));
388  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
389  init_agg_vals_);
390 }
391 
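// Initializes a single group-by buffer, dispatching to columnar or row-wise layout. For
// streaming top-n queries it zeroes the per-thread node counts, fills the heap index
// area with -1, and only initializes the n * thread_count heap rows.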
392 void QueryMemoryInitializer::initGroupByBuffer(
393  int64_t* buffer,
394  const RelAlgExecutionUnit& ra_exe_unit,
395  const QueryMemoryDescriptor& query_mem_desc,
396  const ExecutorDeviceType device_type,
397  const bool output_columnar,
398  const Executor* executor) {
399  if (output_columnar) {
400  initColumnarGroups(query_mem_desc, buffer, init_agg_vals_, executor);
401  } else {
402  auto rows_ptr = buffer;
403  auto actual_entry_count = query_mem_desc.getEntryCount();
404  const auto thread_count = device_type == ExecutorDeviceType::GPU
405  ? executor->blockSize() * executor->gridSize()
406  : 1;
407  auto warp_size =
408  query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
409  if (query_mem_desc.useStreamingTopN()) {
410  const auto node_count_size = thread_count * sizeof(int64_t);
411  memset(rows_ptr, 0, node_count_size);
412  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
413  const auto rows_offset = streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
414  memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
415  rows_ptr += rows_offset / sizeof(int64_t);
416  actual_entry_count = n * thread_count;
417  warp_size = 1;
418  }
419  initRowGroups(query_mem_desc,
420  rows_ptr,
421  init_agg_vals_,
422  actual_entry_count,
423  warp_size,
424  executor);
425  }
426 }
427 
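// Row-wise buffer initialization. When no slot needs COUNT DISTINCT / APPROX_QUANTILE
// state and g_optimize_row_initialization is enabled, a single sample row is built and
// copied into every bin; otherwise each bin's columns are initialized individually.
// Keys are filled with the empty-key sentinel unless the hash is keyless.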
428 void QueryMemoryInitializer::initRowGroups(const QueryMemoryDescriptor& query_mem_desc,
429  int64_t* groups_buffer,
430  const std::vector<int64_t>& init_vals,
431  const int32_t groups_buffer_entry_count,
432  const size_t warp_size,
433  const Executor* executor) {
434  const size_t key_count{query_mem_desc.getGroupbyColCount()};
435  const size_t row_size{query_mem_desc.getRowSize()};
436  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};
437 
438  auto agg_bitmap_size = allocateCountDistinctBuffers(query_mem_desc, true, executor);
439  auto quantile_params = allocateTDigests(query_mem_desc, true, executor);
440  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
441 
442  const auto query_mem_desc_fixedup =
443  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);
444 
445  auto const is_true = [](auto const& x) { return static_cast<bool>(x); };
446  // When no slot needs COUNT DISTINCT / APPROX_COUNT_DISTINCT / APPROX_QUANTILE state,
447  // build one sample row and copy it; otherwise fall back to the default per-row init.
448  if (!std::any_of(agg_bitmap_size.begin(), agg_bitmap_size.end(), is_true) &&
449  !std::any_of(quantile_params.begin(), quantile_params.end(), is_true) &&
450  g_optimize_row_initialization) {
451  std::vector<int8_t> sample_row(row_size - col_base_off);
452 
453  initColumnsPerRow(query_mem_desc_fixedup,
454  sample_row.data(),
455  init_vals,
456  agg_bitmap_size,
457  quantile_params);
458 
459  if (query_mem_desc.hasKeylessHash()) {
460  CHECK(warp_size >= 1);
461  CHECK(key_count == 1 || warp_size == 1);
462  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
463  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
464  ++bin, buffer_ptr += row_size) {
465  memcpy(buffer_ptr + col_base_off, sample_row.data(), sample_row.size());
466  }
467  }
468  return;
469  }
470 
471  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
472  ++bin, buffer_ptr += row_size) {
473  memcpy(buffer_ptr + col_base_off, sample_row.data(), sample_row.size());
474  fill_empty_key(
475  buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
476  }
477  } else {
478  if (query_mem_desc.hasKeylessHash()) {
479  CHECK(warp_size >= 1);
480  CHECK(key_count == 1 || warp_size == 1);
481  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
482  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
483  ++bin, buffer_ptr += row_size) {
484  initColumnsPerRow(query_mem_desc_fixedup,
485  &buffer_ptr[col_base_off],
486  init_vals,
487  agg_bitmap_size,
488  quantile_params);
489  }
490  }
491  return;
492  }
493 
494  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
495  ++bin, buffer_ptr += row_size) {
496  fill_empty_key(
497  buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
498  initColumnsPerRow(query_mem_desc_fixedup,
499  &buffer_ptr[col_base_off],
500  init_vals,
501  agg_bitmap_size,
502  quantile_params);
503  }
504  }
505 }
506 
507 namespace {
508 
509 template <typename T>
510 int8_t* initColumnarBuffer(T* buffer_ptr, const T init_val, const uint32_t entry_count) {
511  static_assert(sizeof(T) <= sizeof(int64_t), "Unsupported template type");
512  for (uint32_t i = 0; i < entry_count; ++i) {
513  buffer_ptr[i] = init_val;
514  }
515  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
516 }
517 
518 } // namespace
519 
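// Columnar buffer initialization: each key column is filled with EMPTY_KEY_64 (unless
// the hash is keyless) and each aggregate column is filled with its init value, one
// column at a time, with every column start aligned to 8 bytes.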
520 void QueryMemoryInitializer::initColumnarGroups(
521  const QueryMemoryDescriptor& query_mem_desc,
522  int64_t* groups_buffer,
523  const std::vector<int64_t>& init_vals,
524  const Executor* executor) {
525  CHECK(groups_buffer);
526  for (const auto target_expr : executor->plan_state_->target_exprs_) {
527  const auto agg_info = get_target_info(target_expr, g_bigint_count);
528  CHECK(!is_distinct_target(agg_info));
529  }
530  const int32_t agg_col_count = query_mem_desc.getSlotCount();
531  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
532 
533  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
534  if (!query_mem_desc.hasKeylessHash()) {
535  const size_t key_count{query_mem_desc.getGroupbyColCount()};
536  for (size_t i = 0; i < key_count; ++i) {
537  buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
538  EMPTY_KEY_64,
539  groups_buffer_entry_count);
540  }
541  }
542 
543  if (query_mem_desc.getQueryDescriptionType() != QueryDescriptionType::Projection) {
544  // initializing all aggregate columns:
545  int32_t init_val_idx = 0;
546  for (int32_t i = 0; i < agg_col_count; ++i) {
547  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
548  CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
549  switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
550  case 1:
551  buffer_ptr = initColumnarBuffer<int8_t>(
552  buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
553  break;
554  case 2:
555  buffer_ptr =
556  initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
557  init_vals[init_val_idx++],
558  groups_buffer_entry_count);
559  break;
560  case 4:
561  buffer_ptr =
562  initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
563  init_vals[init_val_idx++],
564  groups_buffer_entry_count);
565  break;
566  case 8:
567  buffer_ptr =
568  initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
569  init_vals[init_val_idx++],
570  groups_buffer_entry_count);
571  break;
572  case 0:
573  break;
574  default:
575  CHECK(false);
576  }
577 
578  buffer_ptr = align_to_int64(buffer_ptr);
579  }
580  }
581  }
582 }
583 
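// Writes the initial value for every slot of a single row. COUNT DISTINCT /
// APPROX_COUNT_DISTINCT slots receive a pointer to a freshly allocated bitmap or
// std::set, APPROX_QUANTILE slots a pointer to a TDigest; all other slots get the
// corresponding entry from init_vals.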
584 void QueryMemoryInitializer::initColumnsPerRow(
585  const QueryMemoryDescriptor& query_mem_desc,
586  int8_t* row_ptr,
587  const std::vector<int64_t>& init_vals,
588  const std::vector<int64_t>& bitmap_sizes,
589  const std::vector<QuantileParam>& quantile_params) {
590  int8_t* col_ptr = row_ptr;
591  size_t init_vec_idx = 0;
592  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
593  col_ptr += query_mem_desc.getNextColOffInBytesRowOnly(col_ptr, col_idx++)) {
594  const int64_t bm_sz{bitmap_sizes[col_idx]};
595  int64_t init_val{0};
596  if (bm_sz && query_mem_desc.isGroupBy()) {
597  // COUNT DISTINCT / APPROX_COUNT_DISTINCT
598  CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
599  sizeof(int64_t));
600  init_val =
601  bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
602  ++init_vec_idx;
603  } else if (query_mem_desc.isGroupBy() && quantile_params[col_idx]) {
604  auto const q = *quantile_params[col_idx];
605  // allocate for APPROX_QUANTILE only when slot is used
606  init_val = reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest(q));
607  ++init_vec_idx;
608  } else {
609  if (query_mem_desc.getPaddedSlotWidthBytes(col_idx) > 0) {
610  CHECK_LT(init_vec_idx, init_vals.size());
611  init_val = init_vals[init_vec_idx++];
612  }
613  }
614  switch (query_mem_desc.getPaddedSlotWidthBytes(col_idx)) {
615  case 1:
616  *col_ptr = static_cast<int8_t>(init_val);
617  break;
618  case 2:
619  *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
620  break;
621  case 4:
622  *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
623  break;
624  case 8:
625  *reinterpret_cast<int64_t*>(col_ptr) = init_val;
626  break;
627  case 0:
628  continue;
629  default:
630  CHECK(false);
631  }
632  }
633 }
634 
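// Allocates one contiguous GPU buffer holding the COUNT DISTINCT bitmaps for every
// entry, zeroes it on the device, and allocates a matching host-side staging buffer.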
635 void QueryMemoryInitializer::allocateCountDistinctGpuMem(
636  const QueryMemoryDescriptor& query_mem_desc) {
637  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
638  return;
639  }
640  CHECK(device_allocator_);
641 
642  size_t total_bytes_per_entry{0};
643  const size_t num_count_distinct_descs =
644  query_mem_desc.getCountDistinctDescriptorsSize();
645  for (size_t i = 0; i < num_count_distinct_descs; i++) {
646  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
647  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
648  continue;
649  }
650  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
651  total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
652  }
653 
654  count_distinct_bitmap_mem_bytes_ =
655  total_bytes_per_entry * query_mem_desc.getEntryCount();
656  count_distinct_bitmap_mem_ = reinterpret_cast<CUdeviceptr>(
657  device_allocator_->alloc(count_distinct_bitmap_mem_bytes_));
658  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(count_distinct_bitmap_mem_),
659  count_distinct_bitmap_mem_bytes_);
660 
661  count_distinct_bitmap_crt_ptr_ = count_distinct_bitmap_host_mem_ =
662  row_set_mem_owner_->allocateCountDistinctBuffer(count_distinct_bitmap_mem_bytes_, thread_idx_);
663 }
664 
665 // deferred is true for group by queries; initGroups will allocate a bitmap
666 // for each group slot
667 std::vector<int64_t> QueryMemoryInitializer::allocateCountDistinctBuffers(
668  const QueryMemoryDescriptor& query_mem_desc,
669  const bool deferred,
670  const Executor* executor) {
671  const size_t agg_col_count{query_mem_desc.getSlotCount()};
672  std::vector<int64_t> agg_bitmap_size(deferred ? agg_col_count : 0);
673 
674  CHECK_GE(agg_col_count, executor->plan_state_->target_exprs_.size());
675  for (size_t target_idx = 0; target_idx < executor->plan_state_->target_exprs_.size();
676  ++target_idx) {
677  const auto target_expr = executor->plan_state_->target_exprs_[target_idx];
678  const auto agg_info = get_target_info(target_expr, g_bigint_count);
679  if (is_distinct_target(agg_info)) {
680  CHECK(agg_info.is_agg &&
681  (agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT));
682  CHECK(!agg_info.sql_type.is_varlen());
683 
684  const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
685  CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);
686 
687  CHECK_EQ(static_cast<size_t>(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)),
688  sizeof(int64_t));
689  const auto& count_distinct_desc =
690  query_mem_desc.getCountDistinctDescriptor(target_idx);
691  CHECK(count_distinct_desc.impl_type_ != CountDistinctImplType::Invalid);
692  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
693  const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
694  if (deferred) {
695  agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
696  } else {
697  init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
698  }
699  } else {
700  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::StdSet);
701  if (deferred) {
702  agg_bitmap_size[agg_col_idx] = -1;
703  } else {
704  init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
705  }
706  }
707  }
708  }
709 
710  return agg_bitmap_size;
711 }
712 
713 int64_t QueryMemoryInitializer::allocateCountDistinctBitmap(const size_t bitmap_byte_sz) {
714  if (count_distinct_bitmap_host_mem_) {
715  CHECK(count_distinct_bitmap_crt_ptr_);
716  auto ptr = count_distinct_bitmap_crt_ptr_;
717  count_distinct_bitmap_crt_ptr_ += bitmap_byte_sz;
718  row_set_mem_owner_->addCountDistinctBuffer(
719  ptr, bitmap_byte_sz, /*physical_buffer=*/false);
720  return reinterpret_cast<int64_t>(ptr);
721  }
722  return reinterpret_cast<int64_t>(
723  row_set_mem_owner_->allocateCountDistinctBuffer(bitmap_byte_sz, thread_idx_));
724 }
725 
726 int64_t QueryMemoryInitializer::allocateCountDistinctSet() {
727  auto count_distinct_set = new std::set<int64_t>();
728  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
729  return reinterpret_cast<int64_t>(count_distinct_set);
730 }
731 
732 std::vector<QueryMemoryInitializer::QuantileParam>
733 QueryMemoryInitializer::allocateTDigests(const QueryMemoryDescriptor& query_mem_desc,
734  const bool deferred,
735  const Executor* executor) {
736  size_t const slot_count = query_mem_desc.getSlotCount();
737  size_t const ntargets = executor->plan_state_->target_exprs_.size();
738  CHECK_GE(slot_count, ntargets);
739  std::vector<QuantileParam> quantile_params(deferred ? slot_count : 0);
740 
741  for (size_t target_idx = 0; target_idx < ntargets; ++target_idx) {
742  auto const target_expr = executor->plan_state_->target_exprs_[target_idx];
743  if (auto const agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr)) {
744  if (agg_expr->get_aggtype() == kAPPROX_QUANTILE) {
745  size_t const agg_col_idx =
746  query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
747  CHECK_LT(agg_col_idx, slot_count);
748  CHECK_EQ(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx),
749  static_cast<int8_t>(sizeof(int64_t)));
750  auto const q = agg_expr->get_arg1()->get_constval().doubleval;
751  if (deferred) {
752  quantile_params[agg_col_idx] = q;
753  } else {
754  // allocate for APPROX_QUANTILE only when slot is used
755  init_agg_vals_[agg_col_idx] =
756  reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest(q));
757  }
758  }
759  }
760  }
761  return quantile_params;
762 }
763 
764 #ifdef HAVE_CUDA
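// Builds the device-side streaming top-n heaps: a single buffer shared by all threads,
// a per-thread pointer array, zeroed node counts, heap indices filled with -1, and
// device-initialized heap rows.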
765 GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer(
766  const QueryMemoryDescriptor& query_mem_desc,
767  const CUdeviceptr init_agg_vals_dev_ptr,
768  const size_t n,
769  const int device_id,
770  const unsigned block_size_x,
771  const unsigned grid_size_x) {
773  const auto thread_count = block_size_x * grid_size_x;
774  const auto total_buff_size =
775  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
776  CUdeviceptr dev_buffer =
777  reinterpret_cast<CUdeviceptr>(device_allocator_->alloc(total_buff_size));
778 
779  std::vector<CUdeviceptr> dev_buffers(thread_count);
780 
781  for (size_t i = 0; i < thread_count; ++i) {
782  dev_buffers[i] = dev_buffer;
783  }
784 
785  auto dev_ptr = device_allocator_->alloc(thread_count * sizeof(CUdeviceptr));
786  device_allocator_->copyToDevice(dev_ptr,
787  reinterpret_cast<int8_t*>(dev_buffers.data()),
788  thread_count * sizeof(CUdeviceptr));
789 
791 
792  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(dev_buffer),
793  thread_count * sizeof(int64_t));
794 
794 
795  device_allocator_->setDeviceMem(
796  reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
797  (unsigned char)-1,
798  thread_count * n * sizeof(int64_t));
799 
799 
800  init_group_by_buffer_on_device(
801  reinterpret_cast<int64_t*>(
802  dev_buffer + streaming_top_n::get_rows_offset_of_heaps(n, thread_count)),
803  reinterpret_cast<int64_t*>(init_agg_vals_dev_ptr),
804  n * thread_count,
805  query_mem_desc.getGroupbyColCount(),
806  query_mem_desc.getEffectiveKeyWidth(),
807  query_mem_desc.getRowSize() / sizeof(int64_t),
808  query_mem_desc.hasKeylessHash(),
809  1,
810  block_size_x,
811  grid_size_x);
812 
813  return {reinterpret_cast<CUdeviceptr>(dev_ptr), dev_buffer};
814 }
815 
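// Creates the device group-by buffers for a GPU query (delegating streaming top-n to
// prepareTopNHeapsDevBuffer), records the varlen output buffer when present, and, when
// group buffers are lazily initialized on the device, runs the columnar or row-wise
// initialization kernels over each buffer.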
816 GpuGroupByBuffers QueryMemoryInitializer::createAndInitializeGroupByBufferGpu(
817  const RelAlgExecutionUnit& ra_exe_unit,
818  const QueryMemoryDescriptor& query_mem_desc,
819  const CUdeviceptr init_agg_vals_dev_ptr,
820  const int device_id,
821  const ExecutorDispatchMode dispatch_mode,
822  const unsigned block_size_x,
823  const unsigned grid_size_x,
824  const int8_t warp_size,
825  const bool can_sort_on_gpu,
826  const bool output_columnar,
827  RenderAllocator* render_allocator) {
828  if (query_mem_desc.useStreamingTopN()) {
829  if (render_allocator) {
830  throw StreamingTopNNotSupportedInRenderQuery();
831  }
832  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
833  CHECK(!output_columnar);
834 
835  return prepareTopNHeapsDevBuffer(
836  query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
837  }
838 
839  auto dev_group_by_buffers =
840  create_dev_group_by_buffers(device_allocator_,
841  group_by_buffers_,
842  query_mem_desc,
843  block_size_x,
844  grid_size_x,
845  device_id,
846  dispatch_mode,
847  num_rows_,
848  can_sort_on_gpu,
849  false,
850  ra_exe_unit.use_bump_allocator,
851  query_mem_desc.hasVarlenOutput(),
852  render_allocator);
853  if (query_mem_desc.hasVarlenOutput()) {
854  CHECK(dev_group_by_buffers.varlen_output_buffer);
855  varlen_output_buffer_ = dev_group_by_buffers.varlen_output_buffer;
856  CHECK(query_mem_desc.varlenOutputBufferElemSize());
857  const size_t varlen_output_buf_bytes =
858  query_mem_desc.getEntryCount() *
859  query_mem_desc.varlenOutputBufferElemSize().value();
860  varlen_output_buffer_host_ptr_ =
861  row_set_mem_owner_->allocate(varlen_output_buf_bytes, thread_idx_);
863  varlen_output_info_->gpu_start_address = static_cast<int64_t>(varlen_output_buffer_);
865  }
866  if (render_allocator) {
867  CHECK_EQ(size_t(0), render_allocator->getAllocatedSize() % 8);
868  }
869  if (query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU)) {
870  CHECK(!render_allocator);
871 
872  const size_t step{query_mem_desc.threadsShareMemory() ? block_size_x : 1};
873  size_t groups_buffer_size{query_mem_desc.getBufferSizeBytes(
874  ExecutorDeviceType::GPU, dev_group_by_buffers.entry_count)};
875  auto group_by_dev_buffer = dev_group_by_buffers.data;
876  const size_t col_count = query_mem_desc.getSlotCount();
877  int8_t* col_widths_dev_ptr{nullptr};
878  if (output_columnar) {
879  std::vector<int8_t> compact_col_widths(col_count);
880  for (size_t idx = 0; idx < col_count; ++idx) {
881  compact_col_widths[idx] = query_mem_desc.getPaddedSlotWidthBytes(idx);
882  }
883  col_widths_dev_ptr = device_allocator_->alloc(col_count * sizeof(int8_t));
884  device_allocator_->copyToDevice(
885  col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
886  }
887  const int8_t warp_count =
888  query_mem_desc.interleavedBins(ExecutorDeviceType::GPU) ? warp_size : 1;
889  const auto num_group_by_buffers =
890  getGroupByBuffersSize() - (query_mem_desc.hasVarlenOutput() ? 1 : 0);
891  for (size_t i = 0; i < num_group_by_buffers; i += step) {
892  if (output_columnar) {
893  init_columnar_group_by_buffer_on_device(
894  reinterpret_cast<int64_t*>(group_by_dev_buffer),
895  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
896  dev_group_by_buffers.entry_count,
897  query_mem_desc.getGroupbyColCount(),
898  col_count,
899  col_widths_dev_ptr,
900  /*need_padding = */ true,
901  query_mem_desc.hasKeylessHash(),
902  sizeof(int64_t),
903  block_size_x,
904  grid_size_x);
905  } else {
906  init_group_by_buffer_on_device(reinterpret_cast<int64_t*>(group_by_dev_buffer),
907  reinterpret_cast<int64_t*>(init_agg_vals_dev_ptr),
908  dev_group_by_buffers.entry_count,
909  query_mem_desc.getGroupbyColCount(),
910  query_mem_desc.getEffectiveKeyWidth(),
911  query_mem_desc.getRowSize() / sizeof(int64_t),
912  query_mem_desc.hasKeylessHash(),
913  warp_count,
914  block_size_x,
915  grid_size_x);
916  }
917  group_by_dev_buffer += groups_buffer_size;
918  }
919  }
920  return dev_group_by_buffers;
921 }
922 
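// Allocates the device output buffer for a GPU table function and builds the array of
// per-thread, per-column pointers into it (one shared buffer when blocks share memory,
// otherwise one buffer per block).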
923 GpuGroupByBuffers QueryMemoryInitializer::setupTableFunctionGpuBuffers(
924  const QueryMemoryDescriptor& query_mem_desc,
925  const int device_id,
926  const unsigned block_size_x,
927  const unsigned grid_size_x) {
928  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
929  CHECK_GT(num_columns, size_t(0));
930 
931  const size_t column_size = num_rows_ * sizeof(int64_t);
932  const size_t groups_buffer_size = num_columns * (column_size == 0 ? 1 : column_size);
933  const size_t mem_size =
934  groups_buffer_size * (query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
935 
936  int8_t* dev_buffers_allocation{nullptr};
937  dev_buffers_allocation = device_allocator_->alloc(mem_size);
938  CHECK(dev_buffers_allocation);
939 
940  CUdeviceptr dev_buffers_mem = reinterpret_cast<CUdeviceptr>(dev_buffers_allocation);
941  const size_t step{block_size_x};
942  const size_t num_ptrs{block_size_x * grid_size_x};
943  std::vector<CUdeviceptr> dev_buffers(num_columns * num_ptrs);
944  auto dev_buffer = dev_buffers_mem;
945  for (size_t i = 0; i < num_ptrs; i += step) {
946  for (size_t j = 0; j < step; j += 1) {
947  for (size_t k = 0; k < num_columns; k++) {
948  dev_buffers[(i + j) * num_columns + k] = dev_buffer + k * column_size;
949  }
950  }
951  if (!query_mem_desc.blocksShareMemory()) {
952  dev_buffer += groups_buffer_size;
953  }
954  }
955 
956  auto dev_ptr = device_allocator_->alloc(num_columns * num_ptrs * sizeof(CUdeviceptr));
957  device_allocator_->copyToDevice(dev_ptr,
958  reinterpret_cast<int8_t*>(dev_buffers.data()),
959  num_columns * num_ptrs * sizeof(CUdeviceptr));
960 
961  return {reinterpret_cast<CUdeviceptr>(dev_ptr), dev_buffers_mem, (size_t)num_rows_};
962 }
963 
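// Copies table function output columns from the device buffer back into the host
// group-by buffer; when fewer entries were produced than allocated, each column is
// copied separately so the host columns end up densely packed.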
964 void QueryMemoryInitializer::copyFromTableFunctionGpuBuffers(
965  Data_Namespace::DataMgr* data_mgr,
966  const QueryMemoryDescriptor& query_mem_desc,
967  const size_t entry_count,
968  const GpuGroupByBuffers& gpu_group_by_buffers,
969  const int device_id,
970  const unsigned block_size_x,
971  const unsigned grid_size_x) {
972  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
973  const size_t column_size = entry_count * sizeof(int64_t);
974  const size_t orig_column_size = gpu_group_by_buffers.entry_count * sizeof(int64_t);
975  int8_t* dev_buffer = reinterpret_cast<int8_t*>(gpu_group_by_buffers.data);
976  int8_t* host_buffer = reinterpret_cast<int8_t*>(group_by_buffers_[0]);
977  CHECK_LE(column_size, orig_column_size);
978  if (orig_column_size == column_size) {
979  copy_from_gpu(data_mgr,
980  host_buffer,
981  reinterpret_cast<CUdeviceptr>(dev_buffer),
982  column_size * num_columns,
983  device_id);
984  } else {
985  for (size_t k = 0; k < num_columns; ++k) {
986  copy_from_gpu(data_mgr,
987  host_buffer,
988  reinterpret_cast<CUdeviceptr>(dev_buffer),
989  column_size,
990  device_id);
991  dev_buffer += orig_column_size;
992  host_buffer += column_size;
993  }
994  }
995 }
996 
997 #endif
998 
999 size_t QueryMemoryInitializer::computeNumberOfBuffers(
1000  const QueryMemoryDescriptor& query_mem_desc,
1001  const ExecutorDeviceType device_type,
1002  const Executor* executor) const {
1003  return device_type == ExecutorDeviceType::CPU
1004  ? 1
1005  : executor->blockSize() *
1006  (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
1007 }
1008 
1009 namespace {
1010 
1011 // in-place compaction of output buffer
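// Slides each non-lazy projected column down so that only projection_count rows remain
// per column; memmove is used when the source and destination ranges overlap.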
1012 void compact_projection_buffer_for_cpu_columnar(
1013  const QueryMemoryDescriptor& query_mem_desc,
1014  int8_t* projection_buffer,
1015  const size_t projection_count) {
1016  // the first column (row indices) remains unchanged.
1017  CHECK(projection_count <= query_mem_desc.getEntryCount());
1018  constexpr size_t row_index_width = sizeof(int64_t);
1019  size_t buffer_offset1{projection_count * row_index_width};
1020  // other columns are actual non-lazy columns for the projection:
1021  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
1022  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
1023  auto column_proj_size =
1024  projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
1025  auto buffer_offset2 = query_mem_desc.getColOffInBytes(i);
1026  if (buffer_offset1 + column_proj_size >= buffer_offset2) {
1027  // overlapping
1028  std::memmove(projection_buffer + buffer_offset1,
1029  projection_buffer + buffer_offset2,
1030  column_proj_size);
1031  } else {
1032  std::memcpy(projection_buffer + buffer_offset1,
1033  projection_buffer + buffer_offset2,
1034  column_proj_size);
1035  }
1036  buffer_offset1 += align_to_int64(column_proj_size);
1037  }
1038  }
1039 }
1040 
1041 } // namespace
1042 
1043 void QueryMemoryInitializer::compactProjectionBuffersCpu(
1044  const QueryMemoryDescriptor& query_mem_desc,
1045  const size_t projection_count) {
1046  const auto num_allocated_rows =
1047  std::min(projection_count, query_mem_desc.getEntryCount());
1048  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1049 
1050  // copy the results from the main buffer into projection_buffer
1051  compact_projection_buffer_for_cpu_columnar(
1052  query_mem_desc,
1053  reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
1054  num_allocated_rows);
1055 
1056  // update the entry count for the result set, and its underlying storage
1057  CHECK(!result_sets_.empty());
1058  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
1059 }
1060 
1061 void QueryMemoryInitializer::compactProjectionBuffersGpu(
1062  const QueryMemoryDescriptor& query_mem_desc,
1063  Data_Namespace::DataMgr* data_mgr,
1064  const GpuGroupByBuffers& gpu_group_by_buffers,
1065  const size_t projection_count,
1066  const int device_id) {
1067  // store total number of allocated rows:
1068  const auto num_allocated_rows =
1069  std::min(projection_count, query_mem_desc.getEntryCount());
1070 
1071  // copy the results from the main buffer into projection_buffer
1072  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1073  copy_projection_buffer_from_gpu_columnar(
1074  data_mgr,
1075  gpu_group_by_buffers,
1076  query_mem_desc,
1077  reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
1078  num_allocated_rows,
1079  device_id);
1080 
1081  // update the entry count for the result set, and its underlying storage
1082  CHECK(!result_sets_.empty());
1083  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
1084 }
1085 
1086 void QueryMemoryInitializer::copyGroupByBuffersFromGpu(
1087  Data_Namespace::DataMgr* data_mgr,
1088  const QueryMemoryDescriptor& query_mem_desc,
1089  const size_t entry_count,
1090  const GpuGroupByBuffers& gpu_group_by_buffers,
1091  const RelAlgExecutionUnit* ra_exe_unit,
1092  const unsigned block_size_x,
1093  const unsigned grid_size_x,
1094  const int device_id,
1095  const bool prepend_index_buffer) const {
1096  const auto thread_count = block_size_x * grid_size_x;
1097 
1098  size_t total_buff_size{0};
1099  if (ra_exe_unit && query_mem_desc.useStreamingTopN()) {
1100  const size_t n = ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit;
1101  total_buff_size =
1102  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
1103  } else {
1104  total_buff_size =
1105  query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
1106  }
1107  copy_group_by_buffers_from_gpu(data_mgr,
1108  group_by_buffers_,
1109  total_buff_size,
1110  gpu_group_by_buffers.data,
1111  query_mem_desc,
1112  block_size_x,
1113  grid_size_x,
1114  device_id,
1115  prepend_index_buffer,
1116  query_mem_desc.hasVarlenOutput());
1117 }
1118 
1120  const QueryMemoryDescriptor& query_mem_desc,
1121  const RelAlgExecutionUnit& ra_exe_unit) {
1122  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1123  CHECK_EQ(group_by_buffers_.size(), buffer_start_idx + 1);
1124 
1125  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
1126  group_by_buffers_[buffer_start_idx],
1127  query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
1128  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit,
1129  1);
1130  CHECK_EQ(rows_copy.size(),
1131  query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
1132  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
1133 }
1134 
1135 void QueryMemoryInitializer::applyStreamingTopNOffsetGpu(
1136  Data_Namespace::DataMgr* data_mgr,
1137  const QueryMemoryDescriptor& query_mem_desc,
1138  const GpuGroupByBuffers& gpu_group_by_buffers,
1139  const RelAlgExecutionUnit& ra_exe_unit,
1140  const unsigned total_thread_count,
1141  const int device_id) {
1142 #ifdef HAVE_CUDA
1144  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1145 
1146  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
1147  data_mgr,
1148  reinterpret_cast<int64_t*>(gpu_group_by_buffers.data),
1149  ra_exe_unit,
1150  query_mem_desc,
1151  total_thread_count,
1152  device_id);
1153  CHECK_EQ(
1154  rows_copy.size(),
1155  static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
1156  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
1157 #else
1158  UNREACHABLE();
1159 #endif
1160 }
1161 
1162 std::shared_ptr<VarlenOutputInfo> QueryMemoryInitializer::getVarlenOutputInfo() {
1163  if (varlen_output_info_) {
1164  return varlen_output_info_;
1165  }
1166 
1167  // shared_ptr so that both the ResultSet and QMI can hold on to the varlen info object
1168  // and update it as needed
1169  varlen_output_info_ = std::make_shared<VarlenOutputInfo>(VarlenOutputInfo{
1170  static_cast<int64_t>(varlen_output_buffer_), varlen_output_buffer_host_ptr_});
1171  return varlen_output_info_;
1172 }