OmniSciDB  94e8789169
QueryMemoryInitializer.cpp
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "QueryMemoryInitializer.h"
18 
19 #include "Execute.h"
20 #include "GpuInitGroups.h"
21 #include "GpuMemUtils.h"
22 #include "Logger/Logger.h"
23 #include "OutputBufferInitialization.h"
24 #include "ResultSet.h"
25 #include "StreamingTopN.h"
26 
27 #include <Shared/checked_alloc.h>
28 
29 // 8 GB, the limit of perfect hash group by under normal conditions
30 int64_t g_bitmap_memory_limit{8 * 1000 * 1000 * 1000L};
31 
32 namespace {
33 
34 void check_total_bitmap_memory(const QueryMemoryDescriptor& query_mem_desc) {
35  const int32_t groups_buffer_entry_count = query_mem_desc.getEntryCount();
36  checked_int64_t total_bytes_per_group = 0;
37  const size_t num_count_distinct_descs =
38  query_mem_desc.getCountDistinctDescriptorsSize();
39  for (size_t i = 0; i < num_count_distinct_descs; i++) {
40  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
41  if (count_distinct_desc.impl_type_ != CountDistinctImplType::Bitmap) {
42  continue;
43  }
44  total_bytes_per_group += count_distinct_desc.bitmapPaddedSizeBytes();
45  }
46  int64_t total_bytes{0};
47  // Using OutOfHostMemory until we can verify that SlabTooBig would also be properly
48  // caught
49  try {
50  total_bytes = static_cast<int64_t>(total_bytes_per_group * groups_buffer_entry_count);
51  } catch (...) {
52  // Absurd amount of memory, merely computing the number of bits overflows int64_t.
53  // Don't bother to report the real amount, this is unlikely to ever happen.
54  throw OutOfHostMemory(std::numeric_limits<int64_t>::max() / 8);
55  }
56  if (total_bytes >= g_bitmap_memory_limit) {
57  throw OutOfHostMemory(total_bytes);
58  }
59 }
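
A standalone sketch (not part of this listing) of the checked-arithmetic pattern used by check_total_bitmap_memory above; it assumes Boost.Multiprecision is available and uses hypothetical byte counts to show an overflowing product raising an exception instead of silently wrapping.

// Illustrative sketch only; values are hypothetical.
#include <boost/multiprecision/cpp_int.hpp>
#include <cstdint>
#include <iostream>

using checked_int64_t = boost::multiprecision::number<boost::multiprecision::cpp_int_backend<
    64, 64, boost::multiprecision::signed_magnitude, boost::multiprecision::checked, void>>;

int main() {
  checked_int64_t bytes_per_group = 10'000'000'000LL;  // hypothetical: 10 GB of bitmaps per group
  const int64_t entry_count = 2'000'000'000LL;         // hypothetical entry count
  try {
    const auto total = static_cast<int64_t>(bytes_per_group * entry_count);
    std::cout << "total bytes: " << total << '\n';
  } catch (const std::exception& e) {
    // 1e10 * 2e9 = 2e19 exceeds INT64_MAX (~9.2e18), so the checked type throws here.
    std::cout << "overflow detected: " << e.what() << '\n';
  }
}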
60 
61 int64_t* alloc_group_by_buffer(const size_t numBytes,
62  RenderAllocatorMap* render_allocator_map,
63  RowSetMemoryOwner* mem_owner) {
64  if (render_allocator_map) {
65  // NOTE(adb): If we got here, we are performing an in-situ rendering query and are not
66  // using CUDA buffers. Therefore we need to allocate result set storage using CPU
67  // memory.
68  const auto gpu_idx = 0; // Only 1 GPU supported in CUDA-disabled rendering mode
69  auto render_allocator_ptr = render_allocator_map->getRenderAllocator(gpu_idx);
70  return reinterpret_cast<int64_t*>(render_allocator_ptr->alloc(numBytes));
71  } else {
72  return reinterpret_cast<int64_t*>(mem_owner->allocate(numBytes));
73  }
74 }
75 
76 inline int64_t get_consistent_frag_size(const std::vector<uint64_t>& frag_offsets) {
77  if (frag_offsets.size() < 2) {
78  return int64_t(-1);
79  }
80  const auto frag_size = frag_offsets[1] - frag_offsets[0];
81  for (size_t i = 2; i < frag_offsets.size(); ++i) {
82  const auto curr_size = frag_offsets[i] - frag_offsets[i - 1];
83  if (curr_size != frag_size) {
84  return int64_t(-1);
85  }
86  }
87  return !frag_size ? std::numeric_limits<int64_t>::max()
88  : static_cast<int64_t>(frag_size);
89 }
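
A standalone restatement (for illustration only) of the helper above, fed with sample offsets to show its three outcomes: a uniform stride, -1 for irregular offsets, and INT64_MAX when all fragments are empty.

// Illustrative copy of the logic above; not part of the original file.
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

int64_t consistent_frag_size_example(const std::vector<uint64_t>& frag_offsets) {
  if (frag_offsets.size() < 2) {
    return -1;
  }
  const auto frag_size = frag_offsets[1] - frag_offsets[0];
  for (size_t i = 2; i < frag_offsets.size(); ++i) {
    if (frag_offsets[i] - frag_offsets[i - 1] != frag_size) {
      return -1;  // strides differ, no consistent fragment size
    }
  }
  return !frag_size ? std::numeric_limits<int64_t>::max()
                    : static_cast<int64_t>(frag_size);
}

int main() {
  std::cout << consistent_frag_size_example({0, 100, 200, 300}) << '\n';  // 100
  std::cout << consistent_frag_size_example({0, 100, 250}) << '\n';       // -1 (uneven)
  std::cout << consistent_frag_size_example({0, 0, 0}) << '\n';           // INT64_MAX (empty fragments)
}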
90 
91 inline std::vector<int64_t> get_consistent_frags_sizes(
92  const std::vector<std::vector<uint64_t>>& frag_offsets) {
93  if (frag_offsets.empty()) {
94  return {};
95  }
96  std::vector<int64_t> frag_sizes;
97  for (size_t tab_idx = 0; tab_idx < frag_offsets[0].size(); ++tab_idx) {
98  std::vector<uint64_t> tab_offs;
99  for (auto& offsets : frag_offsets) {
100  tab_offs.push_back(offsets[tab_idx]);
101  }
102  frag_sizes.push_back(get_consistent_frag_size(tab_offs));
103  }
104  return frag_sizes;
105 }
106 
107 inline std::vector<int64_t> get_consistent_frags_sizes(
108  const std::vector<Analyzer::Expr*>& target_exprs,
109  const std::vector<int64_t>& table_frag_sizes) {
110  std::vector<int64_t> col_frag_sizes;
111  for (auto expr : target_exprs) {
112  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
113  if (col_var->get_rte_idx() < 0) {
114  CHECK_EQ(-1, col_var->get_rte_idx());
115  col_frag_sizes.push_back(int64_t(-1));
116  } else {
117  col_frag_sizes.push_back(table_frag_sizes[col_var->get_rte_idx()]);
118  }
119  } else {
120  col_frag_sizes.push_back(int64_t(-1));
121  }
122  }
123  return col_frag_sizes;
124 }
125 
126 inline std::vector<std::vector<int64_t>> get_col_frag_offsets(
127  const std::vector<Analyzer::Expr*>& target_exprs,
128  const std::vector<std::vector<uint64_t>>& table_frag_offsets) {
129  std::vector<std::vector<int64_t>> col_frag_offsets;
130  for (auto& table_offsets : table_frag_offsets) {
131  std::vector<int64_t> col_offsets;
132  for (auto expr : target_exprs) {
133  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
134  if (col_var->get_rte_idx() < 0) {
135  CHECK_EQ(-1, col_var->get_rte_idx());
136  col_offsets.push_back(int64_t(-1));
137  } else {
138  CHECK_LT(static_cast<size_t>(col_var->get_rte_idx()), table_offsets.size());
139  col_offsets.push_back(
140  static_cast<int64_t>(table_offsets[col_var->get_rte_idx()]));
141  }
142  } else {
143  col_offsets.push_back(int64_t(-1));
144  }
145  }
146  col_frag_offsets.push_back(col_offsets);
147  }
148  return col_frag_offsets;
149 }
150 
151 } // namespace
152 
153 QueryMemoryInitializer::QueryMemoryInitializer(
154  const RelAlgExecutionUnit& ra_exe_unit,
155  const QueryMemoryDescriptor& query_mem_desc,
156  const int device_id,
157  const ExecutorDeviceType device_type,
158  const ExecutorDispatchMode dispatch_mode,
159  const bool output_columnar,
160  const bool sort_on_gpu,
161  const int64_t num_rows,
162  const std::vector<std::vector<const int8_t*>>& col_buffers,
163  const std::vector<std::vector<uint64_t>>& frag_offsets,
164  RenderAllocatorMap* render_allocator_map,
165  RenderInfo* render_info,
166  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
167  DeviceAllocator* device_allocator,
168  const Executor* executor)
169  : num_rows_(num_rows)
170  , row_set_mem_owner_(row_set_mem_owner)
171  , init_agg_vals_(executor->plan_state_->init_agg_vals_)
172  , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
173  , count_distinct_bitmap_mem_(0)
174  , count_distinct_bitmap_mem_bytes_(0)
175  , count_distinct_bitmap_crt_ptr_(nullptr)
176  , count_distinct_bitmap_host_mem_(nullptr)
177  , device_allocator_(device_allocator) {
178  CHECK(!sort_on_gpu || output_columnar);
179 
180  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
181  if (consistent_frag_sizes.empty()) {
182  // No fragments in the input, no underlying buffers will be needed.
183  return;
184  }
185  if (!ra_exe_unit.use_bump_allocator) {
186  check_total_bitmap_memory(query_mem_desc);
187  }
188  if (device_type == ExecutorDeviceType::GPU) {
189  allocateCountDistinctGpuMem(query_mem_desc);
190  }
191 
192  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
193  allocateCountDistinctBuffers(query_mem_desc, false, executor);
194  allocateTDigests(query_mem_desc, false, executor);
195  if (render_info && render_info->useCudaBuffers()) {
196  return;
197  }
198  }
199 
200  if (ra_exe_unit.estimator) {
201  return;
202  }
203 
204  const auto thread_count = device_type == ExecutorDeviceType::GPU
205  ? executor->blockSize() * executor->gridSize()
206  : 1;
207 
208  size_t group_buffer_size{0};
209  if (ra_exe_unit.use_bump_allocator) {
210  // For kernel per fragment execution, just allocate a buffer equivalent to the size of
211  // the fragment
212  if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
213  group_buffer_size = num_rows * query_mem_desc.getRowSize();
214  } else {
215  // otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size
216  group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
217  }
218  } else {
219  group_buffer_size =
220  query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
221  }
222  CHECK_GE(group_buffer_size, size_t(0));
223 
224  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
225  int64_t* group_by_buffer_template{nullptr};
226  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
227  group_by_buffer_template =
228  reinterpret_cast<int64_t*>(row_set_mem_owner_->allocate(group_buffer_size));
229  initGroupByBuffer(group_by_buffer_template,
230  ra_exe_unit,
231  query_mem_desc,
232  device_type,
233  output_columnar,
234  executor);
235  }
236 
237  if (query_mem_desc.interleavedBins(device_type)) {
238  CHECK(query_mem_desc.hasKeylessHash());
239  }
240 
241  const auto step = device_type == ExecutorDeviceType::GPU &&
242  query_mem_desc.threadsShareMemory() &&
243  query_mem_desc.isGroupBy()
244  ? executor->blockSize()
245  : size_t(1);
246  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
247  query_mem_desc.hasKeylessHash()
248  ? query_mem_desc.getEntryCount()
249  : size_t(0);
250  const auto actual_group_buffer_size =
251  group_buffer_size + index_buffer_qw * sizeof(int64_t);
252  CHECK_GE(actual_group_buffer_size, group_buffer_size);
253 
254  for (size_t i = 0; i < group_buffers_count; i += step) {
255  auto group_by_buffer = alloc_group_by_buffer(
256  actual_group_buffer_size, render_allocator_map, row_set_mem_owner_.get());
257  if (!query_mem_desc.lazyInitGroups(device_type)) {
258  if (group_by_buffer_template) {
259  memcpy(group_by_buffer + index_buffer_qw,
260  group_by_buffer_template,
261  group_buffer_size);
262  } else {
263  initGroupByBuffer(group_by_buffer + index_buffer_qw,
264  ra_exe_unit,
265  query_mem_desc,
266  device_type,
267  output_columnar,
268  executor);
269  }
270  }
271  group_by_buffers_.push_back(group_by_buffer);
272  for (size_t j = 1; j < step; ++j) {
273  group_by_buffers_.push_back(nullptr);
274  }
275  const auto column_frag_offsets =
276  get_col_frag_offsets(ra_exe_unit.target_exprs, frag_offsets);
277  const auto column_frag_sizes =
278  get_consistent_frags_sizes(ra_exe_unit.target_exprs, consistent_frag_sizes);
279  result_sets_.emplace_back(
280  new ResultSet(target_exprs_to_infos(ra_exe_unit.target_exprs, query_mem_desc),
281  executor->getColLazyFetchInfo(ra_exe_unit.target_exprs),
282  col_buffers,
283  column_frag_offsets,
284  column_frag_sizes,
285  device_type,
286  device_id,
287  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc),
288  row_set_mem_owner_,
289  executor->getCatalog(),
290  executor->blockSize(),
291  executor->gridSize()));
292  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
293  executor->plan_state_->init_agg_vals_);
294  for (size_t j = 1; j < step; ++j) {
295  result_sets_.emplace_back(nullptr);
296  }
297  }
298 }
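
A standalone sketch (hypothetical sizes, local names) of the buffer layout the constructor builds when GPU threads in a block share one output buffer: a single allocation per `step` slots, with nullptr placeholders keeping the vector indexable per buffer slot.

// Illustrative sketch only.
#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

int main() {
  const size_t num_buffers = 8;  // e.g. blockSize * gridSize for a tiny config
  const size_t step = 4;         // e.g. blockSize when threads share memory
  std::vector<std::unique_ptr<int64_t[]>> owned;
  std::vector<int64_t*> group_by_buffers;
  for (size_t i = 0; i < num_buffers; i += step) {
    owned.push_back(std::make_unique<int64_t[]>(16));  // one real buffer per block
    group_by_buffers.push_back(owned.back().get());
    for (size_t j = 1; j < step; ++j) {
      group_by_buffers.push_back(nullptr);  // placeholders for the block's other threads
    }
  }
  size_t real = 0;
  for (auto* p : group_by_buffers) {
    real += (p != nullptr);
  }
  std::cout << group_by_buffers.size() << " slots, " << real << " allocations\n";  // 8 slots, 2 allocations
}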
299 
300 QueryMemoryInitializer::QueryMemoryInitializer(
301  const TableFunctionExecutionUnit& exe_unit,
302  const QueryMemoryDescriptor& query_mem_desc,
303  const int device_id,
304  const ExecutorDeviceType device_type,
305  const int64_t num_rows,
306  const std::vector<std::vector<const int8_t*>>& col_buffers,
307  const std::vector<std::vector<uint64_t>>& frag_offsets,
308  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
309  DeviceAllocator* device_allocator,
310  const Executor* executor)
311  : num_rows_(num_rows)
312  , row_set_mem_owner_(row_set_mem_owner)
313  , init_agg_vals_(init_agg_val_vec(exe_unit.target_exprs, {}, query_mem_desc))
314  , num_buffers_(/*computeNumberOfBuffers(query_mem_desc, device_type, executor)*/ 1)
315  , count_distinct_bitmap_mem_(0)
316  , count_distinct_bitmap_mem_bytes_(0)
317  , count_distinct_bitmap_crt_ptr_(nullptr)
318  , count_distinct_bitmap_host_mem_(nullptr)
319  , device_allocator_(device_allocator) {
320  // Table functions output columnar, basically treat this as a projection
321  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
322  if (consistent_frag_sizes.empty()) {
323  // No fragments in the input, no underlying buffers will be needed.
324  return;
325  }
326 
327  size_t group_buffer_size{0};
328  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
329  group_buffer_size = num_rows_ * num_columns * sizeof(int64_t);
330  CHECK_GE(group_buffer_size, size_t(0));
331 
332  const auto index_buffer_qw =
333  device_type == ExecutorDeviceType::GPU && query_mem_desc.hasKeylessHash()
334  ? query_mem_desc.getEntryCount()
335  : size_t(0);
336  const auto actual_group_buffer_size =
337  group_buffer_size + index_buffer_qw * sizeof(int64_t);
338  CHECK_GE(actual_group_buffer_size, group_buffer_size);
339 
340  CHECK_EQ(num_buffers_, size_t(1));
341  auto group_by_buffer =
342  alloc_group_by_buffer(actual_group_buffer_size, nullptr, row_set_mem_owner.get());
343  if (!query_mem_desc.lazyInitGroups(device_type)) {
344  initColumnarGroups(
345  query_mem_desc, group_by_buffer + index_buffer_qw, init_agg_vals_, executor);
346  }
347  group_by_buffers_.push_back(group_by_buffer);
348 
349  const auto column_frag_offsets =
350  get_col_frag_offsets(exe_unit.target_exprs, frag_offsets);
351  const auto column_frag_sizes =
352  get_consistent_frags_sizes(exe_unit.target_exprs, consistent_frag_sizes);
353  result_sets_.emplace_back(
354  new ResultSet(target_exprs_to_infos(exe_unit.target_exprs, query_mem_desc),
355  {},
356  col_buffers,
357  column_frag_offsets,
358  column_frag_sizes,
359  device_type,
360  device_id,
361  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc),
362  row_set_mem_owner_,
363  executor->getCatalog(),
364  executor->blockSize(),
365  executor->gridSize()));
366  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
367  init_agg_vals_);
368 }
369 
370 void QueryMemoryInitializer::initGroupByBuffer(
371  int64_t* buffer,
372  const RelAlgExecutionUnit& ra_exe_unit,
373  const QueryMemoryDescriptor& query_mem_desc,
374  const ExecutorDeviceType device_type,
375  const bool output_columnar,
376  const Executor* executor) {
377  if (output_columnar) {
378  initColumnarGroups(query_mem_desc, buffer, init_agg_vals_, executor);
379  } else {
380  auto rows_ptr = buffer;
381  auto actual_entry_count = query_mem_desc.getEntryCount();
382  const auto thread_count = device_type == ExecutorDeviceType::GPU
383  ? executor->blockSize() * executor->gridSize()
384  : 1;
385  auto warp_size =
386  query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
387  if (query_mem_desc.useStreamingTopN()) {
388  const auto node_count_size = thread_count * sizeof(int64_t);
389  memset(rows_ptr, 0, node_count_size);
390  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
391  const auto rows_offset = streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
392  memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
393  rows_ptr += rows_offset / sizeof(int64_t);
394  actual_entry_count = n * thread_count;
395  warp_size = 1;
396  }
397  initGroups(query_mem_desc,
398  rows_ptr,
399  init_agg_vals_,
400  actual_entry_count,
401  warp_size,
402  executor);
403  }
404 }
405 
406 void QueryMemoryInitializer::initGroups(const QueryMemoryDescriptor& query_mem_desc,
407  int64_t* groups_buffer,
408  const std::vector<int64_t>& init_vals,
409  const int32_t groups_buffer_entry_count,
410  const size_t warp_size,
411  const Executor* executor) {
412  const size_t key_count{query_mem_desc.getGroupbyColCount()};
413  const size_t row_size{query_mem_desc.getRowSize()};
414  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};
415 
416  auto agg_bitmap_size = allocateCountDistinctBuffers(query_mem_desc, true, executor);
417  auto tdigest_deferred = allocateTDigests(query_mem_desc, true, executor);
418  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
419 
420  const auto query_mem_desc_fixedup =
421  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);
422 
423  if (query_mem_desc.hasKeylessHash()) {
424  CHECK(warp_size >= 1);
425  CHECK(key_count == 1 || warp_size == 1);
426  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
427  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
428  ++bin, buffer_ptr += row_size) {
429  initColumnPerRow(query_mem_desc_fixedup,
430  &buffer_ptr[col_base_off],
431  bin,
432  init_vals,
433  agg_bitmap_size,
434  tdigest_deferred);
435  }
436  }
437  return;
438  }
439 
440  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
441  ++bin, buffer_ptr += row_size) {
442  fill_empty_key(
443  buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
444  initColumnPerRow(query_mem_desc_fixedup,
445  &buffer_ptr[col_base_off],
446  bin,
447  init_vals,
448  agg_bitmap_size,
449  tdigest_deferred);
450  }
451 }
452 
453 namespace {
454 
455 template <typename T>
456 int8_t* initColumnarBuffer(T* buffer_ptr, const T init_val, const uint32_t entry_count) {
457  static_assert(sizeof(T) <= sizeof(int64_t), "Unsupported template type");
458  for (uint32_t i = 0; i < entry_count; ++i) {
459  buffer_ptr[i] = init_val;
460  }
461  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
462 }
463 
464 } // namespace
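
A standalone sketch of the columnar-slot initialization pattern used by initColumnarBuffer and initColumnarGroups below: each slot's entries are written contiguously, the returned pointer marks the start of the next slot, and slot boundaries stay 8-byte aligned. Names and sizes here are local to the sketch.

// Illustrative sketch only.
#include <cstdint>
#include <iostream>
#include <vector>

template <typename T>
int8_t* init_columnar_slot(T* slot_ptr, const T init_val, const uint32_t entry_count) {
  for (uint32_t i = 0; i < entry_count; ++i) {
    slot_ptr[i] = init_val;
  }
  return reinterpret_cast<int8_t*>(slot_ptr + entry_count);
}

inline int8_t* align_to_8(int8_t* addr) {
  const auto p = reinterpret_cast<uintptr_t>(addr);
  return reinterpret_cast<int8_t*>((p + 7) & ~uintptr_t(7));
}

int main() {
  constexpr uint32_t entry_count = 3;
  std::vector<int8_t> buffer(64, 0);
  int8_t* ptr = buffer.data();
  // A 4-byte slot initialized to -1, then an 8-byte slot initialized to 0.
  ptr = init_columnar_slot<int32_t>(reinterpret_cast<int32_t*>(ptr), -1, entry_count);
  ptr = align_to_8(ptr);  // 3 * 4 = 12 bytes -> padded to 16 before the next slot
  ptr = init_columnar_slot<int64_t>(reinterpret_cast<int64_t*>(ptr), 0, entry_count);
  std::cout << "bytes used: " << (ptr - buffer.data()) << '\n';  // 16 + 24 = 40
}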
465 
466 void QueryMemoryInitializer::initColumnarGroups(
467  const QueryMemoryDescriptor& query_mem_desc,
468  int64_t* groups_buffer,
469  const std::vector<int64_t>& init_vals,
470  const Executor* executor) {
471  CHECK(groups_buffer);
472  for (const auto target_expr : executor->plan_state_->target_exprs_) {
473  const auto agg_info = get_target_info(target_expr, g_bigint_count);
474  CHECK(!is_distinct_target(agg_info));
475  }
476  const int32_t agg_col_count = query_mem_desc.getSlotCount();
477  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
478 
479  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
480  if (!query_mem_desc.hasKeylessHash()) {
481  const size_t key_count{query_mem_desc.getGroupbyColCount()};
482  for (size_t i = 0; i < key_count; ++i) {
483  buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
484  EMPTY_KEY_64,
485  groups_buffer_entry_count);
486  }
487  }
488 
489  if (query_mem_desc.getQueryDescriptionType() != QueryDescriptionType::Projection) {
490  // initializing all aggregate columns:
491  int32_t init_val_idx = 0;
492  for (int32_t i = 0; i < agg_col_count; ++i) {
493  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
494  CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
495  switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
496  case 1:
497  buffer_ptr = initColumnarBuffer<int8_t>(
498  buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
499  break;
500  case 2:
501  buffer_ptr =
502  initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
503  init_vals[init_val_idx++],
504  groups_buffer_entry_count);
505  break;
506  case 4:
507  buffer_ptr =
508  initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
509  init_vals[init_val_idx++],
510  groups_buffer_entry_count);
511  break;
512  case 8:
513  buffer_ptr =
514  initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
515  init_vals[init_val_idx++],
516  groups_buffer_entry_count);
517  break;
518  case 0:
519  break;
520  default:
521  CHECK(false);
522  }
523 
524  buffer_ptr = align_to_int64(buffer_ptr);
525  }
526  }
527  }
528 }
529 
530 void QueryMemoryInitializer::initColumnPerRow(const QueryMemoryDescriptor& query_mem_desc,
531  int8_t* row_ptr,
532  const size_t bin,
533  const std::vector<int64_t>& init_vals,
534  const std::vector<int64_t>& bitmap_sizes,
535  const std::vector<bool>& tdigest_deferred) {
536  int8_t* col_ptr = row_ptr;
537  size_t init_vec_idx = 0;
538  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
539  col_ptr += query_mem_desc.getNextColOffInBytes(col_ptr, bin, col_idx++)) {
540  const int64_t bm_sz{bitmap_sizes[col_idx]};
541  int64_t init_val{0};
542  if (bm_sz && query_mem_desc.isGroupBy()) {
543  // COUNT DISTINCT / APPROX_COUNT_DISTINCT
544  CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
545  sizeof(int64_t));
546  init_val =
547  bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
548  ++init_vec_idx;
549  } else if (query_mem_desc.isGroupBy() && tdigest_deferred[col_idx]) {
550  // allocate for APPROX_MEDIAN only when slot is used
551  init_val = reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest());
552  ++init_vec_idx;
553  } else {
554  if (query_mem_desc.getPaddedSlotWidthBytes(col_idx) > 0) {
555  CHECK_LT(init_vec_idx, init_vals.size());
556  init_val = init_vals[init_vec_idx++];
557  }
558  }
559  switch (query_mem_desc.getPaddedSlotWidthBytes(col_idx)) {
560  case 1:
561  *col_ptr = static_cast<int8_t>(init_val);
562  break;
563  case 2:
564  *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
565  break;
566  case 4:
567  *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
568  break;
569  case 8:
570  *reinterpret_cast<int64_t*>(col_ptr) = init_val;
571  break;
572  case 0:
573  continue;
574  default:
575  CHECK(false);
576  }
577  }
578 }
579 
580 void QueryMemoryInitializer::allocateCountDistinctGpuMem(
581  const QueryMemoryDescriptor& query_mem_desc) {
582  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
583  return;
584  }
585  CHECK(device_allocator_);
586 
587  size_t total_bytes_per_entry{0};
588  const size_t num_count_distinct_descs =
589  query_mem_desc.getCountDistinctDescriptorsSize();
590  for (size_t i = 0; i < num_count_distinct_descs; i++) {
591  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
592  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
593  continue;
594  }
595  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
596  total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
597  }
598 
599  count_distinct_bitmap_mem_bytes_ =
600  total_bytes_per_entry * query_mem_desc.getEntryCount();
601  count_distinct_bitmap_mem_ = reinterpret_cast<CUdeviceptr>(
602  device_allocator_->alloc(count_distinct_bitmap_mem_bytes_));
603  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(count_distinct_bitmap_mem_),
604  count_distinct_bitmap_mem_bytes_);
605 
606  count_distinct_bitmap_crt_ptr_ = count_distinct_bitmap_host_mem_ =
607  row_set_mem_owner_->allocateCountDistinctBuffer(count_distinct_bitmap_mem_bytes_);
608 }
609 
610 // deferred is true for group by queries; initGroups will allocate a bitmap
611 // for each group slot
612 std::vector<int64_t> QueryMemoryInitializer::allocateCountDistinctBuffers(
613  const QueryMemoryDescriptor& query_mem_desc,
614  const bool deferred,
615  const Executor* executor) {
616  const size_t agg_col_count{query_mem_desc.getSlotCount()};
617  std::vector<int64_t> agg_bitmap_size(deferred ? agg_col_count : 0);
618 
619  CHECK_GE(agg_col_count, executor->plan_state_->target_exprs_.size());
620  for (size_t target_idx = 0; target_idx < executor->plan_state_->target_exprs_.size();
621  ++target_idx) {
622  const auto target_expr = executor->plan_state_->target_exprs_[target_idx];
623  const auto agg_info = get_target_info(target_expr, g_bigint_count);
624  if (is_distinct_target(agg_info)) {
625  CHECK(agg_info.is_agg &&
626  (agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT));
627  CHECK(!agg_info.sql_type.is_varlen());
628 
629  const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
630  CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);
631 
632  CHECK_EQ(static_cast<size_t>(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)),
633  sizeof(int64_t));
634  const auto& count_distinct_desc =
635  query_mem_desc.getCountDistinctDescriptor(target_idx);
636  CHECK(count_distinct_desc.impl_type_ != CountDistinctImplType::Invalid);
637  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
638  const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
639  if (deferred) {
640  agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
641  } else {
642  init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
643  }
644  } else {
645  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::StdSet);
646  if (deferred) {
647  agg_bitmap_size[agg_col_idx] = -1;
648  } else {
649  init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
650  }
651  }
652  }
653  }
654 
655  return agg_bitmap_size;
656 }
657 
658 int64_t QueryMemoryInitializer::allocateCountDistinctBitmap(const size_t bitmap_byte_sz) {
659  if (count_distinct_bitmap_host_mem_) {
660  CHECK(count_distinct_bitmap_crt_ptr_);
661  auto ptr = count_distinct_bitmap_crt_ptr_;
662  count_distinct_bitmap_crt_ptr_ += bitmap_byte_sz;
663  row_set_mem_owner_->addCountDistinctBuffer(
664  ptr, bitmap_byte_sz, /*physial_buffer=*/false);
665  return reinterpret_cast<int64_t>(ptr);
666  }
667  return reinterpret_cast<int64_t>(
668  row_set_mem_owner_->allocateCountDistinctBuffer(bitmap_byte_sz));
669 }
670 
671 int64_t QueryMemoryInitializer::allocateCountDistinctSet() {
672  auto count_distinct_set = new std::set<int64_t>();
673  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
674  return reinterpret_cast<int64_t>(count_distinct_set);
675 }
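
A standalone sketch of the handle scheme used by allocateCountDistinctBitmap and allocateCountDistinctSet above: count-distinct state is heap-allocated and its address is carried through the 64-bit aggregate slot so later aggregate updates can recover the pointer. This is illustration only, with local ownership instead of RowSetMemoryOwner.

// Illustrative sketch only.
#include <cstdint>
#include <iostream>
#include <set>

int main() {
  auto* distinct_set = new std::set<int64_t>();
  // Store the pointer in an int64_t aggregate slot, as init_agg_vals_ does.
  const int64_t slot_value = reinterpret_cast<int64_t>(distinct_set);

  // Later, an aggregate update recovers the pointer from the slot and mutates the set.
  auto* recovered = reinterpret_cast<std::set<int64_t>*>(slot_value);
  recovered->insert(42);
  recovered->insert(42);  // duplicate, ignored by the set
  std::cout << "distinct count: " << recovered->size() << '\n';  // 1

  delete distinct_set;  // in the real code, RowSetMemoryOwner owns this memory
  return 0;
}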
676 
677 std::vector<bool> QueryMemoryInitializer::allocateTDigests(
678  const QueryMemoryDescriptor& query_mem_desc,
679  const bool deferred,
680  const Executor* executor) {
681  size_t const slot_count = query_mem_desc.getSlotCount();
682  size_t const ntargets = executor->plan_state_->target_exprs_.size();
683  CHECK_GE(slot_count, ntargets);
684  std::vector<bool> tdigest_deferred(deferred ? slot_count : 0);
685 
686  for (size_t target_idx = 0; target_idx < ntargets; ++target_idx) {
687  auto const target_expr = executor->plan_state_->target_exprs_[target_idx];
688  if (auto const agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr)) {
689  if (agg_expr->get_aggtype() == kAPPROX_MEDIAN) {
690  size_t const agg_col_idx =
691  query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
692  CHECK_LT(agg_col_idx, slot_count);
693  CHECK_EQ(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx),
694  static_cast<int8_t>(sizeof(int64_t)));
695  if (deferred) {
696  tdigest_deferred[agg_col_idx] = true;
697  } else {
698  // allocate for APPROX_MEDIAN only when slot is used
699  init_agg_vals_[agg_col_idx] =
700  reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest());
701  }
702  }
703  }
704  }
705  return tdigest_deferred;
706 }
707 
708 #ifdef HAVE_CUDA
709 GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer(
710  const QueryMemoryDescriptor& query_mem_desc,
711  const CUdeviceptr init_agg_vals_dev_ptr,
712  const size_t n,
713  const int device_id,
714  const unsigned block_size_x,
715  const unsigned grid_size_x) {
717  const auto thread_count = block_size_x * grid_size_x;
718  const auto total_buff_size =
719  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
720  CUdeviceptr dev_buffer =
721  reinterpret_cast<CUdeviceptr>(device_allocator_->alloc(total_buff_size));
722 
723  std::vector<CUdeviceptr> dev_buffers(thread_count);
724 
725  for (size_t i = 0; i < thread_count; ++i) {
726  dev_buffers[i] = dev_buffer;
727  }
728 
729  auto dev_ptr = device_allocator_->alloc(thread_count * sizeof(CUdeviceptr));
730  device_allocator_->copyToDevice(dev_ptr,
731  reinterpret_cast<int8_t*>(dev_buffers.data()),
732  thread_count * sizeof(CUdeviceptr));
733 
735 
736  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(dev_buffer),
737  thread_count * sizeof(int64_t));
738 
739  device_allocator_->setDeviceMem(
740  reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
741  (unsigned char)-1,
742  thread_count * n * sizeof(int64_t));
743 
744  init_group_by_buffer_on_device(
745  reinterpret_cast<int64_t*>(
746  dev_buffer + streaming_top_n::get_rows_offset_of_heaps(n, thread_count)),
747  reinterpret_cast<int64_t*>(init_agg_vals_dev_ptr),
748  n * thread_count,
749  query_mem_desc.getGroupbyColCount(),
750  query_mem_desc.getEffectiveKeyWidth(),
751  query_mem_desc.getRowSize() / sizeof(int64_t),
752  query_mem_desc.hasKeylessHash(),
753  1,
754  block_size_x,
755  grid_size_x);
756 
757  return {reinterpret_cast<CUdeviceptr>(dev_ptr), dev_buffer};
758 }
759 
760 GpuGroupByBuffers QueryMemoryInitializer::createAndInitializeGroupByBufferGpu(
761  const RelAlgExecutionUnit& ra_exe_unit,
762  const QueryMemoryDescriptor& query_mem_desc,
763  const CUdeviceptr init_agg_vals_dev_ptr,
764  const int device_id,
765  const ExecutorDispatchMode dispatch_mode,
766  const unsigned block_size_x,
767  const unsigned grid_size_x,
768  const int8_t warp_size,
769  const bool can_sort_on_gpu,
770  const bool output_columnar,
771  RenderAllocator* render_allocator) {
772  if (query_mem_desc.useStreamingTopN()) {
773  if (render_allocator) {
774  throw StreamingTopNNotSupportedInRenderQuery();
775  }
776  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
777  CHECK(!output_columnar);
778 
779  return prepareTopNHeapsDevBuffer(
780  query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
781  }
782 
783  auto dev_group_by_buffers = create_dev_group_by_buffers(device_allocator_,
784  group_by_buffers_,
785  query_mem_desc,
786  block_size_x,
787  grid_size_x,
788  device_id,
789  dispatch_mode,
790  num_rows_,
791  can_sort_on_gpu,
792  false,
793  ra_exe_unit.use_bump_allocator,
794  render_allocator);
795 
796  if (render_allocator) {
797  CHECK_EQ(size_t(0), render_allocator->getAllocatedSize() % 8);
798  }
799  if (query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU)) {
800  CHECK(!render_allocator);
801 
802  const size_t step{query_mem_desc.threadsShareMemory() ? block_size_x : 1};
803  size_t groups_buffer_size{query_mem_desc.getBufferSizeBytes(
804  ExecutorDeviceType::GPU, dev_group_by_buffers.entry_count)};
805  auto group_by_dev_buffer = dev_group_by_buffers.second;
806  const size_t col_count = query_mem_desc.getSlotCount();
807  int8_t* col_widths_dev_ptr{nullptr};
808  if (output_columnar) {
809  std::vector<int8_t> compact_col_widths(col_count);
810  for (size_t idx = 0; idx < col_count; ++idx) {
811  compact_col_widths[idx] = query_mem_desc.getPaddedSlotWidthBytes(idx);
812  }
813  col_widths_dev_ptr = device_allocator_->alloc(col_count * sizeof(int8_t));
814  device_allocator_->copyToDevice(
815  col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
816  }
817  const int8_t warp_count =
818  query_mem_desc.interleavedBins(ExecutorDeviceType::GPU) ? warp_size : 1;
819  for (size_t i = 0; i < getGroupByBuffersSize(); i += step) {
820  if (output_columnar) {
821  init_columnar_group_by_buffer_on_device(
822  reinterpret_cast<int64_t*>(group_by_dev_buffer),
823  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
824  dev_group_by_buffers.entry_count,
825  query_mem_desc.getGroupbyColCount(),
826  col_count,
827  col_widths_dev_ptr,
828  /*need_padding = */ true,
829  query_mem_desc.hasKeylessHash(),
830  sizeof(int64_t),
831  block_size_x,
832  grid_size_x);
833  } else {
834  init_group_by_buffer_on_device(reinterpret_cast<int64_t*>(group_by_dev_buffer),
835  reinterpret_cast<int64_t*>(init_agg_vals_dev_ptr),
836  dev_group_by_buffers.entry_count,
837  query_mem_desc.getGroupbyColCount(),
838  query_mem_desc.getEffectiveKeyWidth(),
839  query_mem_desc.getRowSize() / sizeof(int64_t),
840  query_mem_desc.hasKeylessHash(),
841  warp_count,
842  block_size_x,
843  grid_size_x);
844  }
845  group_by_dev_buffer += groups_buffer_size;
846  }
847  }
848  return dev_group_by_buffers;
849 }
850 
851 GpuGroupByBuffers QueryMemoryInitializer::setupTableFunctionGpuBuffers(
852  const QueryMemoryDescriptor& query_mem_desc,
853  const int device_id,
854  const unsigned block_size_x,
855  const unsigned grid_size_x) {
856  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
857  CHECK_GT(num_columns, size_t(0));
858 
859  const size_t column_size = num_rows_ * sizeof(int64_t);
860  const size_t groups_buffer_size = num_columns * (column_size == 0 ? 1 : column_size);
861  const size_t mem_size =
862  groups_buffer_size * (query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
863 
864  int8_t* dev_buffers_allocation{nullptr};
865  dev_buffers_allocation = device_allocator_->alloc(mem_size);
866  CHECK(dev_buffers_allocation);
867 
868  CUdeviceptr dev_buffers_mem = reinterpret_cast<CUdeviceptr>(dev_buffers_allocation);
869  const size_t step{block_size_x};
870  const size_t num_ptrs{block_size_x * grid_size_x};
871  std::vector<CUdeviceptr> dev_buffers(num_columns * num_ptrs);
872  auto dev_buffer = dev_buffers_mem;
873  for (size_t i = 0; i < num_ptrs; i += step) {
874  for (size_t j = 0; j < step; j += 1) {
875  for (size_t k = 0; k < num_columns; k++) {
876  dev_buffers[(i + j) * num_columns + k] = dev_buffer + k * column_size;
877  }
878  }
879  if (!query_mem_desc.blocksShareMemory()) {
880  dev_buffer += groups_buffer_size;
881  }
882  }
883 
884  auto dev_ptr = device_allocator_->alloc(num_columns * num_ptrs * sizeof(CUdeviceptr));
885  device_allocator_->copyToDevice(dev_ptr,
886  reinterpret_cast<int8_t*>(dev_buffers.data()),
887  num_columns * num_ptrs * sizeof(CUdeviceptr));
888 
889  return {reinterpret_cast<CUdeviceptr>(dev_ptr), dev_buffers_mem, (size_t)num_rows_};
890 }
891 
892 void QueryMemoryInitializer::copyFromTableFunctionGpuBuffers(
893  Data_Namespace::DataMgr* data_mgr,
894  const QueryMemoryDescriptor& query_mem_desc,
895  const size_t entry_count,
896  const GpuGroupByBuffers& gpu_group_by_buffers,
897  const int device_id,
898  const unsigned block_size_x,
899  const unsigned grid_size_x) {
900  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
901  const size_t column_size = entry_count * sizeof(int64_t);
902  const size_t orig_column_size = gpu_group_by_buffers.entry_count * sizeof(int64_t);
903  int8_t* dev_buffer = reinterpret_cast<int8_t*>(gpu_group_by_buffers.second);
904  int8_t* host_buffer = reinterpret_cast<int8_t*>(group_by_buffers_[0]);
905  CHECK_LE(column_size, orig_column_size);
906  if (orig_column_size == column_size) {
907  copy_from_gpu(data_mgr,
908  host_buffer,
909  reinterpret_cast<CUdeviceptr>(dev_buffer),
910  column_size * num_columns,
911  device_id);
912  } else {
913  for (size_t k = 0; k < num_columns; ++k) {
914  copy_from_gpu(data_mgr,
915  host_buffer,
916  reinterpret_cast<CUdeviceptr>(dev_buffer),
917  column_size,
918  device_id);
919  dev_buffer += orig_column_size;
920  host_buffer += column_size;
921  }
922  }
923 }
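
A standalone, host-only sketch of the narrowing copy performed by copyFromTableFunctionGpuBuffers above when fewer rows were produced than the device buffer was sized for: the source advances by the original column stride while the destination is packed tightly. Plain memcpy stands in for copy_from_gpu; sizes are hypothetical.

// Illustrative sketch only.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  const size_t num_columns = 2;
  const size_t orig_entry_count = 4;  // device buffer sized for 4 rows per column
  const size_t entry_count = 2;       // only 2 rows actually produced
  const size_t orig_column_size = orig_entry_count * sizeof(int64_t);
  const size_t column_size = entry_count * sizeof(int64_t);

  std::vector<int64_t> device_like = {1, 2, -1, -1,     // column 0
                                      10, 20, -1, -1};  // column 1
  std::vector<int64_t> host(num_columns * entry_count);

  auto* src = reinterpret_cast<const int8_t*>(device_like.data());
  auto* dst = reinterpret_cast<int8_t*>(host.data());
  for (size_t k = 0; k < num_columns; ++k) {
    std::memcpy(dst, src, column_size);
    src += orig_column_size;  // skip the unused tail of the source column
    dst += column_size;       // pack the destination columns back to back
  }
  for (auto v : host) {
    std::cout << v << ' ';  // prints: 1 2 10 20
  }
  std::cout << '\n';
}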
924 
925 #endif
926 
927 size_t QueryMemoryInitializer::computeNumberOfBuffers(
928  const QueryMemoryDescriptor& query_mem_desc,
929  const ExecutorDeviceType device_type,
930  const Executor* executor) const {
931  return device_type == ExecutorDeviceType::CPU
932  ? 1
933  : executor->blockSize() *
934  (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
935 }
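
A standalone sketch of the buffer-count rule in computeNumberOfBuffers above, evaluated with hypothetical block and grid sizes: CPU execution gets one buffer, GPU execution gets one per thread in a block, times the grid size when blocks do not share memory.

// Illustrative sketch only.
#include <cstddef>
#include <iostream>

size_t number_of_buffers(bool is_gpu, bool blocks_share_memory,
                         size_t block_size, size_t grid_size) {
  return !is_gpu ? 1 : block_size * (blocks_share_memory ? 1 : grid_size);
}

int main() {
  std::cout << number_of_buffers(false, true, 1024, 16) << '\n';  // 1 (CPU)
  std::cout << number_of_buffers(true, true, 1024, 16) << '\n';   // 1024 (blocks share memory)
  std::cout << number_of_buffers(true, false, 1024, 16) << '\n';  // 16384 (one per GPU thread)
}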
936 
937 namespace {
938 
939 // in-place compaction of output buffer
940 void compact_projection_buffer_for_cpu_columnar(
941  const QueryMemoryDescriptor& query_mem_desc,
942  int8_t* projection_buffer,
943  const size_t projection_count) {
944  // the first column (row indices) remains unchanged.
945  CHECK(projection_count <= query_mem_desc.getEntryCount());
946  constexpr size_t row_index_width = sizeof(int64_t);
947  size_t buffer_offset1{projection_count * row_index_width};
948  // other columns are actual non-lazy columns for the projection:
949  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
950  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
951  auto column_proj_size =
952  projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
953  auto buffer_offset2 = query_mem_desc.getColOffInBytes(i);
954  if (buffer_offset1 + column_proj_size >= buffer_offset2) {
955  // overlapping
956  std::memmove(projection_buffer + buffer_offset1,
957  projection_buffer + buffer_offset2,
958  column_proj_size);
959  } else {
960  std::memcpy(projection_buffer + buffer_offset1,
961  projection_buffer + buffer_offset2,
962  column_proj_size);
963  }
964  buffer_offset1 += align_to_int64(column_proj_size);
965  }
966  }
967 }
968 
969 } // namespace
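
A standalone sketch of the in-place columnar compaction above, using a hypothetical two-column int64 layout and element offsets rather than byte offsets: a projection buffer sized for `entry_count` rows keeps only `projection_count` rows per column, sliding each column toward the front (memmove handles the overlap).

// Illustrative sketch only.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  constexpr size_t entry_count = 4;       // buffer was sized for 4 rows
  constexpr size_t projection_count = 2;  // only 2 rows were actually produced
  // One int64 row-index column followed by one int64 payload column.
  std::vector<int64_t> buffer = {0, 1, -1, -1,     // row indices (2 valid)
                                 10, 11, -1, -1};  // payload column (2 valid)

  // The row-index column stays put; the payload column slides to just after
  // the compacted row indices.
  const size_t dst_offset = projection_count;  // in int64 elements
  const size_t src_offset = entry_count;       // start of the payload column
  std::memmove(buffer.data() + dst_offset,
               buffer.data() + src_offset,
               projection_count * sizeof(int64_t));

  for (size_t i = 0; i < dst_offset + projection_count; ++i) {
    std::cout << buffer[i] << ' ';  // prints: 0 1 10 11
  }
  std::cout << '\n';
}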
970 
971 void QueryMemoryInitializer::compactProjectionBuffersCpu(
972  const QueryMemoryDescriptor& query_mem_desc,
973  const size_t projection_count) {
974  const auto num_allocated_rows =
975  std::min(projection_count, query_mem_desc.getEntryCount());
976 
977  // copy the results from the main buffer into projection_buffer
978  compact_projection_buffer_for_cpu_columnar(
979  query_mem_desc,
980  reinterpret_cast<int8_t*>(group_by_buffers_[0]),
981  num_allocated_rows);
982 
983  // update the entry count for the result set, and its underlying storage
984  CHECK(!result_sets_.empty());
985  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
986 }
987 
988 void QueryMemoryInitializer::compactProjectionBuffersGpu(
989  const QueryMemoryDescriptor& query_mem_desc,
990  Data_Namespace::DataMgr* data_mgr,
991  const GpuGroupByBuffers& gpu_group_by_buffers,
992  const size_t projection_count,
993  const int device_id) {
994  // store total number of allocated rows:
995  const auto num_allocated_rows =
996  std::min(projection_count, query_mem_desc.getEntryCount());
997 
998  // copy the results from the main buffer into projection_buffer
999  copy_projection_buffer_from_gpu_columnar(
1000  data_mgr,
1001  gpu_group_by_buffers,
1002  query_mem_desc,
1003  reinterpret_cast<int8_t*>(group_by_buffers_[0]),
1004  num_allocated_rows,
1005  device_id);
1006 
1007  // update the entry count for the result set, and its underlying storage
1008  CHECK(!result_sets_.empty());
1009  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
1010 }
1011 
1012 void QueryMemoryInitializer::copyGroupByBuffersFromGpu(
1013  Data_Namespace::DataMgr* data_mgr,
1014  const QueryMemoryDescriptor& query_mem_desc,
1015  const size_t entry_count,
1016  const GpuGroupByBuffers& gpu_group_by_buffers,
1017  const RelAlgExecutionUnit* ra_exe_unit,
1018  const unsigned block_size_x,
1019  const unsigned grid_size_x,
1020  const int device_id,
1021  const bool prepend_index_buffer) const {
1022  const auto thread_count = block_size_x * grid_size_x;
1023 
1024  size_t total_buff_size{0};
1025  if (ra_exe_unit && query_mem_desc.useStreamingTopN()) {
1026  const size_t n = ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit;
1027  total_buff_size =
1028  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
1029  } else {
1030  total_buff_size =
1031  query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
1032  }
1033  copy_group_by_buffers_from_gpu(data_mgr,
1034  group_by_buffers_,
1035  total_buff_size,
1036  gpu_group_by_buffers.second,
1037  query_mem_desc,
1038  block_size_x,
1039  grid_size_x,
1040  device_id,
1041  prepend_index_buffer);
1042 }
1043 
1044 void QueryMemoryInitializer::applyStreamingTopNOffsetCpu(
1045  const QueryMemoryDescriptor& query_mem_desc,
1046  const RelAlgExecutionUnit& ra_exe_unit) {
1047  CHECK_EQ(group_by_buffers_.size(), size_t(1));
1048 
1049  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
1050  group_by_buffers_[0],
1051  query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
1052  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit,
1053  1);
1054  CHECK_EQ(rows_copy.size(),
1055  query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
1056  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
1057 }
1058 
1059 void QueryMemoryInitializer::applyStreamingTopNOffsetGpu(
1060  Data_Namespace::DataMgr* data_mgr,
1061  const QueryMemoryDescriptor& query_mem_desc,
1062  const GpuGroupByBuffers& gpu_group_by_buffers,
1063  const RelAlgExecutionUnit& ra_exe_unit,
1064  const unsigned total_thread_count,
1065  const int device_id) {
1066 #ifdef HAVE_CUDA
1067  CHECK_EQ(group_by_buffers_.size(), size_t(1));
1068 
1069  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
1070  data_mgr,
1071  reinterpret_cast<int64_t*>(gpu_group_by_buffers.second),
1072  ra_exe_unit,
1073  query_mem_desc,
1074  total_thread_count,
1075  device_id);
1076  CHECK_EQ(
1077  rows_copy.size(),
1078  static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
1079  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
1080 #else
1081  UNREACHABLE();
1082 #endif
1083 }