QueryMemoryInitializer.cpp
1 /*
2  * Copyright 2019 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "QueryMemoryInitializer.h"
18 
19 #include "Execute.h"
20 #include "GpuInitGroups.h"
21 #include "GpuMemUtils.h"
22 #include "Logger/Logger.h"
23 #include "OutputBufferInitialization.h"
24 #include "ResultSet.h"
25 #include "StreamingTopN.h"
26 
27 #include <Shared/checked_alloc.h>
28 
29 // 8 GB, the limit of perfect hash group by under normal conditions
30 int64_t g_bitmap_memory_limit{8 * 1000 * 1000 * 1000L};
31 
32 namespace {
33 
34 void check_total_bitmap_memory(const QueryMemoryDescriptor& query_mem_desc) {
35  const int32_t groups_buffer_entry_count = query_mem_desc.getEntryCount();
36  checked_int64_t total_bytes_per_group = 0;
37  const size_t num_count_distinct_descs =
38  query_mem_desc.getCountDistinctDescriptorsSize();
39  for (size_t i = 0; i < num_count_distinct_descs; i++) {
40  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
41  if (count_distinct_desc.impl_type_ != CountDistinctImplType::Bitmap) {
42  continue;
43  }
44  total_bytes_per_group += count_distinct_desc.bitmapPaddedSizeBytes();
45  }
46  int64_t total_bytes{0};
47  // Using OutOfHostMemory until we can verify that SlabTooBig would also be properly
48  // caught
49  try {
50  total_bytes = static_cast<int64_t>(total_bytes_per_group * groups_buffer_entry_count);
51  } catch (...) {
52  // Absurd amount of memory, merely computing the number of bits overflows int64_t.
53  // Don't bother to report the real amount, this is unlikely to ever happen.
54  throw OutOfHostMemory(std::numeric_limits<int64_t>::max() / 8);
55  }
56  if (total_bytes >= g_bitmap_memory_limit) {
57  throw OutOfHostMemory(total_bytes);
58  }
59 }
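// Example (illustrative numbers): with an entry count of 1,048,576 and two
// Bitmap count-distinct targets padded to 4,096 bytes each, the projected
// footprint is 1,048,576 * 8,192 = 8,589,934,592 bytes, which exceeds
// g_bitmap_memory_limit and makes the check above throw OutOfHostMemory
// before any allocation is attempted.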
60 
61 int64_t* alloc_group_by_buffer(const size_t numBytes,
62  RenderAllocatorMap* render_allocator_map,
63  RowSetMemoryOwner* mem_owner) {
64  if (render_allocator_map) {
65  // NOTE(adb): If we got here, we are performing an in-situ rendering query and are not
66  // using CUDA buffers. Therefore we need to allocate result set storage using CPU
67  // memory.
68  const auto gpu_idx = 0; // Only 1 GPU supported in CUDA-disabled rendering mode
69  auto render_allocator_ptr = render_allocator_map->getRenderAllocator(gpu_idx);
70  return reinterpret_cast<int64_t*>(render_allocator_ptr->alloc(numBytes));
71  } else {
72  return reinterpret_cast<int64_t*>(mem_owner->allocate(numBytes));
73  }
74 }
75 
76 inline int64_t get_consistent_frag_size(const std::vector<uint64_t>& frag_offsets) {
77  if (frag_offsets.size() < 2) {
78  return int64_t(-1);
79  }
80  const auto frag_size = frag_offsets[1] - frag_offsets[0];
81  for (size_t i = 2; i < frag_offsets.size(); ++i) {
82  const auto curr_size = frag_offsets[i] - frag_offsets[i - 1];
83  if (curr_size != frag_size) {
84  return int64_t(-1);
85  }
86  }
87  return !frag_size ? std::numeric_limits<int64_t>::max()
88  : static_cast<int64_t>(frag_size);
89 }
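// Example (illustrative): offsets {0, 100, 200, 300} yield a consistent frag
// size of 100; {0, 100, 250} yields -1 (inconsistent); an all-zero offset
// vector yields std::numeric_limits<int64_t>::max() (zero stride).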
90 
91 inline std::vector<int64_t> get_consistent_frags_sizes(
92  const std::vector<std::vector<uint64_t>>& frag_offsets) {
93  if (frag_offsets.empty()) {
94  return {};
95  }
96  std::vector<int64_t> frag_sizes;
97  for (size_t tab_idx = 0; tab_idx < frag_offsets[0].size(); ++tab_idx) {
98  std::vector<uint64_t> tab_offs;
99  for (auto& offsets : frag_offsets) {
100  tab_offs.push_back(offsets[tab_idx]);
101  }
102  frag_sizes.push_back(get_consistent_frag_size(tab_offs));
103  }
104  return frag_sizes;
105 }
106 
107 inline std::vector<int64_t> get_consistent_frags_sizes(
108  const std::vector<Analyzer::Expr*>& target_exprs,
109  const std::vector<int64_t>& table_frag_sizes) {
110  std::vector<int64_t> col_frag_sizes;
111  for (auto expr : target_exprs) {
112  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
113  if (col_var->get_rte_idx() < 0) {
114  CHECK_EQ(-1, col_var->get_rte_idx());
115  col_frag_sizes.push_back(int64_t(-1));
116  } else {
117  col_frag_sizes.push_back(table_frag_sizes[col_var->get_rte_idx()]);
118  }
119  } else {
120  col_frag_sizes.push_back(int64_t(-1));
121  }
122  }
123  return col_frag_sizes;
124 }
125 
126 inline std::vector<std::vector<int64_t>> get_col_frag_offsets(
127  const std::vector<Analyzer::Expr*>& target_exprs,
128  const std::vector<std::vector<uint64_t>>& table_frag_offsets) {
129  std::vector<std::vector<int64_t>> col_frag_offsets;
130  for (auto& table_offsets : table_frag_offsets) {
131  std::vector<int64_t> col_offsets;
132  for (auto expr : target_exprs) {
133  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
134  if (col_var->get_rte_idx() < 0) {
135  CHECK_EQ(-1, col_var->get_rte_idx());
136  col_offsets.push_back(int64_t(-1));
137  } else {
138  CHECK_LT(static_cast<size_t>(col_var->get_rte_idx()), table_offsets.size());
139  col_offsets.push_back(
140  static_cast<int64_t>(table_offsets[col_var->get_rte_idx()]));
141  }
142  } else {
143  col_offsets.push_back(int64_t(-1));
144  }
145  }
146  col_frag_offsets.push_back(col_offsets);
147  }
148  return col_frag_offsets;
149 }
150 
151 } // namespace
152 
153 QueryMemoryInitializer::QueryMemoryInitializer(
154  const RelAlgExecutionUnit& ra_exe_unit,
155  const QueryMemoryDescriptor& query_mem_desc,
156  const int device_id,
157  const ExecutorDeviceType device_type,
158  const ExecutorDispatchMode dispatch_mode,
159  const bool output_columnar,
160  const bool sort_on_gpu,
161  const int64_t num_rows,
162  const std::vector<std::vector<const int8_t*>>& col_buffers,
163  const std::vector<std::vector<uint64_t>>& frag_offsets,
164  RenderAllocatorMap* render_allocator_map,
165  RenderInfo* render_info,
166  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
167  DeviceAllocator* device_allocator,
168  const Executor* executor)
169  : num_rows_(num_rows)
170  , row_set_mem_owner_(row_set_mem_owner)
171  , init_agg_vals_(executor->plan_state_->init_agg_vals_)
172  , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
173  , count_distinct_bitmap_mem_(0)
174  , count_distinct_bitmap_mem_bytes_(0)
175  , count_distinct_bitmap_crt_ptr_(nullptr)
176  , count_distinct_bitmap_host_mem_(nullptr)
177  , device_allocator_(device_allocator) {
178  CHECK(!sort_on_gpu || output_columnar);
179 
180  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
181  if (consistent_frag_sizes.empty()) {
182  // No fragments in the input, no underlying buffers will be needed.
183  return;
184  }
185  if (!ra_exe_unit.use_bump_allocator) {
186  check_total_bitmap_memory(query_mem_desc);
187  }
188  if (device_type == ExecutorDeviceType::GPU) {
189  allocateCountDistinctGpuMem(query_mem_desc);
190  }
191 
192  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
193  allocateCountDistinctBuffers(query_mem_desc, false, executor);
194  if (render_info && render_info->useCudaBuffers()) {
195  return;
196  }
197  }
198 
199  if (ra_exe_unit.estimator) {
200  return;
201  }
202 
203  const auto thread_count = device_type == ExecutorDeviceType::GPU
204  ? executor->blockSize() * executor->gridSize()
205  : 1;
206 
207  size_t group_buffer_size{0};
208  if (ra_exe_unit.use_bump_allocator) {
209  // For kernel per fragment execution, just allocate a buffer equivalent to the size of
210  // the fragment
211  if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
212  group_buffer_size = num_rows * query_mem_desc.getRowSize();
213  } else {
214  // otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size
215  group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
216  }
217  } else {
218  group_buffer_size =
219  query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
220  }
221  CHECK_GE(group_buffer_size, size_t(0));
222 
223  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
224  int64_t* group_by_buffer_template{nullptr};
225  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
226  group_by_buffer_template =
227  reinterpret_cast<int64_t*>(row_set_mem_owner_->allocate(group_buffer_size));
228  initGroupByBuffer(group_by_buffer_template,
229  ra_exe_unit,
230  query_mem_desc,
231  device_type,
232  output_columnar,
233  executor);
234  }
235 
236  if (query_mem_desc.interleavedBins(device_type)) {
237  CHECK(query_mem_desc.hasKeylessHash());
238  }
239 
240  const auto step = device_type == ExecutorDeviceType::GPU &&
241  query_mem_desc.threadsShareMemory() &&
242  query_mem_desc.isGroupBy()
243  ? executor->blockSize()
244  : size_t(1);
245  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
246  query_mem_desc.hasKeylessHash()
247  ? query_mem_desc.getEntryCount()
248  : size_t(0);
249  const auto actual_group_buffer_size =
250  group_buffer_size + index_buffer_qw * sizeof(int64_t);
251  CHECK_GE(actual_group_buffer_size, group_buffer_size);
252 
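// Layout of each buffer allocated below: an optional index buffer of
// index_buffer_qw int64 slots (only for GPU sort with a keyless hash),
// followed by the group-by buffer proper. When GPU threads share memory,
// `step` equals the block size, so one physical buffer is allocated per block
// and the remaining slots are padded with nullptr placeholders.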
253  for (size_t i = 0; i < group_buffers_count; i += step) {
254  auto group_by_buffer = alloc_group_by_buffer(
255  actual_group_buffer_size, render_allocator_map, row_set_mem_owner_.get());
256  if (!query_mem_desc.lazyInitGroups(device_type)) {
257  if (group_by_buffer_template) {
258  memcpy(group_by_buffer + index_buffer_qw,
259  group_by_buffer_template,
260  group_buffer_size);
261  } else {
262  initGroupByBuffer(group_by_buffer + index_buffer_qw,
263  ra_exe_unit,
264  query_mem_desc,
265  device_type,
266  output_columnar,
267  executor);
268  }
269  }
270  group_by_buffers_.push_back(group_by_buffer);
271  for (size_t j = 1; j < step; ++j) {
272  group_by_buffers_.push_back(nullptr);
273  }
274  const auto column_frag_offsets =
275  get_col_frag_offsets(ra_exe_unit.target_exprs, frag_offsets);
276  const auto column_frag_sizes =
277  get_consistent_frags_sizes(ra_exe_unit.target_exprs, consistent_frag_sizes);
278  result_sets_.emplace_back(
279  new ResultSet(target_exprs_to_infos(ra_exe_unit.target_exprs, query_mem_desc),
280  executor->getColLazyFetchInfo(ra_exe_unit.target_exprs),
281  col_buffers,
282  column_frag_offsets,
283  column_frag_sizes,
284  device_type,
285  device_id,
286  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc),
287  row_set_mem_owner_,
288  executor));
289  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
290  executor->plan_state_->init_agg_vals_);
291  for (size_t j = 1; j < step; ++j) {
292  result_sets_.emplace_back(nullptr);
293  }
294  }
295 }
296 
297 QueryMemoryInitializer::QueryMemoryInitializer(
298  const TableFunctionExecutionUnit& exe_unit,
299  const QueryMemoryDescriptor& query_mem_desc,
300  const int device_id,
301  const ExecutorDeviceType device_type,
302  const int64_t num_rows,
303  const std::vector<std::vector<const int8_t*>>& col_buffers,
304  const std::vector<std::vector<uint64_t>>& frag_offsets,
305  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
306  DeviceAllocator* device_allocator,
307  const Executor* executor)
308  : num_rows_(num_rows)
309  , row_set_mem_owner_(row_set_mem_owner)
310  , init_agg_vals_(init_agg_val_vec(exe_unit.target_exprs, {}, query_mem_desc))
311  , num_buffers_(/*computeNumberOfBuffers(query_mem_desc, device_type, executor)*/ 1)
312  , count_distinct_bitmap_mem_(0)
313  , count_distinct_bitmap_mem_bytes_(0)
314  , count_distinct_bitmap_crt_ptr_(nullptr)
315  , count_distinct_bitmap_host_mem_(nullptr)
316  , device_allocator_(device_allocator) {
317  // Table functions output columnar, basically treat this as a projection
318  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
319  if (consistent_frag_sizes.empty()) {
320  // No fragments in the input, no underlying buffers will be needed.
321  return;
322  }
323 
324  size_t group_buffer_size{0};
325  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
326  group_buffer_size = num_rows_ * num_columns * sizeof(int64_t);
327  CHECK_GE(group_buffer_size, size_t(0));
328 
329  const auto index_buffer_qw =
330  device_type == ExecutorDeviceType::GPU && query_mem_desc.hasKeylessHash()
331  ? query_mem_desc.getEntryCount()
332  : size_t(0);
333  const auto actual_group_buffer_size =
334  group_buffer_size + index_buffer_qw * sizeof(int64_t);
335  CHECK_GE(actual_group_buffer_size, group_buffer_size);
336 
337  CHECK_EQ(num_buffers_, size_t(1));
338  auto group_by_buffer =
339  alloc_group_by_buffer(actual_group_buffer_size, nullptr, row_set_mem_owner.get());
340  if (!query_mem_desc.lazyInitGroups(device_type)) {
341  initColumnarGroups(
342  query_mem_desc, group_by_buffer + index_buffer_qw, init_agg_vals_, executor);
343  }
344  group_by_buffers_.push_back(group_by_buffer);
345 
346  const auto column_frag_offsets =
347  get_col_frag_offsets(exe_unit.target_exprs, frag_offsets);
348  const auto column_frag_sizes =
349  get_consistent_frags_sizes(exe_unit.target_exprs, consistent_frag_sizes);
350  result_sets_.emplace_back(
351  new ResultSet(target_exprs_to_infos(exe_unit.target_exprs, query_mem_desc),
352  {},
353  col_buffers,
354  column_frag_offsets,
355  column_frag_sizes,
356  device_type,
357  device_id,
358  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc),
359  row_set_mem_owner_,
360  executor));
361  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
362  init_agg_vals_);
363 }
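// Table function output is always columnar: a single buffer of
// num_rows_ * getBufferColSlotCount() int64 slots is allocated up front and
// wrapped in a single ResultSet.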
364 
365 void QueryMemoryInitializer::initGroupByBuffer(
366  int64_t* buffer,
367  const RelAlgExecutionUnit& ra_exe_unit,
368  const QueryMemoryDescriptor& query_mem_desc,
369  const ExecutorDeviceType device_type,
370  const bool output_columnar,
371  const Executor* executor) {
372  if (output_columnar) {
373  initColumnarGroups(query_mem_desc, buffer, init_agg_vals_, executor);
374  } else {
375  auto rows_ptr = buffer;
376  auto actual_entry_count = query_mem_desc.getEntryCount();
377  const auto thread_count = device_type == ExecutorDeviceType::GPU
378  ? executor->blockSize() * executor->gridSize()
379  : 1;
380  auto warp_size =
381  query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
382  if (query_mem_desc.useStreamingTopN()) {
383  const auto node_count_size = thread_count * sizeof(int64_t);
384  memset(rows_ptr, 0, node_count_size);
385  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
386  const auto rows_offset = streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
387  memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
388  rows_ptr += rows_offset / sizeof(int64_t);
389  actual_entry_count = n * thread_count;
390  warp_size = 1;
391  }
392  initGroups(query_mem_desc,
393  rows_ptr,
394  init_agg_vals_,
395  actual_entry_count,
396  warp_size,
397  executor);
398  }
399 }
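// Streaming top-N row buffer layout used above: thread_count int64 node
// counts (zeroed), then the per-thread heap index area (filled with -1 as
// "empty"), with the heap rows starting at
// streaming_top_n::get_rows_offset_of_heaps(n, thread_count); only those rows
// are initialized by initGroups, n entries per thread.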
400 
401 void QueryMemoryInitializer::initGroups(const QueryMemoryDescriptor& query_mem_desc,
402  int64_t* groups_buffer,
403  const std::vector<int64_t>& init_vals,
404  const int32_t groups_buffer_entry_count,
405  const size_t warp_size,
406  const Executor* executor) {
407  const size_t key_count{query_mem_desc.getGroupbyColCount()};
408  const size_t row_size{query_mem_desc.getRowSize()};
409  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};
410 
411  auto agg_bitmap_size = allocateCountDistinctBuffers(query_mem_desc, true, executor);
412  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
413 
414  const auto query_mem_desc_fixedup =
415  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);
416 
417  if (query_mem_desc.hasKeylessHash()) {
418  CHECK(warp_size >= 1);
419  CHECK(key_count == 1 || warp_size == 1);
420  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
421  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
422  ++bin, buffer_ptr += row_size) {
423  initColumnPerRow(query_mem_desc_fixedup,
424  &buffer_ptr[col_base_off],
425  bin,
426  init_vals,
427  agg_bitmap_size);
428  }
429  }
430  return;
431  }
432 
433  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
434  ++bin, buffer_ptr += row_size) {
435  fill_empty_key(
436  buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
437  initColumnPerRow(query_mem_desc_fixedup,
438  &buffer_ptr[col_base_off],
439  bin,
440  init_vals,
441  agg_bitmap_size);
442  }
443 }
444 
445 namespace {
446 
447 template <typename T>
448 int8_t* initColumnarBuffer(T* buffer_ptr, const T init_val, const uint32_t entry_count) {
449  static_assert(sizeof(T) <= sizeof(int64_t), "Unsupported template type");
450  for (uint32_t i = 0; i < entry_count; ++i) {
451  buffer_ptr[i] = init_val;
452  }
453  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
454 }
455 
456 } // namespace
457 
458 void QueryMemoryInitializer::initColumnarGroups(
459  const QueryMemoryDescriptor& query_mem_desc,
460  int64_t* groups_buffer,
461  const std::vector<int64_t>& init_vals,
462  const Executor* executor) {
463  CHECK(groups_buffer);
464  for (const auto target_expr : executor->plan_state_->target_exprs_) {
465  const auto agg_info = get_target_info(target_expr, g_bigint_count);
466  CHECK(!is_distinct_target(agg_info));
467  }
468  const int32_t agg_col_count = query_mem_desc.getSlotCount();
469  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
470 
471  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
472  if (!query_mem_desc.hasKeylessHash()) {
473  const size_t key_count{query_mem_desc.getGroupbyColCount()};
474  for (size_t i = 0; i < key_count; ++i) {
475  buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
476  EMPTY_KEY_64,
477  groups_buffer_entry_count);
478  }
479  }
480 
481  if (query_mem_desc.getQueryDescriptionType() != QueryDescriptionType::Projection) {
482  // initializing all aggregate columns:
483  int32_t init_val_idx = 0;
484  for (int32_t i = 0; i < agg_col_count; ++i) {
485  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
486  CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
487  switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
488  case 1:
489  buffer_ptr = initColumnarBuffer<int8_t>(
490  buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
491  break;
492  case 2:
493  buffer_ptr =
494  initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
495  init_vals[init_val_idx++],
496  groups_buffer_entry_count);
497  break;
498  case 4:
499  buffer_ptr =
500  initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
501  init_vals[init_val_idx++],
502  groups_buffer_entry_count);
503  break;
504  case 8:
505  buffer_ptr =
506  initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
507  init_vals[init_val_idx++],
508  groups_buffer_entry_count);
509  break;
510  case 0:
511  break;
512  default:
513  CHECK(false);
514  }
515 
516  buffer_ptr = align_to_int64(buffer_ptr);
517  }
518  }
519  }
520 }
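// Columnar layout produced above: one EMPTY_KEY_64 column per group-by key
// (skipped for keyless hash), followed by one column per non-empty aggregate
// slot, each written at its padded width and aligned up to the next int64
// boundary before the next column begins.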
521 
522 void QueryMemoryInitializer::initColumnPerRow(const QueryMemoryDescriptor& query_mem_desc,
523  int8_t* row_ptr,
524  const size_t bin,
525  const std::vector<int64_t>& init_vals,
526  const std::vector<int64_t>& bitmap_sizes) {
527  int8_t* col_ptr = row_ptr;
528  size_t init_vec_idx = 0;
529  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
530  col_ptr += query_mem_desc.getNextColOffInBytes(col_ptr, bin, col_idx++)) {
531  const int64_t bm_sz{bitmap_sizes[col_idx]};
532  int64_t init_val{0};
533  if (!bm_sz || !query_mem_desc.isGroupBy()) {
534  if (query_mem_desc.getPaddedSlotWidthBytes(col_idx) > 0) {
535  CHECK_LT(init_vec_idx, init_vals.size());
536  init_val = init_vals[init_vec_idx++];
537  }
538  } else {
539  CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
540  sizeof(int64_t));
541  init_val =
542  bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
543  ++init_vec_idx;
544  }
545  switch (query_mem_desc.getPaddedSlotWidthBytes(col_idx)) {
546  case 1:
547  *col_ptr = static_cast<int8_t>(init_val);
548  break;
549  case 2:
550  *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
551  break;
552  case 4:
553  *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
554  break;
555  case 8:
556  *reinterpret_cast<int64_t*>(col_ptr) = init_val;
557  break;
558  case 0:
559  continue;
560  default:
561  CHECK(false);
562  }
563  }
564 }
565 
566 void QueryMemoryInitializer::allocateCountDistinctGpuMem(
567  const QueryMemoryDescriptor& query_mem_desc) {
568  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
569  return;
570  }
571  CHECK(device_allocator_);
572 
573  size_t total_bytes_per_entry{0};
574  const size_t num_count_distinct_descs =
575  query_mem_desc.getCountDistinctDescriptorsSize();
576  for (size_t i = 0; i < num_count_distinct_descs; i++) {
577  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
578  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
579  continue;
580  }
581  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
582  total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
583  }
584 
585  count_distinct_bitmap_mem_bytes_ =
586  total_bytes_per_entry * query_mem_desc.getEntryCount();
587  count_distinct_bitmap_mem_ = reinterpret_cast<CUdeviceptr>(
588  device_allocator_->alloc(count_distinct_bitmap_mem_bytes_));
589  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(count_distinct_bitmap_mem_),
590  count_distinct_bitmap_mem_bytes_);
591 
592  count_distinct_bitmap_crt_ptr_ = count_distinct_bitmap_host_mem_ =
593  row_set_mem_owner_->allocateCountDistinctBuffer(count_distinct_bitmap_mem_bytes_);
594 }
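// The device bitmap region is a single zero-initialized allocation of
// entry_count * total_bytes_per_entry bytes; a host-side buffer of the same
// size appears to back it so allocateCountDistinctBitmap() can hand out
// per-group bitmap pointers by bumping count_distinct_bitmap_crt_ptr_.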
595 
596 // deferred is true for group by queries; initGroups will allocate a bitmap
597 // for each group slot
598 std::vector<int64_t> QueryMemoryInitializer::allocateCountDistinctBuffers(
599  const QueryMemoryDescriptor& query_mem_desc,
600  const bool deferred,
601  const Executor* executor) {
602  const size_t agg_col_count{query_mem_desc.getSlotCount()};
603  std::vector<int64_t> agg_bitmap_size(deferred ? agg_col_count : 0);
604 
605  CHECK_GE(agg_col_count, executor->plan_state_->target_exprs_.size());
606  for (size_t target_idx = 0; target_idx < executor->plan_state_->target_exprs_.size();
607  ++target_idx) {
608  const auto target_expr = executor->plan_state_->target_exprs_[target_idx];
609  const auto agg_info = get_target_info(target_expr, g_bigint_count);
610  if (is_distinct_target(agg_info)) {
611  CHECK(agg_info.is_agg &&
612  (agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT));
613  CHECK(!agg_info.sql_type.is_varlen());
614 
615  const auto agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
616  CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);
617 
618  CHECK_EQ(static_cast<size_t>(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)),
619  sizeof(int64_t));
620  const auto& count_distinct_desc =
621  query_mem_desc.getCountDistinctDescriptor(target_idx);
622  CHECK(count_distinct_desc.impl_type_ != CountDistinctImplType::Invalid);
623  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
624  const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
625  if (deferred) {
626  agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
627  } else {
628  init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
629  }
630  } else {
631  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::StdSet);
632  if (deferred) {
633  agg_bitmap_size[agg_col_idx] = -1;
634  } else {
635  init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
636  }
637  }
638  }
639  }
640 
641  return agg_bitmap_size;
642 }
643 
644 int64_t QueryMemoryInitializer::allocateCountDistinctBitmap(const size_t bitmap_byte_sz) {
645  if (count_distinct_bitmap_host_mem_) {
646  CHECK(count_distinct_bitmap_crt_ptr_);
647  auto ptr = count_distinct_bitmap_crt_ptr_;
648  count_distinct_bitmap_crt_ptr_ += bitmap_byte_sz;
649  row_set_mem_owner_->addCountDistinctBuffer(
650  ptr, bitmap_byte_sz, /*physical_buffer=*/false);
651  return reinterpret_cast<int64_t>(ptr);
652  }
653  return reinterpret_cast<int64_t>(
654  row_set_mem_owner_->allocateCountDistinctBuffer(bitmap_byte_sz));
655 }
656 
657 int64_t QueryMemoryInitializer::allocateCountDistinctSet() {
658  auto count_distinct_set = new std::set<int64_t>();
659  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
660  return reinterpret_cast<int64_t>(count_distinct_set);
661 }
662 
663 #ifdef HAVE_CUDA
664 GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer(
665  const QueryMemoryDescriptor& query_mem_desc,
666  const CUdeviceptr init_agg_vals_dev_ptr,
667  const size_t n,
668  const int device_id,
669  const unsigned block_size_x,
670  const unsigned grid_size_x) {
672  const auto thread_count = block_size_x * grid_size_x;
673  const auto total_buff_size =
674  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
675  CUdeviceptr dev_buffer =
676  reinterpret_cast<CUdeviceptr>(device_allocator_->alloc(total_buff_size));
677 
678  std::vector<CUdeviceptr> dev_buffers(thread_count);
679 
680  for (size_t i = 0; i < thread_count; ++i) {
681  dev_buffers[i] = dev_buffer;
682  }
683 
684  auto dev_ptr = device_allocator_->alloc(thread_count * sizeof(CUdeviceptr));
685  device_allocator_->copyToDevice(dev_ptr,
686  reinterpret_cast<int8_t*>(dev_buffers.data()),
687  thread_count * sizeof(CUdeviceptr));
688 
690 
691  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(dev_buffer),
692  thread_count * sizeof(int64_t));
693 
694  device_allocator_->setDeviceMem(
695  reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
696  (unsigned char)-1,
697  thread_count * n * sizeof(int64_t));
698 
699  init_group_by_buffer_on_device(
700  reinterpret_cast<int64_t*>(
701  dev_buffer + streaming_top_n::get_rows_offset_of_heaps(n, thread_count)),
702  reinterpret_cast<int64_t*>(init_agg_vals_dev_ptr),
703  n * thread_count,
704  query_mem_desc.getGroupbyColCount(),
705  query_mem_desc.getEffectiveKeyWidth(),
706  query_mem_desc.getRowSize() / sizeof(int64_t),
707  query_mem_desc.hasKeylessHash(),
708  1,
709  block_size_x,
710  grid_size_x);
711 
712  return {reinterpret_cast<CUdeviceptr>(dev_ptr), dev_buffer};
713 }
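// Every GPU thread receives the same heap buffer pointer: the buffer holds
// thread_count node counts (zeroed), then thread_count * n heap indices set
// to -1, then the heap rows, which init_group_by_buffer_on_device fills from
// init_agg_vals_dev_ptr.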
714 
715 GpuGroupByBuffers QueryMemoryInitializer::createAndInitializeGroupByBufferGpu(
716  const RelAlgExecutionUnit& ra_exe_unit,
717  const QueryMemoryDescriptor& query_mem_desc,
718  const CUdeviceptr init_agg_vals_dev_ptr,
719  const int device_id,
720  const ExecutorDispatchMode dispatch_mode,
721  const unsigned block_size_x,
722  const unsigned grid_size_x,
723  const int8_t warp_size,
724  const bool can_sort_on_gpu,
725  const bool output_columnar,
726  RenderAllocator* render_allocator) {
727  if (query_mem_desc.useStreamingTopN()) {
728  if (render_allocator) {
729  throw StreamingTopNNotSupportedInRenderQuery();
730  }
731  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
732  CHECK(!output_columnar);
733 
734  return prepareTopNHeapsDevBuffer(
735  query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
736  }
737 
738  auto dev_group_by_buffers = create_dev_group_by_buffers(device_allocator_,
739  group_by_buffers_,
740  query_mem_desc,
741  block_size_x,
742  grid_size_x,
743  device_id,
744  dispatch_mode,
745  num_rows_,
746  can_sort_on_gpu,
747  false,
748  ra_exe_unit.use_bump_allocator,
749  render_allocator);
750 
751  if (render_allocator) {
752  CHECK_EQ(size_t(0), render_allocator->getAllocatedSize() % 8);
753  }
754  if (query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU)) {
755  CHECK(!render_allocator);
756 
757  const size_t step{query_mem_desc.threadsShareMemory() ? block_size_x : 1};
758  size_t groups_buffer_size{query_mem_desc.getBufferSizeBytes(
759  ExecutorDeviceType::GPU, dev_group_by_buffers.entry_count)};
760  auto group_by_dev_buffer = dev_group_by_buffers.second;
761  const size_t col_count = query_mem_desc.getSlotCount();
762  int8_t* col_widths_dev_ptr{nullptr};
763  if (output_columnar) {
764  std::vector<int8_t> compact_col_widths(col_count);
765  for (size_t idx = 0; idx < col_count; ++idx) {
766  compact_col_widths[idx] = query_mem_desc.getPaddedSlotWidthBytes(idx);
767  }
768  col_widths_dev_ptr = device_allocator_->alloc(col_count * sizeof(int8_t));
769  device_allocator_->copyToDevice(
770  col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
771  }
772  const int8_t warp_count =
773  query_mem_desc.interleavedBins(ExecutorDeviceType::GPU) ? warp_size : 1;
774  for (size_t i = 0; i < getGroupByBuffersSize(); i += step) {
775  if (output_columnar) {
776  init_columnar_group_by_buffer_on_device(
777  reinterpret_cast<int64_t*>(group_by_dev_buffer),
778  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
779  dev_group_by_buffers.entry_count,
780  query_mem_desc.getGroupbyColCount(),
781  col_count,
782  col_widths_dev_ptr,
783  /*need_padding = */ true,
784  query_mem_desc.hasKeylessHash(),
785  sizeof(int64_t),
786  block_size_x,
787  grid_size_x);
788  } else {
789  init_group_by_buffer_on_device(reinterpret_cast<int64_t*>(group_by_dev_buffer),
790  reinterpret_cast<int64_t*>(init_agg_vals_dev_ptr),
791  dev_group_by_buffers.entry_count,
792  query_mem_desc.getGroupbyColCount(),
793  query_mem_desc.getEffectiveKeyWidth(),
794  query_mem_desc.getRowSize() / sizeof(int64_t),
795  query_mem_desc.hasKeylessHash(),
796  warp_count,
797  block_size_x,
798  grid_size_x);
799  }
800  group_by_dev_buffer += groups_buffer_size;
801  }
802  }
803  return dev_group_by_buffers;
804 }
805 
806 GpuGroupByBuffers QueryMemoryInitializer::setupTableFunctionGpuBuffers(
807  const QueryMemoryDescriptor& query_mem_desc,
808  const int device_id,
809  const unsigned block_size_x,
810  const unsigned grid_size_x) {
811  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
812  CHECK_GT(num_columns, size_t(0));
813 
814  const size_t column_size = num_rows_ * sizeof(int64_t);
815  const size_t groups_buffer_size = num_columns * column_size;
816  const size_t mem_size =
817  groups_buffer_size * (query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
818 
819  int8_t* dev_buffers_allocation{nullptr};
820  dev_buffers_allocation = device_allocator_->alloc(mem_size);
821  CHECK(dev_buffers_allocation);
822 
823  CUdeviceptr dev_buffers_mem = reinterpret_cast<CUdeviceptr>(dev_buffers_allocation);
824  const size_t step{block_size_x};
825  const size_t num_ptrs{block_size_x * grid_size_x};
826  std::vector<CUdeviceptr> dev_buffers(num_columns * num_ptrs);
827  auto dev_buffer = dev_buffers_mem;
828  for (size_t i = 0; i < num_ptrs; i += step) {
829  for (size_t j = 0; j < step; j += 1) {
830  for (size_t k = 0; k < num_columns; k++) {
831  dev_buffers[(i + j) * num_columns + k] = dev_buffer + k * column_size;
832  }
833  }
834  if (!query_mem_desc.blocksShareMemory()) {
835  dev_buffer += groups_buffer_size;
836  }
837  }
838 
839  auto dev_ptr = device_allocator_->alloc(num_columns * num_ptrs * sizeof(CUdeviceptr));
840  device_allocator_->copyToDevice(dev_ptr,
841  reinterpret_cast<int8_t*>(dev_buffers.data()),
842  num_columns * num_ptrs * sizeof(CUdeviceptr));
843 
844  return {reinterpret_cast<CUdeviceptr>(dev_ptr), dev_buffers_mem, (size_t)num_rows_};
845 }
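// The returned pointer table holds num_columns entries per GPU thread.
// Threads in the same block point at the same column buffers; when
// blocksShareMemory() is true all blocks share one allocation of
// groups_buffer_size bytes, otherwise each block gets its own slice.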
846 
847 void QueryMemoryInitializer::copyFromTableFunctionGpuBuffers(
848  Data_Namespace::DataMgr* data_mgr,
849  const QueryMemoryDescriptor& query_mem_desc,
850  const size_t entry_count,
851  const GpuGroupByBuffers& gpu_group_by_buffers,
852  const int device_id,
853  const unsigned block_size_x,
854  const unsigned grid_size_x) {
855  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
856  const size_t column_size = entry_count * sizeof(int64_t);
857  const size_t orig_column_size = gpu_group_by_buffers.entry_count * sizeof(int64_t);
858  int8_t* dev_buffer = reinterpret_cast<int8_t*>(gpu_group_by_buffers.second);
859  int8_t* host_buffer = reinterpret_cast<int8_t*>(group_by_buffers_[0]);
860  CHECK_LE(column_size, orig_column_size);
861  if (orig_column_size == column_size) {
862  copy_from_gpu(data_mgr,
863  host_buffer,
864  reinterpret_cast<CUdeviceptr>(dev_buffer),
865  column_size * num_columns,
866  device_id);
867  } else {
868  for (size_t k = 0; k < num_columns; ++k) {
869  copy_from_gpu(data_mgr,
870  host_buffer,
871  reinterpret_cast<CUdeviceptr>(dev_buffer),
872  column_size,
873  device_id);
874  dev_buffer += orig_column_size;
875  host_buffer += column_size;
876  }
877  }
878 }
879 
880 #endif
881 
882 size_t QueryMemoryInitializer::computeNumberOfBuffers(
883  const QueryMemoryDescriptor& query_mem_desc,
884  const ExecutorDeviceType device_type,
885  const Executor* executor) const {
886  return device_type == ExecutorDeviceType::CPU
887  ? 1
888  : executor->blockSize() *
889  (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
890 }
891 
892 namespace {
893 
894 // in-place compaction of output buffer
895 void compact_projection_buffer_for_cpu_columnar(
896  const QueryMemoryDescriptor& query_mem_desc,
897  int8_t* projection_buffer,
898  const size_t projection_count) {
899  // the first column (row indices) remains unchanged.
900  CHECK(projection_count <= query_mem_desc.getEntryCount());
901  constexpr size_t row_index_width = sizeof(int64_t);
902  size_t buffer_offset1{projection_count * row_index_width};
903  // other columns are actual non-lazy columns for the projection:
904  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
905  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
906  auto column_proj_size =
907  projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
908  auto buffer_offset2 = query_mem_desc.getColOffInBytes(i);
909  if (buffer_offset1 + column_proj_size >= buffer_offset2) {
910  // overlapping
911  std::memmove(projection_buffer + buffer_offset1,
912  projection_buffer + buffer_offset2,
913  column_proj_size);
914  } else {
915  std::memcpy(projection_buffer + buffer_offset1,
916  projection_buffer + buffer_offset2,
917  column_proj_size);
918  }
919  buffer_offset1 += align_to_int64(column_proj_size);
920  }
921  }
922 }
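// Example (illustrative numbers): with an entry count of 1,000,000 and a
// projection count of 10, a 4-byte slot column starting at getColOffInBytes(i)
// is moved down to begin right after the 10 * 8 bytes of row indices, each
// compacted column padded up to an int64 boundary; memmove is used when the
// source and destination ranges overlap, memcpy otherwise.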
923 
924 } // namespace
925 
926 void QueryMemoryInitializer::compactProjectionBuffersCpu(
927  const QueryMemoryDescriptor& query_mem_desc,
928  const size_t projection_count) {
929  const auto num_allocated_rows =
930  std::min(projection_count, query_mem_desc.getEntryCount());
931 
932  // copy the results from the main buffer into projection_buffer
933  compact_projection_buffer_for_cpu_columnar(
934  query_mem_desc,
935  reinterpret_cast<int8_t*>(group_by_buffers_[0]),
936  num_allocated_rows);
937 
938  // update the entry count for the result set, and its underlying storage
939  CHECK(!result_sets_.empty());
940  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
941 }
942 
943 void QueryMemoryInitializer::compactProjectionBuffersGpu(
944  const QueryMemoryDescriptor& query_mem_desc,
945  Data_Namespace::DataMgr* data_mgr,
946  const GpuGroupByBuffers& gpu_group_by_buffers,
947  const size_t projection_count,
948  const int device_id) {
949  // store total number of allocated rows:
950  const auto num_allocated_rows =
951  std::min(projection_count, query_mem_desc.getEntryCount());
952 
953  // copy the results from the main buffer into projection_buffer
954  copy_projection_buffer_from_gpu_columnar(
955  data_mgr,
956  gpu_group_by_buffers,
957  query_mem_desc,
958  reinterpret_cast<int8_t*>(group_by_buffers_[0]),
959  num_allocated_rows,
960  device_id);
961 
962  // update the entry count for the result set, and its underlying storage
963  CHECK(!result_sets_.empty());
964  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
965 }
966 
967 void QueryMemoryInitializer::copyGroupByBuffersFromGpu(
968  Data_Namespace::DataMgr* data_mgr,
969  const QueryMemoryDescriptor& query_mem_desc,
970  const size_t entry_count,
971  const GpuGroupByBuffers& gpu_group_by_buffers,
972  const RelAlgExecutionUnit* ra_exe_unit,
973  const unsigned block_size_x,
974  const unsigned grid_size_x,
975  const int device_id,
976  const bool prepend_index_buffer) const {
977  const auto thread_count = block_size_x * grid_size_x;
978 
979  size_t total_buff_size{0};
980  if (ra_exe_unit && query_mem_desc.useStreamingTopN()) {
981  const size_t n = ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit;
982  total_buff_size =
983  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
984  } else {
985  total_buff_size =
986  query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
987  }
988  copy_group_by_buffers_from_gpu(data_mgr,
989  group_by_buffers_,
990  total_buff_size,
991  gpu_group_by_buffers.second,
992  query_mem_desc,
993  block_size_x,
994  grid_size_x,
995  device_id,
996  prepend_index_buffer);
997 }
998 
999 void QueryMemoryInitializer::applyStreamingTopNOffsetCpu(
1000  const QueryMemoryDescriptor& query_mem_desc,
1001  const RelAlgExecutionUnit& ra_exe_unit) {
1002  CHECK_EQ(group_by_buffers_.size(), size_t(1));
1003 
1004  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
1005  group_by_buffers_[0],
1006  query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
1007  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit,
1008  1);
1009  CHECK_EQ(rows_copy.size(),
1010  query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
1011  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
1012 }
1013 
1014 void QueryMemoryInitializer::applyStreamingTopNOffsetGpu(
1015  Data_Namespace::DataMgr* data_mgr,
1016  const QueryMemoryDescriptor& query_mem_desc,
1017  const GpuGroupByBuffers& gpu_group_by_buffers,
1018  const RelAlgExecutionUnit& ra_exe_unit,
1019  const unsigned total_thread_count,
1020  const int device_id) {
1021 #ifdef HAVE_CUDA
1023 
1024  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
1025  data_mgr,
1026  reinterpret_cast<int64_t*>(gpu_group_by_buffers.second),
1027  ra_exe_unit,
1028  query_mem_desc,
1029  total_thread_count,
1030  device_id);
1031  CHECK_EQ(
1032  rows_copy.size(),
1033  static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
1034  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
1035 #else
1036  UNREACHABLE();
1037 #endif
1038 }