OmniSciDB  8a228a1076
QueryMemoryInitializer.cpp
1 /*
2  * Copyright 2019 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "QueryMemoryInitializer.h"
18 
19 #include "Execute.h"
20 #include "GpuInitGroups.h"
21 #include "GpuMemUtils.h"
22 #include "Logger/Logger.h"
23 #include "OutputBufferInitialization.h"
24 #include "ResultSet.h"
25 #include "StreamingTopN.h"
26 
27 #include <Shared/checked_alloc.h>
28 
29 namespace {
30 
31 inline void check_total_bitmap_memory(const QueryMemoryDescriptor& query_mem_desc) {
32  const int32_t groups_buffer_entry_count = query_mem_desc.getEntryCount();
33  if (g_enable_watchdog) {
34  checked_int64_t total_bytes_per_group = 0;
35  const size_t num_count_distinct_descs =
36  query_mem_desc.getCountDistinctDescriptorsSize();
37  for (size_t i = 0; i < num_count_distinct_descs; i++) {
38  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
39  if (count_distinct_desc.impl_type_ != CountDistinctImplType::Bitmap) {
40  continue;
41  }
42  total_bytes_per_group += count_distinct_desc.bitmapPaddedSizeBytes();
43  }
44  int64_t total_bytes{0};
45  // Using OutOfHostMemory until we can verify that SlabTooBig would also be properly
46  // caught
47  try {
48  total_bytes =
49  static_cast<int64_t>(total_bytes_per_group * groups_buffer_entry_count);
50  } catch (...) {
51  // Absurd amount of memory, merely computing the number of bits overflows int64_t.
52  // Don't bother to report the real amount, this is unlikely to ever happen.
53  throw OutOfHostMemory(std::numeric_limits<int64_t>::max() / 8);
54  }
55  if (total_bytes >= 2 * 1000 * 1000 * 1000L) {
56  throw OutOfHostMemory(total_bytes);
57  }
58  }
59 }
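// Sizing sketch (illustrative numbers, not taken from the source): a
// COUNT(DISTINCT) bitmap spanning ~1M distinct values needs about
// 1'000'000 / 8 = 125'000 bytes per group, so a 100'000-entry group-by
// buffer would require roughly 12.5 GB; the watchdog check above rejects
// anything at or above 2 GB with OutOfHostMemory.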
60 
61 int64_t* alloc_group_by_buffer(const size_t numBytes,
62  RenderAllocatorMap* render_allocator_map,
63  RowSetMemoryOwner* mem_owner) {
64  if (render_allocator_map) {
65  // NOTE(adb): If we got here, we are performing an in-situ rendering query and are not
66  // using CUDA buffers. Therefore we need to allocate result set storage using CPU
67  // memory.
68  const auto gpu_idx = 0; // Only 1 GPU supported in CUDA-disabled rendering mode
69  auto render_allocator_ptr = render_allocator_map->getRenderAllocator(gpu_idx);
70  return reinterpret_cast<int64_t*>(render_allocator_ptr->alloc(numBytes));
71  } else {
72  return reinterpret_cast<int64_t*>(mem_owner->allocate(numBytes));
73  }
74 }
75 
76 inline int64_t get_consistent_frag_size(const std::vector<uint64_t>& frag_offsets) {
77  if (frag_offsets.size() < 2) {
78  return int64_t(-1);
79  }
80  const auto frag_size = frag_offsets[1] - frag_offsets[0];
81  for (size_t i = 2; i < frag_offsets.size(); ++i) {
82  const auto curr_size = frag_offsets[i] - frag_offsets[i - 1];
83  if (curr_size != frag_size) {
84  return int64_t(-1);
85  }
86  }
87  return !frag_size ? std::numeric_limits<int64_t>::max()
88  : static_cast<int64_t>(frag_size);
89 }
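// Behavior sketch for the helper above (hypothetical offsets): {0, 100, 200, 300}
// yields 100; {0, 100, 250} yields -1 because the stride is inconsistent; an
// all-zero stride is mapped to std::numeric_limits<int64_t>::max().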
90 
91 inline std::vector<int64_t> get_consistent_frags_sizes(
92  const std::vector<std::vector<uint64_t>>& frag_offsets) {
93  if (frag_offsets.empty()) {
94  return {};
95  }
96  std::vector<int64_t> frag_sizes;
97  for (size_t tab_idx = 0; tab_idx < frag_offsets[0].size(); ++tab_idx) {
98  std::vector<uint64_t> tab_offs;
99  for (auto& offsets : frag_offsets) {
100  tab_offs.push_back(offsets[tab_idx]);
101  }
102  frag_sizes.push_back(get_consistent_frag_size(tab_offs));
103  }
104  return frag_sizes;
105 }
106 
107 inline std::vector<int64_t> get_consistent_frags_sizes(
108  const std::vector<Analyzer::Expr*>& target_exprs,
109  const std::vector<int64_t>& table_frag_sizes) {
110  std::vector<int64_t> col_frag_sizes;
111  for (auto expr : target_exprs) {
112  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
113  if (col_var->get_rte_idx() < 0) {
114  CHECK_EQ(-1, col_var->get_rte_idx());
115  col_frag_sizes.push_back(int64_t(-1));
116  } else {
117  col_frag_sizes.push_back(table_frag_sizes[col_var->get_rte_idx()]);
118  }
119  } else {
120  col_frag_sizes.push_back(int64_t(-1));
121  }
122  }
123  return col_frag_sizes;
124 }
125 
126 inline std::vector<std::vector<int64_t>> get_col_frag_offsets(
127  const std::vector<Analyzer::Expr*>& target_exprs,
128  const std::vector<std::vector<uint64_t>>& table_frag_offsets) {
129  std::vector<std::vector<int64_t>> col_frag_offsets;
130  for (auto& table_offsets : table_frag_offsets) {
131  std::vector<int64_t> col_offsets;
132  for (auto expr : target_exprs) {
133  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
134  if (col_var->get_rte_idx() < 0) {
135  CHECK_EQ(-1, col_var->get_rte_idx());
136  col_offsets.push_back(int64_t(-1));
137  } else {
138  CHECK_LT(static_cast<size_t>(col_var->get_rte_idx()), table_offsets.size());
139  col_offsets.push_back(
140  static_cast<int64_t>(table_offsets[col_var->get_rte_idx()]));
141  }
142  } else {
143  col_offsets.push_back(int64_t(-1));
144  }
145  }
146  col_frag_offsets.push_back(col_offsets);
147  }
148  return col_frag_offsets;
149 }
150 
151 } // namespace
152 
153 QueryMemoryInitializer::QueryMemoryInitializer(
154  const RelAlgExecutionUnit& ra_exe_unit,
155  const QueryMemoryDescriptor& query_mem_desc,
156  const int device_id,
157  const ExecutorDeviceType device_type,
158  const ExecutorDispatchMode dispatch_mode,
159  const bool output_columnar,
160  const bool sort_on_gpu,
161  const int64_t num_rows,
162  const std::vector<std::vector<const int8_t*>>& col_buffers,
163  const std::vector<std::vector<uint64_t>>& frag_offsets,
164  RenderAllocatorMap* render_allocator_map,
165  RenderInfo* render_info,
166  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
167  DeviceAllocator* device_allocator,
168  const Executor* executor)
169  : num_rows_(num_rows)
170  , row_set_mem_owner_(row_set_mem_owner)
171  , init_agg_vals_(executor->plan_state_->init_agg_vals_)
172  , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
173  , count_distinct_bitmap_mem_(0)
174  , count_distinct_bitmap_mem_bytes_(0)
175  , count_distinct_bitmap_crt_ptr_(nullptr)
176  , count_distinct_bitmap_host_mem_(nullptr)
177  , device_allocator_(device_allocator) {
178  CHECK(!sort_on_gpu || output_columnar);
179 
180  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
181  if (consistent_frag_sizes.empty()) {
182  // No fragments in the input, no underlying buffers will be needed.
183  return;
184  }
185  if (!ra_exe_unit.use_bump_allocator) {
186  check_total_bitmap_memory(query_mem_desc);
187  }
188  if (device_type == ExecutorDeviceType::GPU) {
189  allocateCountDistinctGpuMem(query_mem_desc);
190  }
191 
192  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
193  allocateCountDistinctBuffers(query_mem_desc, false, executor);
194  if (render_info && render_info->useCudaBuffers()) {
195  return;
196  }
197  }
198 
199  if (ra_exe_unit.estimator) {
200  return;
201  }
202 
203  const auto thread_count = device_type == ExecutorDeviceType::GPU
204  ? executor->blockSize() * executor->gridSize()
205  : 1;
206 
207  size_t group_buffer_size{0};
208  if (ra_exe_unit.use_bump_allocator) {
209  // For kernel per fragment execution, just allocate a buffer equivalent to the size of
210  // the fragment
211  if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
212  group_buffer_size = num_rows * query_mem_desc.getRowSize();
213  } else {
214  // otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size
215  group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
216  }
217  } else {
218  group_buffer_size =
219  query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
220  }
221  CHECK_GE(group_buffer_size, size_t(0));
222 
223  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
224  int64_t* group_by_buffer_template{nullptr};
225  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
226  group_by_buffer_template =
227  reinterpret_cast<int64_t*>(row_set_mem_owner_->allocate(group_buffer_size));
228  initGroupByBuffer(group_by_buffer_template,
229  ra_exe_unit,
230  query_mem_desc,
231  device_type,
232  output_columnar,
233  executor);
234  }
235 
236  if (query_mem_desc.interleavedBins(device_type)) {
237  CHECK(query_mem_desc.hasKeylessHash());
238  }
239 
240  const auto step = device_type == ExecutorDeviceType::GPU &&
241  query_mem_desc.threadsShareMemory() &&
242  query_mem_desc.isGroupBy()
243  ? executor->blockSize()
244  : size_t(1);
245  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
246  query_mem_desc.hasKeylessHash()
247  ? query_mem_desc.getEntryCount()
248  : size_t(0);
249  const auto actual_group_buffer_size =
250  group_buffer_size + index_buffer_qw * sizeof(int64_t);
251  CHECK_GE(actual_group_buffer_size, group_buffer_size);
252 
253  for (size_t i = 0; i < group_buffers_count; i += step) {
254  auto group_by_buffer = alloc_group_by_buffer(
255  actual_group_buffer_size, render_allocator_map, row_set_mem_owner_.get());
256  if (!query_mem_desc.lazyInitGroups(device_type)) {
257  if (group_by_buffer_template) {
258  memcpy(group_by_buffer + index_buffer_qw,
259  group_by_buffer_template,
260  group_buffer_size);
261  } else {
262  initGroupByBuffer(group_by_buffer + index_buffer_qw,
263  ra_exe_unit,
264  query_mem_desc,
265  device_type,
266  output_columnar,
267  executor);
268  }
269  }
270  group_by_buffers_.push_back(group_by_buffer);
271  for (size_t j = 1; j < step; ++j) {
272  group_by_buffers_.push_back(nullptr);
273  }
274  const auto column_frag_offsets =
275  get_col_frag_offsets(ra_exe_unit.target_exprs, frag_offsets);
276  const auto column_frag_sizes =
277  get_consistent_frags_sizes(ra_exe_unit.target_exprs, consistent_frag_sizes);
278  result_sets_.emplace_back(
279  new ResultSet(target_exprs_to_infos(ra_exe_unit.target_exprs, query_mem_desc),
280  executor->getColLazyFetchInfo(ra_exe_unit.target_exprs),
281  col_buffers,
282  column_frag_offsets,
283  column_frag_sizes,
284  device_type,
285  device_id,
286  query_mem_desc,
287  row_set_mem_owner_,
288  executor));
289  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
290  executor->plan_state_->init_agg_vals_);
291  for (size_t j = 1; j < step; ++j) {
292  result_sets_.emplace_back(nullptr);
293  }
294  }
295 }
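// Recap of the layout built above: each allocated buffer is
// [ index_buffer_qw x int64 | group buffer of group_buffer_size bytes ], where the
// index buffer only exists for GPU sort on a keyless hash. When GPU threads share
// memory, only every blockSize()-th entry owns an allocation and the slots in
// between stay nullptr; every owning slot also gets a ResultSet whose storage
// aliases the same group-by buffer.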
296 
297 QueryMemoryInitializer::QueryMemoryInitializer(
298  const TableFunctionExecutionUnit& exe_unit,
299  const QueryMemoryDescriptor& query_mem_desc,
300  const int device_id,
301  const ExecutorDeviceType device_type,
302  const int64_t num_rows,
303  const std::vector<std::vector<const int8_t*>>& col_buffers,
304  const std::vector<std::vector<uint64_t>>& frag_offsets,
305  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
306  DeviceAllocator* device_allocator,
307  const Executor* executor)
308  : num_rows_(num_rows)
309  , row_set_mem_owner_(row_set_mem_owner)
310  , init_agg_vals_(init_agg_val_vec(exe_unit.target_exprs, {}, query_mem_desc))
311  , num_buffers_(/*computeNumberOfBuffers(query_mem_desc, device_type, executor)*/ 1)
312  , count_distinct_bitmap_mem_(0)
313  , count_distinct_bitmap_mem_bytes_(0)
314  , count_distinct_bitmap_crt_ptr_(nullptr)
315  , count_distinct_bitmap_host_mem_(nullptr)
316  , device_allocator_(device_allocator) {
317  // Table functions produce columnar output, so treat this essentially as a projection.
318  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
319  if (consistent_frag_sizes.empty()) {
320  // No fragments in the input, no underlying buffers will be needed.
321  return;
322  }
323 
324  size_t group_buffer_size{0};
325  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
326  group_buffer_size = num_rows_ * num_columns * sizeof(int64_t);
327  CHECK_GE(group_buffer_size, size_t(0));
328 
329  const auto index_buffer_qw =
330  device_type == ExecutorDeviceType::GPU && query_mem_desc.hasKeylessHash()
331  ? query_mem_desc.getEntryCount()
332  : size_t(0);
333  const auto actual_group_buffer_size =
334  group_buffer_size + index_buffer_qw * sizeof(int64_t);
335  CHECK_GE(actual_group_buffer_size, group_buffer_size);
336 
337  CHECK_EQ(num_buffers_, size_t(1));
338  auto group_by_buffer =
339  alloc_group_by_buffer(actual_group_buffer_size, nullptr, row_set_mem_owner.get());
340  if (!query_mem_desc.lazyInitGroups(device_type)) {
341  initColumnarGroups(
342  query_mem_desc, group_by_buffer + index_buffer_qw, init_agg_vals_, executor);
343  }
344  group_by_buffers_.push_back(group_by_buffer);
345 
346  const auto column_frag_offsets =
347  get_col_frag_offsets(exe_unit.target_exprs, frag_offsets);
348  const auto column_frag_sizes =
349  get_consistent_frags_sizes(exe_unit.target_exprs, consistent_frag_sizes);
350  result_sets_.emplace_back(
351  new ResultSet(target_exprs_to_infos(exe_unit.target_exprs, query_mem_desc),
352  {},
353  col_buffers,
354  column_frag_offsets,
355  column_frag_sizes,
356  device_type,
357  device_id,
358  query_mem_desc,
359  row_set_mem_owner_,
360  executor));
361  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
362  init_agg_vals_);
363 }
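// Sizing sketch (illustrative numbers): a table function emitting 4 output slots
// for num_rows_ = 1'000'000 reserves 1'000'000 * 4 * sizeof(int64_t) = 32 MB
// here; every slot is given a full int64-wide lane in the single columnar buffer.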
364 
365 void QueryMemoryInitializer::initGroupByBuffer(
366  int64_t* buffer,
367  const RelAlgExecutionUnit& ra_exe_unit,
368  const QueryMemoryDescriptor& query_mem_desc,
369  const ExecutorDeviceType device_type,
370  const bool output_columnar,
371  const Executor* executor) {
372  if (output_columnar) {
373  initColumnarGroups(query_mem_desc, buffer, init_agg_vals_, executor);
374  } else {
375  auto rows_ptr = buffer;
376  auto actual_entry_count = query_mem_desc.getEntryCount();
377  const auto thread_count = device_type == ExecutorDeviceType::GPU
378  ? executor->blockSize() * executor->gridSize()
379  : 1;
380  auto warp_size =
381  query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
382  if (query_mem_desc.useStreamingTopN()) {
383  const auto node_count_size = thread_count * sizeof(int64_t);
384  memset(rows_ptr, 0, node_count_size);
385  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
386  const auto rows_offset = streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
387  memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
388  rows_ptr += rows_offset / sizeof(int64_t);
389  actual_entry_count = n * thread_count;
390  warp_size = 1;
391  }
392  initGroups(query_mem_desc,
393  rows_ptr,
394  init_agg_vals_,
395  actual_entry_count,
396  warp_size,
397  executor);
398  }
399 }
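// Streaming top-N heap layout prepared above (one heap per thread, n = offset + limit):
//   [ thread_count x int64 node counts, zeroed ]
//   [ heap headers up to get_rows_offset_of_heaps(n, thread_count), filled with 0xFF ]
//   [ n * thread_count row slots, initialized by initGroups() below ]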
400 
401 void QueryMemoryInitializer::initGroups(const QueryMemoryDescriptor& query_mem_desc,
402  int64_t* groups_buffer,
403  const std::vector<int64_t>& init_vals,
404  const int32_t groups_buffer_entry_count,
405  const size_t warp_size,
406  const Executor* executor) {
407  const size_t key_count{query_mem_desc.getGroupbyColCount()};
408  const size_t row_size{query_mem_desc.getRowSize()};
409  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};
410 
411  auto agg_bitmap_size = allocateCountDistinctBuffers(query_mem_desc, true, executor);
412  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
413 
414  const auto query_mem_desc_fixedup =
415      ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);
416 
417  if (query_mem_desc.hasKeylessHash()) {
418  CHECK(warp_size >= 1);
419  CHECK(key_count == 1 || warp_size == 1);
420  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
421  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
422  ++bin, buffer_ptr += row_size) {
423  initColumnPerRow(query_mem_desc_fixedup,
424  &buffer_ptr[col_base_off],
425  bin,
426  init_vals,
427  agg_bitmap_size);
428  }
429  }
430  return;
431  }
432 
433  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
434  ++bin, buffer_ptr += row_size) {
435  fill_empty_key(buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
436  initColumnPerRow(query_mem_desc_fixedup,
437  &buffer_ptr[col_base_off],
438  bin,
439  init_vals,
440  agg_bitmap_size);
441  }
442 }
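// Two row-wise shapes are initialized above. Keyless hash: the key is implicit in
// the bin index, so only aggregate slots are written, repeated per warp when bins
// are interleaved. Keyed hash: every row first gets EMPTY_KEY sentinels for its
// key columns (fill_empty_key) and then the same per-slot initialization.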
443 
444 namespace {
445 
446 template <typename T>
447 int8_t* initColumnarBuffer(T* buffer_ptr, const T init_val, const uint32_t entry_count) {
448  static_assert(sizeof(T) <= sizeof(int64_t), "Unsupported template type");
449  for (uint32_t i = 0; i < entry_count; ++i) {
450  buffer_ptr[i] = init_val;
451  }
452  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
453 }
454 
455 } // namespace
456 
457 void QueryMemoryInitializer::initColumnarGroups(
458  const QueryMemoryDescriptor& query_mem_desc,
459  int64_t* groups_buffer,
460  const std::vector<int64_t>& init_vals,
461  const Executor* executor) {
462  CHECK(groups_buffer);
463  for (const auto target_expr : executor->plan_state_->target_exprs_) {
464  const auto agg_info = get_target_info(target_expr, g_bigint_count);
465  CHECK(!is_distinct_target(agg_info));
466  }
467  const int32_t agg_col_count = query_mem_desc.getSlotCount();
468  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
469 
470  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
471  if (!query_mem_desc.hasKeylessHash()) {
472  const size_t key_count{query_mem_desc.getGroupbyColCount()};
473  for (size_t i = 0; i < key_count; ++i) {
474  buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
475  EMPTY_KEY_64,
476  groups_buffer_entry_count);
477  }
478  }
479 
480  if (query_mem_desc.getQueryDescriptionType() != QueryDescriptionType::Projection) {
481  // initializing all aggregate columns:
482  int32_t init_val_idx = 0;
483  for (int32_t i = 0; i < agg_col_count; ++i) {
484  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
485  CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
486  switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
487  case 1:
488  buffer_ptr = initColumnarBuffer<int8_t>(
489  buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
490  break;
491  case 2:
492  buffer_ptr =
493  initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
494  init_vals[init_val_idx++],
495  groups_buffer_entry_count);
496  break;
497  case 4:
498  buffer_ptr =
499  initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
500  init_vals[init_val_idx++],
501  groups_buffer_entry_count);
502  break;
503  case 8:
504  buffer_ptr =
505  initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
506  init_vals[init_val_idx++],
507  groups_buffer_entry_count);
508  break;
509  case 0:
510  break;
511  default:
512  CHECK(false);
513  }
514 
515  buffer_ptr = align_to_int64(buffer_ptr);
516  }
517  }
518  }
519 }
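// Columnar layout written above, one vector per slot with each vector padded to an
// 8-byte boundary. Illustrative example: one group key plus two 4-byte aggregate
// slots at entry_count = 3 gives
//   [ EMPTY_KEY_64 x 3 | slot0: 3 x 4B (+ padding to int64) | slot1: 3 x 4B (+ padding) ]
// with init_val_idx advancing once per non-zero-width slot.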
520 
521 void QueryMemoryInitializer::initColumnPerRow(const QueryMemoryDescriptor& query_mem_desc,
522  int8_t* row_ptr,
523  const size_t bin,
524  const std::vector<int64_t>& init_vals,
525  const std::vector<ssize_t>& bitmap_sizes) {
526  int8_t* col_ptr = row_ptr;
527  size_t init_vec_idx = 0;
528  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
529  col_ptr += query_mem_desc.getNextColOffInBytes(col_ptr, bin, col_idx++)) {
530  const ssize_t bm_sz{bitmap_sizes[col_idx]};
531  int64_t init_val{0};
532  if (!bm_sz || !query_mem_desc.isGroupBy()) {
533  if (query_mem_desc.getPaddedSlotWidthBytes(col_idx) > 0) {
534  CHECK_LT(init_vec_idx, init_vals.size());
535  init_val = init_vals[init_vec_idx++];
536  }
537  } else {
538  CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
539  sizeof(int64_t));
540  init_val =
541  bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
542  ++init_vec_idx;
543  }
544  switch (query_mem_desc.getPaddedSlotWidthBytes(col_idx)) {
545  case 1:
546  *col_ptr = static_cast<int8_t>(init_val);
547  break;
548  case 2:
549  *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
550  break;
551  case 4:
552  *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
553  break;
554  case 8:
555  *reinterpret_cast<int64_t*>(col_ptr) = init_val;
556  break;
557  case 0:
558  continue;
559  default:
560  CHECK(false);
561  }
562  }
563 }
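// bitmap_sizes mirrors allocateCountDistinctBuffers() below: 0 means an ordinary
// aggregate slot that consumes the next init_vals entry; a positive value means a
// per-row COUNT(DISTINCT) bitmap of that many bytes whose address becomes the
// slot's 64-bit payload; -1 marks the StdSet implementation, which stores a
// pointer to a newly allocated std::set<int64_t>.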
564 
565 void QueryMemoryInitializer::allocateCountDistinctGpuMem(
566  const QueryMemoryDescriptor& query_mem_desc) {
567  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
568  return;
569  }
570  CHECK(device_allocator_);
571 
572  size_t total_bytes_per_entry{0};
573  const size_t num_count_distinct_descs =
574  query_mem_desc.getCountDistinctDescriptorsSize();
575  for (size_t i = 0; i < num_count_distinct_descs; i++) {
576  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
577  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
578  continue;
579  }
580  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
581  total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
582  }
583 
584  count_distinct_bitmap_mem_bytes_ =
585      total_bytes_per_entry * query_mem_desc.getEntryCount();
586  count_distinct_bitmap_mem_ = reinterpret_cast<CUdeviceptr>(
587      device_allocator_->alloc(count_distinct_bitmap_mem_bytes_));
588  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(count_distinct_bitmap_mem_),
589                                   count_distinct_bitmap_mem_bytes_);
590 
591  count_distinct_bitmap_crt_ptr_ = count_distinct_bitmap_host_mem_ =
592      row_set_mem_owner_->allocate(count_distinct_bitmap_mem_bytes_);
593 }
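// One zero-filled device slab of getEntryCount() * total_bytes_per_entry bytes
// holds every group's bitmaps back to back; the host-side arena
// (count_distinct_bitmap_host_mem_) mirrors it, and allocateCountDistinctBitmap()
// below hands out slices by bumping count_distinct_bitmap_crt_ptr_.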
594 
595 // deferred is true for group by queries; initGroups will allocate a bitmap
596 // for each group slot
597 std::vector<ssize_t> QueryMemoryInitializer::allocateCountDistinctBuffers(
598  const QueryMemoryDescriptor& query_mem_desc,
599  const bool deferred,
600  const Executor* executor) {
601  const size_t agg_col_count{query_mem_desc.getSlotCount()};
602  std::vector<ssize_t> agg_bitmap_size(deferred ? agg_col_count : 0);
603 
604  CHECK_GE(agg_col_count, executor->plan_state_->target_exprs_.size());
605  for (size_t target_idx = 0; target_idx < executor->plan_state_->target_exprs_.size();
606  ++target_idx) {
607  const auto target_expr = executor->plan_state_->target_exprs_[target_idx];
608  const auto agg_info = get_target_info(target_expr, g_bigint_count);
609  if (is_distinct_target(agg_info)) {
610  CHECK(agg_info.is_agg &&
611  (agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT));
612  CHECK(!agg_info.sql_type.is_varlen());
613 
614  const auto agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
615  CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);
616 
617  CHECK_EQ(static_cast<size_t>(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)),
618  sizeof(int64_t));
619  const auto& count_distinct_desc =
620  query_mem_desc.getCountDistinctDescriptor(target_idx);
621  CHECK(count_distinct_desc.impl_type_ != CountDistinctImplType::Invalid);
622  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
623  const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
624  if (deferred) {
625  agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
626  } else {
627  init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
628  }
629  } else {
630  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::StdSet);
631  if (deferred) {
632  agg_bitmap_size[agg_col_idx] = -1;
633  } else {
634  init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
635  }
636  }
637  }
638  }
639 
640  return agg_bitmap_size;
641 }
642 
643 int64_t QueryMemoryInitializer::allocateCountDistinctBitmap(const size_t bitmap_byte_sz) {
644  if (count_distinct_bitmap_host_mem_) {
645  CHECK(count_distinct_bitmap_crt_ptr_);
646  auto ptr = count_distinct_bitmap_crt_ptr_;
647  count_distinct_bitmap_crt_ptr_ += bitmap_byte_sz;
648  row_set_mem_owner_->addCountDistinctBuffer(
649  ptr, bitmap_byte_sz, /*physical_buffer=*/false);
650  return reinterpret_cast<int64_t>(ptr);
651  }
652  return reinterpret_cast<int64_t>(
653  row_set_mem_owner_->allocateCountDistinctBuffer(bitmap_byte_sz));
654 }
655 
656 int64_t QueryMemoryInitializer::allocateCountDistinctSet() {
657  auto count_distinct_set = new std::set<int64_t>();
658  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
659  return reinterpret_cast<int64_t>(count_distinct_set);
660 }
661 
662 #ifdef HAVE_CUDA
663 GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer(
664  const QueryMemoryDescriptor& query_mem_desc,
665  const CUdeviceptr init_agg_vals_dev_ptr,
666  const size_t n,
667  const int device_id,
668  const unsigned block_size_x,
669  const unsigned grid_size_x) {
671  const auto thread_count = block_size_x * grid_size_x;
672  const auto total_buff_size =
673  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
674  CUdeviceptr dev_buffer =
675  reinterpret_cast<CUdeviceptr>(device_allocator_->alloc(total_buff_size));
676 
677  std::vector<CUdeviceptr> dev_buffers(thread_count);
678 
679  for (size_t i = 0; i < thread_count; ++i) {
680  dev_buffers[i] = dev_buffer;
681  }
682 
683  auto dev_ptr = device_allocator_->alloc(thread_count * sizeof(CUdeviceptr));
684  device_allocator_->copyToDevice(dev_ptr,
685  reinterpret_cast<int8_t*>(dev_buffers.data()),
686  thread_count * sizeof(CUdeviceptr));
687 
689 
690  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(dev_buffer),
691  thread_count * sizeof(int64_t));
692 
693  device_allocator_->setDeviceMem(
694  reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
695  (unsigned char)-1,
696  thread_count * n * sizeof(int64_t));
697 
698  init_group_by_buffer_on_device(
699  reinterpret_cast<int64_t*>(
700  dev_buffer + streaming_top_n::get_rows_offset_of_heaps(n, thread_count)),
701  reinterpret_cast<int64_t*>(init_agg_vals_dev_ptr),
702  n * thread_count,
703  query_mem_desc.getGroupbyColCount(),
704  query_mem_desc.getEffectiveKeyWidth(),
705  query_mem_desc.getRowSize() / sizeof(int64_t),
706  query_mem_desc.hasKeylessHash(),
707  1,
708  block_size_x,
709  grid_size_x);
710 
711  return {reinterpret_cast<CUdeviceptr>(dev_ptr), dev_buffer};
712 }
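// Device-side mirror of the CPU heap layout in initGroupByBuffer(): per-thread
// int64 node counts zeroed, heap headers set to 0xFF bytes, then n * thread_count
// row slots initialized from init_agg_vals_dev_ptr. Note that every entry of
// dev_buffers points at the same slab; the returned pair is (pointer array on
// device, slab base address).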
713 
714 GpuGroupByBuffers QueryMemoryInitializer::createAndInitializeGroupByBufferGpu(
715  const RelAlgExecutionUnit& ra_exe_unit,
716  const QueryMemoryDescriptor& query_mem_desc,
717  const CUdeviceptr init_agg_vals_dev_ptr,
718  const int device_id,
719  const ExecutorDispatchMode dispatch_mode,
720  const unsigned block_size_x,
721  const unsigned grid_size_x,
722  const int8_t warp_size,
723  const bool can_sort_on_gpu,
724  const bool output_columnar,
725  RenderAllocator* render_allocator) {
726  if (query_mem_desc.useStreamingTopN()) {
727  if (render_allocator) {
728  throw StreamingTopNNotSupportedInRenderQuery();
729  }
730  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
731  CHECK(!output_columnar);
732 
733  return prepareTopNHeapsDevBuffer(
734  query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
735  }
736 
737  auto dev_group_by_buffers = create_dev_group_by_buffers(device_allocator_,
738  group_by_buffers_,
739  query_mem_desc,
740  block_size_x,
741  grid_size_x,
742  device_id,
743  dispatch_mode,
744  num_rows_,
745  can_sort_on_gpu,
746  false,
747  ra_exe_unit.use_bump_allocator,
748  render_allocator);
749 
750  if (render_allocator) {
751  CHECK_EQ(size_t(0), render_allocator->getAllocatedSize() % 8);
752  }
753  if (query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU)) {
754  CHECK(!render_allocator);
755 
756  const size_t step{query_mem_desc.threadsShareMemory() ? block_size_x : 1};
757  size_t groups_buffer_size{query_mem_desc.getBufferSizeBytes(
758  ExecutorDeviceType::GPU, dev_group_by_buffers.entry_count)};
759  auto group_by_dev_buffer = dev_group_by_buffers.second;
760  const size_t col_count = query_mem_desc.getSlotCount();
761  int8_t* col_widths_dev_ptr{nullptr};
762  if (output_columnar) {
763  std::vector<int8_t> compact_col_widths(col_count);
764  for (size_t idx = 0; idx < col_count; ++idx) {
765  compact_col_widths[idx] = query_mem_desc.getPaddedSlotWidthBytes(idx);
766  }
767  col_widths_dev_ptr = device_allocator_->alloc(col_count * sizeof(int8_t));
768  device_allocator_->copyToDevice(
769  col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
770  }
771  const int8_t warp_count =
772  query_mem_desc.interleavedBins(ExecutorDeviceType::GPU) ? warp_size : 1;
773  for (size_t i = 0; i < getGroupByBuffersSize(); i += step) {
774  if (output_columnar) {
775  init_columnar_group_by_buffer_on_device(
776  reinterpret_cast<int64_t*>(group_by_dev_buffer),
777  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
778  dev_group_by_buffers.entry_count,
779  query_mem_desc.getGroupbyColCount(),
780  col_count,
781  col_widths_dev_ptr,
782  /*need_padding = */ true,
783  query_mem_desc.hasKeylessHash(),
784  sizeof(int64_t),
785  block_size_x,
786  grid_size_x);
787  } else {
788  init_group_by_buffer_on_device(reinterpret_cast<int64_t*>(group_by_dev_buffer),
789  reinterpret_cast<int64_t*>(init_agg_vals_dev_ptr),
790  dev_group_by_buffers.entry_count,
791  query_mem_desc.getGroupbyColCount(),
792  query_mem_desc.getEffectiveKeyWidth(),
793  query_mem_desc.getRowSize() / sizeof(int64_t),
794  query_mem_desc.hasKeylessHash(),
795  warp_count,
796  block_size_x,
797  grid_size_x);
798  }
799  group_by_dev_buffer += groups_buffer_size;
800  }
801  }
802  return dev_group_by_buffers;
803 }
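// When lazyInitGroups() holds for the GPU, buffers are initialized directly on
// the device by the columnar or row-wise kernels above rather than copied from a
// host template; the loop strides by block_size_x whenever threads share memory,
// matching the nullptr placeholder slots created in the constructor.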
804 
805 GpuGroupByBuffers QueryMemoryInitializer::setupTableFunctionGpuBuffers(
806  const QueryMemoryDescriptor& query_mem_desc,
807  const int device_id,
808  const unsigned block_size_x,
809  const unsigned grid_size_x) {
810  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
811  CHECK_GT(num_columns, size_t(0));
812 
813  const size_t column_size = num_rows_ * sizeof(int64_t);
814  const size_t groups_buffer_size = num_columns * column_size;
815  const size_t mem_size =
816  groups_buffer_size * (query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
817 
818  int8_t* dev_buffers_allocation{nullptr};
819  dev_buffers_allocation = device_allocator_->alloc(mem_size);
820  CHECK(dev_buffers_allocation);
821 
822  CUdeviceptr dev_buffers_mem = reinterpret_cast<CUdeviceptr>(dev_buffers_allocation);
823  const size_t step{block_size_x};
824  const size_t num_ptrs{block_size_x * grid_size_x};
825  std::vector<CUdeviceptr> dev_buffers(num_columns * num_ptrs);
826  auto dev_buffer = dev_buffers_mem;
827  for (size_t i = 0; i < num_ptrs; i += step) {
828  for (size_t j = 0; j < step; j += 1) {
829  for (size_t k = 0; k < num_columns; k++) {
830  dev_buffers[(i + j) * num_columns + k] = dev_buffer + k * column_size;
831  }
832  }
833  if (!query_mem_desc.blocksShareMemory()) {
834  dev_buffer += groups_buffer_size;
835  }
836  }
837 
838  auto dev_ptr = device_allocator_->alloc(num_columns * num_ptrs * sizeof(CUdeviceptr));
839  device_allocator_->copyToDevice(dev_ptr,
840  reinterpret_cast<int8_t*>(dev_buffers.data()),
841  num_columns * num_ptrs * sizeof(CUdeviceptr));
842 
843  return {reinterpret_cast<CUdeviceptr>(dev_ptr), dev_buffers_mem, (size_t)num_rows_};
844 }
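// Pointer-table sketch (illustrative sizes): dev_buffers[(i + j) * num_columns + k]
// is the k-th column pointer for thread i + j. With 2 columns, block_size_x = 2,
// grid_size_x = 2 and blocks not sharing memory, threads 0-1 point into the first
// groups_buffer_size-byte slab and threads 2-3 into the second, each column k
// starting k * column_size bytes into its slab.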
845 
846 void QueryMemoryInitializer::copyFromTableFunctionGpuBuffers(
847  Data_Namespace::DataMgr* data_mgr,
848  const QueryMemoryDescriptor& query_mem_desc,
849  const size_t entry_count,
850  const GpuGroupByBuffers& gpu_group_by_buffers,
851  const int device_id,
852  const unsigned block_size_x,
853  const unsigned grid_size_x) {
854  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
855  const size_t column_size = entry_count * sizeof(int64_t);
856  const size_t orig_column_size = gpu_group_by_buffers.entry_count * sizeof(int64_t);
857  int8_t* dev_buffer = reinterpret_cast<int8_t*>(gpu_group_by_buffers.second);
858  int8_t* host_buffer = reinterpret_cast<int8_t*>(group_by_buffers_[0]);
859  CHECK_LE(column_size, orig_column_size);
860  if (orig_column_size == column_size) {
861  copy_from_gpu(data_mgr,
862  host_buffer,
863  reinterpret_cast<CUdeviceptr>(dev_buffer),
864  column_size * num_columns,
865  device_id);
866  } else {
867  for (size_t k = 0; k < num_columns; ++k) {
868  copy_from_gpu(data_mgr,
869  host_buffer,
870  reinterpret_cast<CUdeviceptr>(dev_buffer),
871  column_size,
872  device_id);
873  dev_buffer += orig_column_size;
874  host_buffer += column_size;
875  }
876  }
877 }
878 
879 #endif
880 
881 size_t QueryMemoryInitializer::computeNumberOfBuffers(
882  const QueryMemoryDescriptor& query_mem_desc,
883  const ExecutorDeviceType device_type,
884  const Executor* executor) const {
885  return device_type == ExecutorDeviceType::CPU
886  ? 1
887  : executor->blockSize() *
888  (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
889 }
890 
891 namespace {
892 
893 // in-place compaction of output buffer
894 void compact_projection_buffer_for_cpu_columnar(
895  const QueryMemoryDescriptor& query_mem_desc,
896  int8_t* projection_buffer,
897  const size_t projection_count) {
898  // the first column (row indices) remains unchanged.
899  CHECK(projection_count <= query_mem_desc.getEntryCount());
900  constexpr size_t row_index_width = sizeof(int64_t);
901  size_t buffer_offset1{projection_count * row_index_width};
902  // other columns are actual non-lazy columns for the projection:
903  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
904  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
905  auto column_proj_size =
906  projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
907  auto buffer_offset2 = query_mem_desc.getColOffInBytes(i);
908  if (buffer_offset1 + column_proj_size >= buffer_offset2) {
909  // overlapping
910  std::memmove(projection_buffer + buffer_offset1,
911  projection_buffer + buffer_offset2,
912  column_proj_size);
913  } else {
914  std::memcpy(projection_buffer + buffer_offset1,
915  projection_buffer + buffer_offset2,
916  column_proj_size);
917  }
918  buffer_offset1 += align_to_int64(column_proj_size);
919  }
920  }
921 }
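// Compaction sketch (illustrative numbers, assuming the first projected slot
// starts right after the row-index column): with entry_count = 1000,
// projection_count = 10 and two 8-byte slots, the 10 row indices stay at offset 0,
// slot 0 moves from byte 8000 down to 80 and slot 1 from byte 16000 down to 160;
// memmove is chosen whenever the shrunken destination would reach into the source
// column, memcpy otherwise.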
922 
923 } // namespace
924 
925 void QueryMemoryInitializer::compactProjectionBuffersCpu(
926  const QueryMemoryDescriptor& query_mem_desc,
927  const size_t projection_count) {
928  const auto num_allocated_rows =
929  std::min(projection_count, query_mem_desc.getEntryCount());
930 
931  // copy the results from the main buffer into projection_buffer
932  compact_projection_buffer_for_cpu_columnar(
933  query_mem_desc,
934  reinterpret_cast<int8_t*>(group_by_buffers_[0]),
935  num_allocated_rows);
936 
937  // update the entry count for the result set, and its underlying storage
938  CHECK(!result_sets_.empty());
939  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
940 }
941 
942 void QueryMemoryInitializer::compactProjectionBuffersGpu(
943  const QueryMemoryDescriptor& query_mem_desc,
944  Data_Namespace::DataMgr* data_mgr,
945  const GpuGroupByBuffers& gpu_group_by_buffers,
946  const size_t projection_count,
947  const int device_id) {
948  // store total number of allocated rows:
949  const auto num_allocated_rows =
950  std::min(projection_count, query_mem_desc.getEntryCount());
951 
952  // copy the results from the main buffer into projection_buffer
953  copy_projection_buffer_from_gpu_columnar(
954  data_mgr,
955  gpu_group_by_buffers,
956  query_mem_desc,
957  reinterpret_cast<int8_t*>(group_by_buffers_[0]),
958  num_allocated_rows,
959  device_id);
960 
961  // update the entry count for the result set, and its underlying storage
962  CHECK(!result_sets_.empty());
963  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
964 }
965 
966 void QueryMemoryInitializer::copyGroupByBuffersFromGpu(
967  Data_Namespace::DataMgr* data_mgr,
968  const QueryMemoryDescriptor& query_mem_desc,
969  const size_t entry_count,
970  const GpuGroupByBuffers& gpu_group_by_buffers,
971  const RelAlgExecutionUnit* ra_exe_unit,
972  const unsigned block_size_x,
973  const unsigned grid_size_x,
974  const int device_id,
975  const bool prepend_index_buffer) const {
976  const auto thread_count = block_size_x * grid_size_x;
977 
978  size_t total_buff_size{0};
979  if (ra_exe_unit && query_mem_desc.useStreamingTopN()) {
980  const size_t n = ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit;
981  total_buff_size =
982  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
983  } else {
984  total_buff_size =
985  query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
986  }
987  copy_group_by_buffers_from_gpu(data_mgr,
988  group_by_buffers_,
989  total_buff_size,
990  gpu_group_by_buffers.second,
991  query_mem_desc,
992  block_size_x,
993  grid_size_x,
994  device_id,
995  prepend_index_buffer);
996 }
997 
998 void QueryMemoryInitializer::applyStreamingTopNOffsetCpu(
999  const QueryMemoryDescriptor& query_mem_desc,
1000  const RelAlgExecutionUnit& ra_exe_unit) {
1001  CHECK_EQ(group_by_buffers_.size(), size_t(1));
1002 
1003  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
1004  group_by_buffers_[0],
1005  query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
1006  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit,
1007  1);
1008  CHECK_EQ(rows_copy.size(),
1009  query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
1010  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
1011 }
1012 
1013 void QueryMemoryInitializer::applyStreamingTopNOffsetGpu(
1014  Data_Namespace::DataMgr* data_mgr,
1015  const QueryMemoryDescriptor& query_mem_desc,
1016  const GpuGroupByBuffers& gpu_group_by_buffers,
1017  const RelAlgExecutionUnit& ra_exe_unit,
1018  const unsigned total_thread_count,
1019  const int device_id) {
1020 #ifdef HAVE_CUDA
1021  CHECK_EQ(group_by_buffers_.size(), size_t(1));
1022 
1023  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
1024  data_mgr,
1025  reinterpret_cast<int64_t*>(gpu_group_by_buffers.second),
1026  ra_exe_unit,
1027  query_mem_desc,
1028  total_thread_count,
1029  device_id);
1030  CHECK_EQ(
1031  rows_copy.size(),
1032  static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
1033  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
1034 #else
1035  UNREACHABLE();
1036 #endif
1037 }