OmniSciDB  04ee39c94c
QueryMemoryInitializer.cpp
1 /*
2  * Copyright 2019 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "QueryMemoryInitializer.h"
18 
19 #include "Execute.h"
20 #include "GpuInitGroups.h"
21 #include "GpuMemUtils.h"
22 #include "ResultSet.h"
23 #include "Shared/Logger.h"
24 #include "StreamingTopN.h"
25 
26 #include <Shared/checked_alloc.h>
27 
28 namespace {
29 
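// Watchdog guard: conservatively estimates the total count-distinct bitmap memory
// (bytes per group times group entry count) and throws OutOfHostMemory if the total
// would overflow while being computed or exceed roughly 2GB.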
30 inline void check_total_bitmap_memory(const QueryMemoryDescriptor& query_mem_desc) {
31  const int32_t groups_buffer_entry_count = query_mem_desc.getEntryCount();
32  if (g_enable_watchdog) {
33  checked_int64_t total_bytes_per_group = 0;
34  const size_t num_count_distinct_descs =
35  query_mem_desc.getCountDistinctDescriptorsSize();
36  for (size_t i = 0; i < num_count_distinct_descs; i++) {
37  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
38  if (count_distinct_desc.impl_type_ != CountDistinctImplType::Bitmap) {
39  continue;
40  }
41  total_bytes_per_group += count_distinct_desc.bitmapPaddedSizeBytes();
42  }
43  int64_t total_bytes{0};
44  // Using OutOfHostMemory until we can verify that SlabTooBig would also be properly
45  // caught
46  try {
47  total_bytes =
48  static_cast<int64_t>(total_bytes_per_group * groups_buffer_entry_count);
49  } catch (...) {
50  // Absurd amount of memory, merely computing the number of bits overflows int64_t.
51  // Don't bother to report the real amount, this is unlikely to ever happen.
52  throw OutOfHostMemory(std::numeric_limits<int64_t>::max() / 8);
53  }
54  if (total_bytes >= 2 * 1000 * 1000 * 1000L) {
55  throw OutOfHostMemory(total_bytes);
56  }
57  }
58 }
59 
60 int64_t* alloc_group_by_buffer(const size_t numBytes,
61  RenderAllocatorMap* render_allocator_map) {
62  if (render_allocator_map) {
63  // NOTE(adb): If we got here, we are performing an in-situ rendering query and are not
64  // using CUDA buffers. Therefore we need to allocate result set storage using CPU
65  // memory.
66  const auto gpu_idx = 0; // Only 1 GPU supported in CUDA-disabled rendering mode
67  auto render_allocator_ptr = render_allocator_map->getRenderAllocator(gpu_idx);
68  return reinterpret_cast<int64_t*>(render_allocator_ptr->alloc(numBytes));
69  } else {
70  return reinterpret_cast<int64_t*>(checked_malloc(numBytes));
71  }
72 }
73 
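// Returns the common fragment stride when all fragments have the same size, or -1 when
// the sizes differ; a zero stride is reported as std::numeric_limits<int64_t>::max().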
74 inline int64_t get_consistent_frag_size(const std::vector<uint64_t>& frag_offsets) {
75  if (frag_offsets.size() < 2) {
76  return int64_t(-1);
77  }
78  const auto frag_size = frag_offsets[1] - frag_offsets[0];
79  for (size_t i = 2; i < frag_offsets.size(); ++i) {
80  const auto curr_size = frag_offsets[i] - frag_offsets[i - 1];
81  if (curr_size != frag_size) {
82  return int64_t(-1);
83  }
84  }
85  return !frag_size ? std::numeric_limits<int64_t>::max()
86  : static_cast<int64_t>(frag_size);
87 }
88 
89 inline std::vector<int64_t> get_consistent_frags_sizes(
90  const std::vector<std::vector<uint64_t>>& frag_offsets) {
91  if (frag_offsets.empty()) {
92  return {};
93  }
94  std::vector<int64_t> frag_sizes;
95  for (size_t tab_idx = 0; tab_idx < frag_offsets[0].size(); ++tab_idx) {
96  std::vector<uint64_t> tab_offs;
97  for (auto& offsets : frag_offsets) {
98  tab_offs.push_back(offsets[tab_idx]);
99  }
100  frag_sizes.push_back(get_consistent_frag_size(tab_offs));
101  }
102  return frag_sizes;
103 }
104 
105 inline std::vector<int64_t> get_consistent_frags_sizes(
106  const std::vector<Analyzer::Expr*>& target_exprs,
107  const std::vector<int64_t>& table_frag_sizes) {
108  std::vector<int64_t> col_frag_sizes;
109  for (auto expr : target_exprs) {
110  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
111  if (col_var->get_rte_idx() < 0) {
112  CHECK_EQ(-1, col_var->get_rte_idx());
113  col_frag_sizes.push_back(int64_t(-1));
114  } else {
115  col_frag_sizes.push_back(table_frag_sizes[col_var->get_rte_idx()]);
116  }
117  } else {
118  col_frag_sizes.push_back(int64_t(-1));
119  }
120  }
121  return col_frag_sizes;
122 }
123 
124 inline std::vector<std::vector<int64_t>> get_col_frag_offsets(
125  const std::vector<Analyzer::Expr*>& target_exprs,
126  const std::vector<std::vector<uint64_t>>& table_frag_offsets) {
127  std::vector<std::vector<int64_t>> col_frag_offsets;
128  for (auto& table_offsets : table_frag_offsets) {
129  std::vector<int64_t> col_offsets;
130  for (auto expr : target_exprs) {
131  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
132  if (col_var->get_rte_idx() < 0) {
133  CHECK_EQ(-1, col_var->get_rte_idx());
134  col_offsets.push_back(int64_t(-1));
135  } else {
136  CHECK_LT(static_cast<size_t>(col_var->get_rte_idx()), table_offsets.size());
137  col_offsets.push_back(
138  static_cast<int64_t>(table_offsets[col_var->get_rte_idx()]));
139  }
140  } else {
141  col_offsets.push_back(int64_t(-1));
142  }
143  }
144  col_frag_offsets.push_back(col_offsets);
145  }
146  return col_frag_offsets;
147 }
148 
149 } // namespace
150 
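// Per-device setup: allocates count distinct buffers, builds the group-by buffers
// (optionally from a host-side template when groups are not lazily initialized), and
// creates the ResultSet objects that wrap them.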
151 QueryMemoryInitializer::QueryMemoryInitializer(
152  const RelAlgExecutionUnit& ra_exe_unit,
153  const QueryMemoryDescriptor& query_mem_desc,
154  const int device_id,
155  const ExecutorDeviceType device_type,
156  const ExecutorDispatchMode dispatch_mode,
157  const bool output_columnar,
158  const bool sort_on_gpu,
159  const int64_t num_rows,
160  const std::vector<std::vector<const int8_t*>>& col_buffers,
161  const std::vector<std::vector<uint64_t>>& frag_offsets,
162  RenderAllocatorMap* render_allocator_map,
163  RenderInfo* render_info,
164  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
165  DeviceAllocator* device_allocator,
166  const Executor* executor)
167  : num_rows_(num_rows)
168  , row_set_mem_owner_(row_set_mem_owner)
169  , init_agg_vals_(executor->plan_state_->init_agg_vals_)
170  , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
171  , count_distinct_bitmap_mem_(0)
172  , count_distinct_bitmap_mem_bytes_(0)
173  , count_distinct_bitmap_crt_ptr_(nullptr)
174  , count_distinct_bitmap_host_mem_(nullptr)
175  , device_allocator_(device_allocator) {
176  CHECK(!sort_on_gpu || output_columnar);
177 
178  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
179  if (consistent_frag_sizes.empty()) {
180  // No fragments in the input, no underlying buffers will be needed.
181  return;
182  }
183  if (!ra_exe_unit.use_bump_allocator) {
184  check_total_bitmap_memory(query_mem_desc);
185  }
186  if (device_type == ExecutorDeviceType::GPU) {
187  allocateCountDistinctGpuMem(query_mem_desc);
188  }
189 
190  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
191  allocateCountDistinctBuffers(query_mem_desc, false, executor);
192  if (render_info && render_info->useCudaBuffers()) {
193  return;
194  }
195  }
196 
197  if (ra_exe_unit.estimator) {
198  return;
199  }
200 
201  const auto thread_count = device_type == ExecutorDeviceType::GPU
202  ? executor->blockSize() * executor->gridSize()
203  : 1;
204 
205  size_t group_buffer_size{0};
206  if (ra_exe_unit.use_bump_allocator) {
207  // For kernel per fragment execution, just allocate a buffer equivalent to the size of
208  // the fragment
209  if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
210  group_buffer_size = num_rows * query_mem_desc.getRowSize();
211  } else {
212  // otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size
213  group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
214  }
215  } else {
216  group_buffer_size =
217  query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
218  }
219  CHECK_GE(group_buffer_size, size_t(0));
220 
221  std::unique_ptr<int64_t, CheckedAllocDeleter> group_by_buffer_template;
222  if (!query_mem_desc.lazyInitGroups(device_type)) {
223  group_by_buffer_template.reset(
224  static_cast<int64_t*>(checked_malloc(group_buffer_size)));
225 
226  if (output_columnar) {
227  initColumnarGroups(
228  query_mem_desc, group_by_buffer_template.get(), init_agg_vals_, executor);
229  } else {
230  auto rows_ptr = group_by_buffer_template.get();
231  auto actual_entry_count = query_mem_desc.getEntryCount();
232  auto warp_size =
233  query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
234  if (use_streaming_top_n(ra_exe_unit, query_mem_desc.didOutputColumnar())) {
235  const auto node_count_size = thread_count * sizeof(int64_t);
236  memset(rows_ptr, 0, node_count_size);
237  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
238  const auto rows_offset =
239  streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
240  memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
241  rows_ptr += rows_offset / sizeof(int64_t);
242  actual_entry_count = n * thread_count;
243  warp_size = 1;
244  }
245  initGroups(query_mem_desc,
246  rows_ptr,
247  init_agg_vals_,
248  actual_entry_count,
249  warp_size,
250  executor);
251  }
252  }
253 
254  if (query_mem_desc.interleavedBins(device_type)) {
255  CHECK(query_mem_desc.hasKeylessHash());
256  }
257 
258  const auto step = device_type == ExecutorDeviceType::GPU &&
259  query_mem_desc.threadsShareMemory() &&
260  query_mem_desc.isGroupBy()
261  ? executor->blockSize()
262  : size_t(1);
263  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
264  query_mem_desc.hasKeylessHash()
265  ? query_mem_desc.getEntryCount()
266  : size_t(0);
267  const auto actual_group_buffer_size =
268  group_buffer_size + index_buffer_qw * sizeof(int64_t);
269  CHECK_GE(actual_group_buffer_size, group_buffer_size);
270  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
271 
272  for (size_t i = 0; i < group_buffers_count; i += step) {
273  auto group_by_buffer =
274  alloc_group_by_buffer(actual_group_buffer_size, render_allocator_map);
275  if (!query_mem_desc.lazyInitGroups(device_type)) {
276  CHECK(group_by_buffer_template);
277  memcpy(group_by_buffer + index_buffer_qw,
278  group_by_buffer_template.get(),
279  group_buffer_size);
280  }
281  if (!render_allocator_map) {
282  row_set_mem_owner_->addGroupByBuffer(group_by_buffer);
283  }
284  group_by_buffers_.push_back(group_by_buffer);
285  for (size_t j = 1; j < step; ++j) {
286  group_by_buffers_.push_back(nullptr);
287  }
288  const auto column_frag_offsets =
289  get_col_frag_offsets(ra_exe_unit.target_exprs, frag_offsets);
290  const auto column_frag_sizes =
291  get_consistent_frags_sizes(ra_exe_unit.target_exprs, consistent_frag_sizes);
292  result_sets_.emplace_back(
293  new ResultSet(target_exprs_to_infos(ra_exe_unit.target_exprs, query_mem_desc),
294  executor->getColLazyFetchInfo(ra_exe_unit.target_exprs),
295  col_buffers,
296  column_frag_offsets,
297  column_frag_sizes,
298  device_type,
299  device_id,
300  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc),
301  row_set_mem_owner_,
302  executor));
303  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
304  executor->plan_state_->init_agg_vals_);
305  for (size_t j = 1; j < step; ++j) {
306  result_sets_.emplace_back(nullptr);
307  }
308  }
309 }
310 
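// Row-wise initialization of a group-by buffer: for keyless hash layouts only the
// aggregate columns are initialized (once per warp); otherwise every bin is filled with
// the empty key followed by its per-column init values.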
311 void QueryMemoryInitializer::initGroups(const QueryMemoryDescriptor& query_mem_desc,
312  int64_t* groups_buffer,
313  const std::vector<int64_t>& init_vals,
314  const int32_t groups_buffer_entry_count,
315  const size_t warp_size,
316  const Executor* executor) {
317  const size_t key_count{query_mem_desc.groupColWidthsSize()};
318  const size_t row_size{query_mem_desc.getRowSize()};
319  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};
320 
321  auto agg_bitmap_size = allocateCountDistinctBuffers(query_mem_desc, true, executor);
322  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
323 
324  const auto query_mem_desc_fixedup =
325  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);
326 
327  if (query_mem_desc.hasKeylessHash()) {
328  CHECK(warp_size >= 1);
329  CHECK(key_count == 1);
330  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
331  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
332  ++bin, buffer_ptr += row_size) {
333  initColumnPerRow(query_mem_desc_fixedup,
334  &buffer_ptr[col_base_off],
335  bin,
336  init_vals,
337  agg_bitmap_size);
338  }
339  }
340  return;
341  }
342 
343  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
344  ++bin, buffer_ptr += row_size) {
345  fill_empty_key(buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
346  initColumnPerRow(query_mem_desc_fixedup,
347  &buffer_ptr[col_base_off],
348  bin,
349  init_vals,
350  agg_bitmap_size);
351  }
352 }
353 
354 namespace {
355 
356 template <typename T>
357 int8_t* initColumnarBuffer(T* buffer_ptr, const T init_val, const uint32_t entry_count) {
358  static_assert(sizeof(T) <= sizeof(int64_t), "Unsupported template type");
359  for (uint32_t i = 0; i < entry_count; ++i) {
360  buffer_ptr[i] = init_val;
361  }
362  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
363 }
364 
365 } // namespace
366 
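// Columnar initialization: writes EMPTY_KEY_64 key columns followed by each aggregate
// column's init value, padding every column to a 64-bit boundary. Count distinct
// targets are rejected by the CHECK below.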
367 void QueryMemoryInitializer::initColumnarGroups(
368  const QueryMemoryDescriptor& query_mem_desc,
369  int64_t* groups_buffer,
370  const std::vector<int64_t>& init_vals,
371  const Executor* executor) {
372  CHECK(groups_buffer);
373  for (const auto target_expr : executor->plan_state_->target_exprs_) {
374  const auto agg_info = get_target_info(target_expr, g_bigint_count);
375  CHECK(!is_distinct_target(agg_info));
376  }
377  const int32_t agg_col_count = query_mem_desc.getSlotCount();
378  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
379 
380  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
381  if (!query_mem_desc.hasKeylessHash()) {
382  const size_t key_count{query_mem_desc.groupColWidthsSize()};
383  for (size_t i = 0; i < key_count; ++i) {
384  buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
385  EMPTY_KEY_64,
386  groups_buffer_entry_count);
387  }
388  }
389  // initializing all aggregate columns:
390  int32_t init_val_idx = 0;
391  for (int32_t i = 0; i < agg_col_count; ++i) {
392  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
393  CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
394  switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
395  case 1:
396  buffer_ptr = initColumnarBuffer<int8_t>(
397  buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
398  break;
399  case 2:
400  buffer_ptr = initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
401  init_vals[init_val_idx++],
402  groups_buffer_entry_count);
403  break;
404  case 4:
405  buffer_ptr = initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
406  init_vals[init_val_idx++],
407  groups_buffer_entry_count);
408  break;
409  case 8:
410  buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
411  init_vals[init_val_idx++],
412  groups_buffer_entry_count);
413  break;
414  case 0:
415  break;
416  default:
417  CHECK(false);
418  }
419 
420  buffer_ptr = align_to_int64(buffer_ptr);
421  }
422  }
423 }
424 
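// Initializes the slots of a single row; count distinct slots receive a pointer to a
// freshly allocated bitmap or std::set instead of a plain aggregate init value.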
425 void QueryMemoryInitializer::initColumnPerRow(const QueryMemoryDescriptor& query_mem_desc,
426  int8_t* row_ptr,
427  const size_t bin,
428  const std::vector<int64_t>& init_vals,
429  const std::vector<ssize_t>& bitmap_sizes) {
430  int8_t* col_ptr = row_ptr;
431  size_t init_vec_idx = 0;
432  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
433  col_ptr += query_mem_desc.getNextColOffInBytes(col_ptr, bin, col_idx++)) {
434  const ssize_t bm_sz{bitmap_sizes[col_idx]};
435  int64_t init_val{0};
436  if (!bm_sz || !query_mem_desc.isGroupBy()) {
437  if (query_mem_desc.getPaddedSlotWidthBytes(col_idx) > 0) {
438  CHECK_LT(init_vec_idx, init_vals.size());
439  init_val = init_vals[init_vec_idx++];
440  }
441  } else {
442  CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
443  sizeof(int64_t));
444  init_val =
445  bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
446  ++init_vec_idx;
447  }
448  switch (query_mem_desc.getPaddedSlotWidthBytes(col_idx)) {
449  case 1:
450  *col_ptr = static_cast<int8_t>(init_val);
451  break;
452  case 2:
453  *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
454  break;
455  case 4:
456  *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
457  break;
458  case 8:
459  *reinterpret_cast<int64_t*>(col_ptr) = init_val;
460  break;
461  case 0:
462  continue;
463  default:
464  CHECK(false);
465  }
466  }
467 }
468 
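// Reserves a single device allocation large enough to hold the count distinct bitmaps
// of every entry, zeroes it, and mirrors it with a host-side staging buffer of the
// same size.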
469 void QueryMemoryInitializer::allocateCountDistinctGpuMem(
470  const QueryMemoryDescriptor& query_mem_desc) {
471  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
472  return;
473  }
474  CHECK(device_allocator_);
475 
476  size_t total_bytes_per_entry{0};
477  const size_t num_count_distinct_descs =
478  query_mem_desc.getCountDistinctDescriptorsSize();
479  for (size_t i = 0; i < num_count_distinct_descs; i++) {
480  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
481  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
482  continue;
483  }
484  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
485  total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
486  }
487 
488  count_distinct_bitmap_mem_bytes_ =
489  total_bytes_per_entry * query_mem_desc.getEntryCount();
490  count_distinct_bitmap_mem_ = reinterpret_cast<CUdeviceptr>(
491  device_allocator_->alloc(count_distinct_bitmap_mem_bytes_));
492  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(count_distinct_bitmap_mem_),
493  count_distinct_bitmap_mem_bytes_);
494 
495  count_distinct_bitmap_host_mem_ =
496  static_cast<int8_t*>(checked_malloc(count_distinct_bitmap_mem_bytes_));
497  row_set_mem_owner_->addCountDistinctBuffer(
498  count_distinct_bitmap_host_mem_, count_distinct_bitmap_mem_bytes_, true);
499 }
500 
501 // deferred is true for group by queries; initGroups will allocate a bitmap
502 // for each group slot
503 std::vector<ssize_t> QueryMemoryInitializer::allocateCountDistinctBuffers(
504  const QueryMemoryDescriptor& query_mem_desc,
505  const bool deferred,
506  const Executor* executor) {
507  const size_t agg_col_count{query_mem_desc.getSlotCount()};
508  std::vector<ssize_t> agg_bitmap_size(deferred ? agg_col_count : 0);
509 
510  CHECK_GE(agg_col_count, executor->plan_state_->target_exprs_.size());
511  for (size_t target_idx = 0; target_idx < executor->plan_state_->target_exprs_.size();
512  ++target_idx) {
513  const auto target_expr = executor->plan_state_->target_exprs_[target_idx];
514  const auto agg_info = get_target_info(target_expr, g_bigint_count);
515  if (is_distinct_target(agg_info)) {
516  CHECK(agg_info.is_agg &&
517  (agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT));
518  CHECK(!agg_info.sql_type.is_varlen());
519 
520  const auto agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
521  CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);
522 
523  CHECK_EQ(static_cast<size_t>(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)),
524  sizeof(int64_t));
525  const auto& count_distinct_desc =
526  query_mem_desc.getCountDistinctDescriptor(target_idx);
527  CHECK(count_distinct_desc.impl_type_ != CountDistinctImplType::Invalid);
528  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
529  const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
530  if (deferred) {
531  agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
532  } else {
533  init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
534  }
535  } else {
536  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::StdSet);
537  if (deferred) {
538  agg_bitmap_size[agg_col_idx] = -1;
539  } else {
540  init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
541  }
542  }
543  }
544  }
545 
546  return agg_bitmap_size;
547 }
548 
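// When a pre-allocated host staging area exists (GPU execution), bitmaps are carved out
// of it by bumping a pointer; otherwise each bitmap is a separate zeroed heap allocation.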
549 int64_t QueryMemoryInitializer::allocateCountDistinctBitmap(const size_t bitmap_byte_sz) {
550  if (count_distinct_bitmap_host_mem_) {
551  CHECK(count_distinct_bitmap_crt_ptr_);
552  auto ptr = count_distinct_bitmap_crt_ptr_;
553  count_distinct_bitmap_crt_ptr_ += bitmap_byte_sz;
554  row_set_mem_owner_->addCountDistinctBuffer(ptr, bitmap_byte_sz, false);
555  return reinterpret_cast<int64_t>(ptr);
556  }
557  auto count_distinct_buffer = static_cast<int8_t*>(checked_calloc(bitmap_byte_sz, 1));
558  row_set_mem_owner_->addCountDistinctBuffer(count_distinct_buffer, bitmap_byte_sz, true);
559  return reinterpret_cast<int64_t>(count_distinct_buffer);
560 }
561 
562 int64_t QueryMemoryInitializer::allocateCountDistinctSet() {
563  auto count_distinct_set = new std::set<int64_t>();
564  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
565  return reinterpret_cast<int64_t>(count_distinct_set);
566 }
567 
568 #ifdef HAVE_CUDA
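// Builds the device-side heaps for the streaming top-n path: one heap per GPU thread,
// each sized for n entries, with the node counts zeroed, the row slots set to -1, and
// the keys/values initialized via init_group_by_buffer_on_device.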
569 GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer(
570  const QueryMemoryDescriptor& query_mem_desc,
571  const CUdeviceptr init_agg_vals_dev_ptr,
572  const size_t n,
573  const int device_id,
574  const unsigned block_size_x,
575  const unsigned grid_size_x) {
576  CHECK(device_allocator_);
577  const auto thread_count = block_size_x * grid_size_x;
578  const auto total_buff_size =
579  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
580  CUdeviceptr dev_buffer =
581  reinterpret_cast<CUdeviceptr>(device_allocator_->alloc(total_buff_size));
582 
583  std::vector<CUdeviceptr> dev_buffers(thread_count);
584 
585  for (size_t i = 0; i < thread_count; ++i) {
586  dev_buffers[i] = dev_buffer;
587  }
588 
589  auto dev_ptr = device_allocator_->alloc(thread_count * sizeof(CUdeviceptr));
590  device_allocator_->copyToDevice(dev_ptr,
591  reinterpret_cast<int8_t*>(dev_buffers.data()),
592  thread_count * sizeof(CUdeviceptr));
593 
594  CHECK(query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU));
595 
596  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(dev_buffer),
597  thread_count * sizeof(int64_t));
598 
599  device_allocator_->setDeviceMem(
600  reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
601  (unsigned char)-1,
602  thread_count * n * sizeof(int64_t));
603 
604  init_group_by_buffer_on_device(
605  reinterpret_cast<int64_t*>(
606  dev_buffer + streaming_top_n::get_rows_offset_of_heaps(n, thread_count)),
607  reinterpret_cast<int64_t*>(init_agg_vals_dev_ptr),
608  n * thread_count,
609  query_mem_desc.groupColWidthsSize(),
610  query_mem_desc.getEffectiveKeyWidth(),
611  query_mem_desc.getRowSize() / sizeof(int64_t),
612  query_mem_desc.hasKeylessHash(),
613  1,
614  block_size_x,
615  grid_size_x);
616 
617  return {reinterpret_cast<CUdeviceptr>(dev_ptr), dev_buffer};
618 }
619 
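// Creates the device group-by buffers; when lazy group initialization is enabled for
// the GPU, the buffers are initialized directly on the device (columnar or row-wise)
// instead of being copied from a host-side template.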
620 GpuGroupByBuffers QueryMemoryInitializer::createAndInitializeGroupByBufferGpu(
621  const RelAlgExecutionUnit& ra_exe_unit,
622  const QueryMemoryDescriptor& query_mem_desc,
623  const CUdeviceptr init_agg_vals_dev_ptr,
624  const int device_id,
625  const ExecutorDispatchMode dispatch_mode,
626  const unsigned block_size_x,
627  const unsigned grid_size_x,
628  const int8_t warp_size,
629  const bool can_sort_on_gpu,
630  const bool output_columnar,
631  RenderAllocator* render_allocator) {
632  if (use_streaming_top_n(ra_exe_unit, query_mem_desc.didOutputColumnar())) {
633  if (render_allocator) {
634  throw StreamingTopNNotSupportedInRenderQuery();
635  }
636  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
637  CHECK(!output_columnar);
638 
639  return prepareTopNHeapsDevBuffer(
640  query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
641  }
642 
643  auto dev_group_by_buffers = create_dev_group_by_buffers(device_allocator_,
644  group_by_buffers_,
645  query_mem_desc,
646  block_size_x,
647  grid_size_x,
648  device_id,
649  dispatch_mode,
650  num_rows_,
651  can_sort_on_gpu,
652  false,
653  ra_exe_unit.use_bump_allocator,
654  render_allocator);
655 
656  if (render_allocator) {
657  CHECK_EQ(size_t(0), render_allocator->getAllocatedSize() % 8);
658  }
659  if (query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU)) {
660  CHECK(!render_allocator);
661 
662  const size_t step{query_mem_desc.threadsShareMemory() ? block_size_x : 1};
663  size_t groups_buffer_size{query_mem_desc.getBufferSizeBytes(
664  ExecutorDeviceType::GPU, dev_group_by_buffers.entry_count)};
665  auto group_by_dev_buffer = dev_group_by_buffers.second;
666  const size_t col_count = query_mem_desc.getSlotCount();
667  int8_t* col_widths_dev_ptr{nullptr};
668  if (output_columnar) {
669  std::vector<int8_t> compact_col_widths(col_count);
670  for (size_t idx = 0; idx < col_count; ++idx) {
671  compact_col_widths[idx] = query_mem_desc.getPaddedSlotWidthBytes(idx);
672  }
673  col_widths_dev_ptr = device_allocator_->alloc(col_count * sizeof(int8_t));
674  device_allocator_->copyToDevice(
675  col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
676  }
677  const int8_t warp_count =
678  query_mem_desc.interleavedBins(ExecutorDeviceType::GPU) ? warp_size : 1;
679  for (size_t i = 0; i < getGroupByBuffersSize(); i += step) {
680  if (output_columnar) {
681  init_columnar_group_by_buffer_on_device(
682  reinterpret_cast<int64_t*>(group_by_dev_buffer),
683  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
684  dev_group_by_buffers.entry_count,
685  query_mem_desc.groupColWidthsSize(),
686  col_count,
687  col_widths_dev_ptr,
688  /*need_padding = */ true,
689  query_mem_desc.hasKeylessHash(),
690  sizeof(int64_t),
691  block_size_x,
692  grid_size_x);
693  } else {
694  init_group_by_buffer_on_device(reinterpret_cast<int64_t*>(group_by_dev_buffer),
695  reinterpret_cast<int64_t*>(init_agg_vals_dev_ptr),
696  dev_group_by_buffers.entry_count,
697  query_mem_desc.groupColWidthsSize(),
698  query_mem_desc.getEffectiveKeyWidth(),
699  query_mem_desc.getRowSize() / sizeof(int64_t),
700  query_mem_desc.hasKeylessHash(),
701  warp_count,
702  block_size_x,
703  grid_size_x);
704  }
705  group_by_dev_buffer += groups_buffer_size;
706  }
707  }
708  return dev_group_by_buffers;
709 }
710 #endif
711 
712 size_t QueryMemoryInitializer::computeNumberOfBuffers(
713  const QueryMemoryDescriptor& query_mem_desc,
714  const ExecutorDeviceType device_type,
715  const Executor* executor) const {
716  return device_type == ExecutorDeviceType::CPU
717  ? 1
718  : executor->blockSize() *
719  (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
720 }
721 
722 namespace {
723 
724 // in-place compaction of output buffer
725 void compact_projection_buffer_for_cpu_columnar(
726  const QueryMemoryDescriptor& query_mem_desc,
727  int8_t* projection_buffer,
728  const size_t projection_count) {
729  // the first column (row indices) remains unchanged.
730  CHECK(projection_count <= query_mem_desc.getEntryCount());
731  constexpr size_t row_index_width = sizeof(int64_t);
732  size_t buffer_offset1{projection_count * row_index_width};
733  // other columns are actual non-lazy columns for the projection:
734  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
735  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
736  auto column_proj_size =
737  projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
738  auto buffer_offset2 = query_mem_desc.getColOffInBytes(i);
739  if (buffer_offset1 + column_proj_size >= buffer_offset2) {
740  // overlapping
741  std::memmove(projection_buffer + buffer_offset1,
742  projection_buffer + buffer_offset2,
743  column_proj_size);
744  } else {
745  std::memcpy(projection_buffer + buffer_offset1,
746  projection_buffer + buffer_offset2,
747  column_proj_size);
748  }
749  buffer_offset1 += align_to_int64(column_proj_size);
750  }
751  }
752 }
753 
754 } // namespace
755 
756 void QueryMemoryInitializer::compactProjectionBuffersCpu(
757  const QueryMemoryDescriptor& query_mem_desc,
758  const size_t projection_count) {
759  const auto num_allocated_rows =
760  std::min(projection_count, query_mem_desc.getEntryCount());
761 
762  // copy the results from the main buffer into projection_buffer
763  compact_projection_buffer_for_cpu_columnar(
764  query_mem_desc,
765  reinterpret_cast<int8_t*>(group_by_buffers_[0]),
766  num_allocated_rows);
767 
768  // update the entry count for the result set, and its underlying storage
769  CHECK(!result_sets_.empty());
770  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
771 }
772 
773 void QueryMemoryInitializer::compactProjectionBuffersGpu(
774  const QueryMemoryDescriptor& query_mem_desc,
775  Data_Namespace::DataMgr* data_mgr,
776  const GpuGroupByBuffers& gpu_group_by_buffers,
777  const size_t projection_count,
778  const int device_id) {
779  // store total number of allocated rows:
780  const auto num_allocated_rows =
781  std::min(projection_count, query_mem_desc.getEntryCount());
782 
783  // copy the results from the main buffer into projection_buffer
784  copy_projection_buffer_from_gpu_columnar(
785  data_mgr,
786  gpu_group_by_buffers,
787  query_mem_desc,
788  reinterpret_cast<int8_t*>(group_by_buffers_[0]),
789  num_allocated_rows,
790  device_id);
791 
792  // update the entry count for the result set, and its underlying storage
793  CHECK(!result_sets_.empty());
794  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
795 }
796 
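// Copies the device group-by buffers back to the host, sizing the copy for either the
// streaming top-n heaps or the regular group-by buffer layout.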
797 void QueryMemoryInitializer::copyGroupByBuffersFromGpu(
798  Data_Namespace::DataMgr* data_mgr,
799  const QueryMemoryDescriptor& query_mem_desc,
800  const size_t entry_count,
801  const GpuGroupByBuffers& gpu_group_by_buffers,
802  const RelAlgExecutionUnit& ra_exe_unit,
803  const unsigned block_size_x,
804  const unsigned grid_size_x,
805  const int device_id,
806  const bool prepend_index_buffer) const {
807  const auto thread_count = block_size_x * grid_size_x;
808 
809  size_t total_buff_size{0};
810  if (use_streaming_top_n(ra_exe_unit, query_mem_desc.didOutputColumnar())) {
811  const size_t n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
812  total_buff_size =
813  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
814  } else {
815  total_buff_size =
816  query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
817  }
818  copy_group_by_buffers_from_gpu(data_mgr,
819  group_by_buffers_,
820  total_buff_size,
821  gpu_group_by_buffers.second,
822  query_mem_desc,
823  block_size_x,
824  grid_size_x,
825  device_id,
826  prepend_index_buffer);
827 }
828 
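// Flattens the single CPU streaming top-n heap back into the group-by buffer, keeping
// the query's offset + limit rows.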
829 void QueryMemoryInitializer::applyStreamingTopNOffsetCpu(
830  const QueryMemoryDescriptor& query_mem_desc,
831  const RelAlgExecutionUnit& ra_exe_unit) {
832  CHECK_EQ(group_by_buffers_.size(), size_t(1));
833 
834  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
835  group_by_buffers_[0],
836  query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
837  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit,
838  1);
839  CHECK_EQ(rows_copy.size(),
840  query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
841  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
842 }
843 
844 void QueryMemoryInitializer::applyStreamingTopNOffsetGpu(
845  Data_Namespace::DataMgr* data_mgr,
846  const QueryMemoryDescriptor& query_mem_desc,
847  const GpuGroupByBuffers& gpu_group_by_buffers,
848  const RelAlgExecutionUnit& ra_exe_unit,
849  const unsigned total_thread_count,
850  const int device_id) {
851 #ifdef HAVE_CUDA
852  CHECK_EQ(group_by_buffers_.size(), num_buffers_);
853 
854  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
855  data_mgr,
856  reinterpret_cast<int64_t*>(gpu_group_by_buffers.second),
857  ra_exe_unit,
858  query_mem_desc,
859  total_thread_count,
860  device_id);
861  CHECK_EQ(
862  rows_copy.size(),
863  static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
864  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
865 #else
866  UNREACHABLE();
867 #endif
868 }