OmniSciDB  8fa3bf436f
QueryMemoryInitializer.cpp
1 /*
2  * Copyright 2021 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "QueryMemoryInitializer.h"
18 
19 #include "Execute.h"
20 #include "GpuInitGroups.h"
21 #include "GpuMemUtils.h"
22 #include "Logger/Logger.h"
23 #include "OutputBufferInitialization.h"
24 #include "ResultSet.h"
25 #include "StreamingTopN.h"
26 
27 #include <Shared/checked_alloc.h>
28 
29 // 8 GB, the limit of perfect hash group by under normal conditions
30 int64_t g_bitmap_memory_limit{8LL * 1000 * 1000 * 1000};
31 
32 namespace {
33 
34 void check_total_bitmap_memory(const QueryMemoryDescriptor& query_mem_desc) {
35  const int32_t groups_buffer_entry_count = query_mem_desc.getEntryCount();
36  checked_int64_t total_bytes_per_group = 0;
37  const size_t num_count_distinct_descs =
38  query_mem_desc.getCountDistinctDescriptorsSize();
39  for (size_t i = 0; i < num_count_distinct_descs; i++) {
40  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
41  if (count_distinct_desc.impl_type_ != CountDistinctImplType::Bitmap) {
42  continue;
43  }
44  total_bytes_per_group += count_distinct_desc.bitmapPaddedSizeBytes();
45  }
46  int64_t total_bytes{0};
47  // Using OutOfHostMemory until we can verify that SlabTooBig would also be properly
48  // caught
49  try {
50  total_bytes = static_cast<int64_t>(total_bytes_per_group * groups_buffer_entry_count);
51  } catch (...) {
52  // Absurd amount of memory: merely computing the number of bits overflows int64_t.
53  // Don't bother to report the real amount; this is unlikely to ever happen.
54  throw OutOfHostMemory(std::numeric_limits<int64_t>::max() / 8);
55  }
56  if (total_bytes >= g_bitmap_memory_limit) {
57  throw OutOfHostMemory(total_bytes);
58  }
59 }
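// Illustrative sizing for the check above: a single APPROX_COUNT_DISTINCT target with a
// padded bitmap of 16 KB per group, over a query with 1,000,000 group entries, would need
// roughly 16,384 * 1,000,000 bytes (about 16.4 GB) of bitmap memory. That exceeds the 8 GB
// g_bitmap_memory_limit, so OutOfHostMemory is thrown before any buffers are allocated.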
60 
61 int64_t* alloc_group_by_buffer(const size_t numBytes,
62  RenderAllocatorMap* render_allocator_map,
63  const size_t thread_idx,
64  RowSetMemoryOwner* mem_owner) {
65  if (render_allocator_map) {
66  // NOTE(adb): If we got here, we are performing an in-situ rendering query and are not
67  // using CUDA buffers. Therefore we need to allocate result set storage using CPU
68  // memory.
69  const auto gpu_idx = 0; // Only 1 GPU supported in CUDA-disabled rendering mode
70  auto render_allocator_ptr = render_allocator_map->getRenderAllocator(gpu_idx);
71  return reinterpret_cast<int64_t*>(render_allocator_ptr->alloc(numBytes));
72  } else {
73  return reinterpret_cast<int64_t*>(mem_owner->allocate(numBytes, thread_idx));
74  }
75 }
76 
77 inline int64_t get_consistent_frag_size(const std::vector<uint64_t>& frag_offsets) {
78  if (frag_offsets.size() < 2) {
79  return int64_t(-1);
80  }
81  const auto frag_size = frag_offsets[1] - frag_offsets[0];
82  for (size_t i = 2; i < frag_offsets.size(); ++i) {
83  const auto curr_size = frag_offsets[i] - frag_offsets[i - 1];
84  if (curr_size != frag_size) {
85  return int64_t(-1);
86  }
87  }
88  return !frag_size ? std::numeric_limits<int64_t>::max()
89  : static_cast<int64_t>(frag_size);
90 }
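// Example behavior (illustrative): offsets {0, 32000000, 64000000} have a uniform stride,
// so the consistent fragment size is 32000000; offsets {0, 32000000, 50000000} do not, so
// -1 is returned to signal that no single per-fragment size applies. A stride of zero is
// mapped to std::numeric_limits<int64_t>::max().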
91 
92 inline std::vector<int64_t> get_consistent_frags_sizes(
93  const std::vector<std::vector<uint64_t>>& frag_offsets) {
94  if (frag_offsets.empty()) {
95  return {};
96  }
97  std::vector<int64_t> frag_sizes;
98  for (size_t tab_idx = 0; tab_idx < frag_offsets[0].size(); ++tab_idx) {
99  std::vector<uint64_t> tab_offs;
100  for (auto& offsets : frag_offsets) {
101  tab_offs.push_back(offsets[tab_idx]);
102  }
103  frag_sizes.push_back(get_consistent_frag_size(tab_offs));
104  }
105  return frag_sizes;
106 }
107 
108 inline std::vector<int64_t> get_consistent_frags_sizes(
109  const std::vector<Analyzer::Expr*>& target_exprs,
110  const std::vector<int64_t>& table_frag_sizes) {
111  std::vector<int64_t> col_frag_sizes;
112  for (auto expr : target_exprs) {
113  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
114  if (col_var->get_rte_idx() < 0) {
115  CHECK_EQ(-1, col_var->get_rte_idx());
116  col_frag_sizes.push_back(int64_t(-1));
117  } else {
118  col_frag_sizes.push_back(table_frag_sizes[col_var->get_rte_idx()]);
119  }
120  } else {
121  col_frag_sizes.push_back(int64_t(-1));
122  }
123  }
124  return col_frag_sizes;
125 }
126 
127 inline std::vector<std::vector<int64_t>> get_col_frag_offsets(
128  const std::vector<Analyzer::Expr*>& target_exprs,
129  const std::vector<std::vector<uint64_t>>& table_frag_offsets) {
130  std::vector<std::vector<int64_t>> col_frag_offsets;
131  for (auto& table_offsets : table_frag_offsets) {
132  std::vector<int64_t> col_offsets;
133  for (auto expr : target_exprs) {
134  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
135  if (col_var->get_rte_idx() < 0) {
136  CHECK_EQ(-1, col_var->get_rte_idx());
137  col_offsets.push_back(int64_t(-1));
138  } else {
139  CHECK_LT(static_cast<size_t>(col_var->get_rte_idx()), table_offsets.size());
140  col_offsets.push_back(
141  static_cast<int64_t>(table_offsets[col_var->get_rte_idx()]));
142  }
143  } else {
144  col_offsets.push_back(int64_t(-1));
145  }
146  }
147  col_frag_offsets.push_back(col_offsets);
148  }
149  return col_frag_offsets;
150 }
151 
152 } // namespace
153 
154 // Row-based execution constructor
155 QueryMemoryInitializer::QueryMemoryInitializer(
156  const RelAlgExecutionUnit& ra_exe_unit,
157  const QueryMemoryDescriptor& query_mem_desc,
158  const int device_id,
159  const ExecutorDeviceType device_type,
160  const ExecutorDispatchMode dispatch_mode,
161  const bool output_columnar,
162  const bool sort_on_gpu,
163  const int64_t num_rows,
164  const std::vector<std::vector<const int8_t*>>& col_buffers,
165  const std::vector<std::vector<uint64_t>>& frag_offsets,
166  RenderAllocatorMap* render_allocator_map,
167  RenderInfo* render_info,
168  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
169  DeviceAllocator* device_allocator,
170  const size_t thread_idx,
171  const Executor* executor)
172  : num_rows_(num_rows)
173  , row_set_mem_owner_(row_set_mem_owner)
174  , init_agg_vals_(executor->plan_state_->init_agg_vals_)
175  , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
176  , count_distinct_bitmap_mem_(0)
177  , count_distinct_bitmap_mem_bytes_(0)
178  , count_distinct_bitmap_crt_ptr_(nullptr)
179  , count_distinct_bitmap_host_mem_(nullptr)
180  , device_allocator_(device_allocator)
181  , thread_idx_(thread_idx) {
182  CHECK(!sort_on_gpu || output_columnar);
183 
184  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
185  if (consistent_frag_sizes.empty()) {
186  // No fragments in the input, no underlying buffers will be needed.
187  return;
188  }
189  if (!ra_exe_unit.use_bump_allocator) {
190  check_total_bitmap_memory(query_mem_desc);
191  }
192  if (device_type == ExecutorDeviceType::GPU) {
193  allocateCountDistinctGpuMem(query_mem_desc);
194  }
195 
196  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
197  allocateCountDistinctBuffers(query_mem_desc, false, executor);
198  allocateTDigests(query_mem_desc, false, executor);
199  if (render_info && render_info->useCudaBuffers()) {
200  return;
201  }
202  }
203 
204  if (ra_exe_unit.estimator) {
205  return;
206  }
207 
208  const auto thread_count = device_type == ExecutorDeviceType::GPU
209  ? executor->blockSize() * executor->gridSize()
210  : 1;
211 
212  size_t group_buffer_size{0};
213  if (ra_exe_unit.use_bump_allocator) {
214  // For kernel per fragment execution, just allocate a buffer equivalent to the size of
215  // the fragment
216  if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
217  group_buffer_size = num_rows * query_mem_desc.getRowSize();
218  } else {
219  // otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size
220  group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
221  }
222  } else {
223  group_buffer_size =
224  query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
225  }
226  CHECK_GE(group_buffer_size, size_t(0));
227 
228  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
229  int64_t* group_by_buffer_template{nullptr};
230  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
231  group_by_buffer_template = reinterpret_cast<int64_t*>(
232  row_set_mem_owner_->allocate(group_buffer_size, thread_idx_));
233  initGroupByBuffer(group_by_buffer_template,
234  ra_exe_unit,
235  query_mem_desc,
236  device_type,
237  output_columnar,
238  executor);
239  }
240 
241  if (query_mem_desc.interleavedBins(device_type)) {
242  CHECK(query_mem_desc.hasKeylessHash());
243  }
244 
245  const auto step = device_type == ExecutorDeviceType::GPU &&
246  query_mem_desc.threadsShareMemory() &&
247  query_mem_desc.isGroupBy()
248  ? executor->blockSize()
249  : size_t(1);
250  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
251  query_mem_desc.hasKeylessHash()
252  ? query_mem_desc.getEntryCount()
253  : size_t(0);
254  const auto actual_group_buffer_size =
255  group_buffer_size + index_buffer_qw * sizeof(int64_t);
256  CHECK_GE(actual_group_buffer_size, group_buffer_size);
257 
258  for (size_t i = 0; i < group_buffers_count; i += step) {
259  auto group_by_buffer = alloc_group_by_buffer(actual_group_buffer_size,
260  render_allocator_map,
261  thread_idx_,
262  row_set_mem_owner_.get());
263  if (!query_mem_desc.lazyInitGroups(device_type)) {
264  if (group_by_buffer_template) {
265  memcpy(group_by_buffer + index_buffer_qw,
266  group_by_buffer_template,
267  group_buffer_size);
268  } else {
269  initGroupByBuffer(group_by_buffer + index_buffer_qw,
270  ra_exe_unit,
271  query_mem_desc,
272  device_type,
273  output_columnar,
274  executor);
275  }
276  }
277  group_by_buffers_.push_back(group_by_buffer);
278  for (size_t j = 1; j < step; ++j) {
279  group_by_buffers_.push_back(nullptr);
280  }
281  const auto column_frag_offsets =
282  get_col_frag_offsets(ra_exe_unit.target_exprs, frag_offsets);
283  const auto column_frag_sizes =
284  get_consistent_frags_sizes(ra_exe_unit.target_exprs, consistent_frag_sizes);
285  result_sets_.emplace_back(
286  new ResultSet(target_exprs_to_infos(ra_exe_unit.target_exprs, query_mem_desc),
287  executor->getColLazyFetchInfo(ra_exe_unit.target_exprs),
288  col_buffers,
289  column_frag_offsets,
290  column_frag_sizes,
291  device_type,
292  device_id,
293  query_mem_desc,
294  row_set_mem_owner_,
295  executor->getCatalog(),
296  executor->blockSize(),
297  executor->gridSize()));
298  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
299  executor->plan_state_->init_agg_vals_);
300  for (size_t j = 1; j < step; ++j) {
301  result_sets_.emplace_back(nullptr);
302  }
303  }
304 }
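// A rough sketch of the buffer bookkeeping above: on GPU, when threads in a block share one
// output buffer, `step` equals the block size, so only every step-th slot of group_by_buffers_
// receives a real allocation and the slots in between hold nullptr placeholders (result_sets_
// gets matching nullptr entries). For example, with a block size of 128, a grid size of 16 and
// blocksShareMemory() == false, num_buffers_ is 128 * 16 = 2048 and step is 128, so 16 physical
// buffers are allocated, one per block. index_buffer_qw additionally reserves one int64 per
// entry in front of the buffer when sorting on GPU with a keyless hash layout.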
305 
306 // Table functions execution constructor
307 QueryMemoryInitializer::QueryMemoryInitializer(
308  const TableFunctionExecutionUnit& exe_unit,
309  const QueryMemoryDescriptor& query_mem_desc,
310  const int device_id,
311  const ExecutorDeviceType device_type,
312  const int64_t num_rows,
313  const std::vector<std::vector<const int8_t*>>& col_buffers,
314  const std::vector<std::vector<uint64_t>>& frag_offsets,
315  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
316  DeviceAllocator* device_allocator,
317  const Executor* executor)
318  : num_rows_(num_rows)
319  , row_set_mem_owner_(row_set_mem_owner)
320  , init_agg_vals_(init_agg_val_vec(exe_unit.target_exprs, {}, query_mem_desc))
321  , num_buffers_(/*computeNumberOfBuffers(query_mem_desc, device_type, executor)*/ 1)
322  , count_distinct_bitmap_mem_(0)
323  , count_distinct_bitmap_mem_bytes_(0)
324  , count_distinct_bitmap_crt_ptr_(nullptr)
325  , count_distinct_bitmap_host_mem_(nullptr)
326  , device_allocator_(device_allocator)
327  , thread_idx_(0) {
328  // Table functions output columnar results, so this path is treated like a projection
329  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
330  if (consistent_frag_sizes.empty()) {
331  // No fragments in the input, no underlying buffers will be needed.
332  return;
333  }
334 
335  size_t group_buffer_size{0};
336  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
337  group_buffer_size = num_rows_ * num_columns * sizeof(int64_t);
338  CHECK_GE(group_buffer_size, size_t(0));
339 
340  const auto index_buffer_qw =
341  device_type == ExecutorDeviceType::GPU && query_mem_desc.hasKeylessHash()
342  ? query_mem_desc.getEntryCount()
343  : size_t(0);
344  const auto actual_group_buffer_size =
345  group_buffer_size + index_buffer_qw * sizeof(int64_t);
346  CHECK_GE(actual_group_buffer_size, group_buffer_size);
347 
348  CHECK_EQ(num_buffers_, size_t(1));
349  auto group_by_buffer = alloc_group_by_buffer(
350  actual_group_buffer_size, nullptr, thread_idx_, row_set_mem_owner.get());
351  if (!query_mem_desc.lazyInitGroups(device_type)) {
352  initColumnarGroups(
353  query_mem_desc, group_by_buffer + index_buffer_qw, init_agg_vals_, executor);
354  }
355  group_by_buffers_.push_back(group_by_buffer);
356 
357  const auto column_frag_offsets =
358  get_col_frag_offsets(exe_unit.target_exprs, frag_offsets);
359  const auto column_frag_sizes =
360  get_consistent_frags_sizes(exe_unit.target_exprs, consistent_frag_sizes);
361  result_sets_.emplace_back(
362  new ResultSet(target_exprs_to_infos(exe_unit.target_exprs, query_mem_desc),
363  {},
364  col_buffers,
365  column_frag_offsets,
366  column_frag_sizes,
367  device_type,
368  device_id,
369  query_mem_desc,
370  row_set_mem_owner_,
371  executor->getCatalog(),
372  executor->blockSize(),
373  executor->gridSize()));
374  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
375  init_agg_vals_);
376 }
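// Illustrative sizing for the table function path: with 3 output column slots and
// num_rows_ == 1000, group_buffer_size is 1000 * 3 * sizeof(int64_t) = 24,000 bytes.
// index_buffer_qw is only non-zero for a keyless hash on GPU, so the actual buffer size
// typically matches group_buffer_size here.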
377 
378 void QueryMemoryInitializer::initGroupByBuffer(
379  int64_t* buffer,
380  const RelAlgExecutionUnit& ra_exe_unit,
381  const QueryMemoryDescriptor& query_mem_desc,
382  const ExecutorDeviceType device_type,
383  const bool output_columnar,
384  const Executor* executor) {
385  if (output_columnar) {
386  initColumnarGroups(query_mem_desc, buffer, init_agg_vals_, executor);
387  } else {
388  auto rows_ptr = buffer;
389  auto actual_entry_count = query_mem_desc.getEntryCount();
390  const auto thread_count = device_type == ExecutorDeviceType::GPU
391  ? executor->blockSize() * executor->gridSize()
392  : 1;
393  auto warp_size =
394  query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
395  if (query_mem_desc.useStreamingTopN()) {
396  const auto node_count_size = thread_count * sizeof(int64_t);
397  memset(rows_ptr, 0, node_count_size);
398  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
399  const auto rows_offset = streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
400  memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
401  rows_ptr += rows_offset / sizeof(int64_t);
402  actual_entry_count = n * thread_count;
403  warp_size = 1;
404  }
405  initRowGroups(query_mem_desc,
406  rows_ptr,
407  init_agg_vals_,
408  actual_entry_count,
409  warp_size,
410  executor);
411  }
412 }
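// Streaming top-N layout sketch (row-wise buffers): the buffer begins with one int64
// "node count" per thread (zeroed above), followed by the per-thread heap index area
// which is filled with -1, and finally the row storage starting at
// streaming_top_n::get_rows_offset_of_heaps(n, thread_count), which is initialized as
// n rows per thread by initRowGroups.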
413 
414 void QueryMemoryInitializer::initRowGroups(const QueryMemoryDescriptor& query_mem_desc,
415  int64_t* groups_buffer,
416  const std::vector<int64_t>& init_vals,
417  const int32_t groups_buffer_entry_count,
418  const size_t warp_size,
419  const Executor* executor) {
420  const size_t key_count{query_mem_desc.getGroupbyColCount()};
421  const size_t row_size{query_mem_desc.getRowSize()};
422  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};
423 
424  auto agg_bitmap_size = allocateCountDistinctBuffers(query_mem_desc, true, executor);
425  auto tdigest_deferred = allocateTDigests(query_mem_desc, true, executor);
426  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
427 
428  const auto query_mem_desc_fixedup =
429  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);
430 
431  // not COUNT DISTINCT / APPROX_COUNT_DISTINCT / APPROX_MEDIAN:
432  // we fall back to the default implementation in those cases
433  if (!std::accumulate(agg_bitmap_size.begin(), agg_bitmap_size.end(), 0) &&
434  !std::accumulate(tdigest_deferred.begin(), tdigest_deferred.end(), 0) &&
435  g_optimize_row_initialization) {
436  std::vector<int8_t> sample_row(row_size - col_base_off);
437 
438  initColumnsPerRow(query_mem_desc_fixedup,
439  sample_row.data(),
440  init_vals,
441  agg_bitmap_size,
442  tdigest_deferred);
443 
444  if (query_mem_desc.hasKeylessHash()) {
445  CHECK(warp_size >= 1);
446  CHECK(key_count == 1 || warp_size == 1);
447  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
448  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
449  ++bin, buffer_ptr += row_size) {
450  memcpy(buffer_ptr + col_base_off, sample_row.data(), sample_row.size());
451  }
452  }
453  return;
454  }
455 
456  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
457  ++bin, buffer_ptr += row_size) {
458  memcpy(buffer_ptr + col_base_off, sample_row.data(), sample_row.size());
459  fill_empty_key(
460  buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
461  }
462  } else {
463  if (query_mem_desc.hasKeylessHash()) {
464  CHECK(warp_size >= 1);
465  CHECK(key_count == 1 || warp_size == 1);
466  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
467  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
468  ++bin, buffer_ptr += row_size) {
469  initColumnsPerRow(query_mem_desc_fixedup,
470  &buffer_ptr[col_base_off],
471  init_vals,
472  agg_bitmap_size,
473  tdigest_deferred);
474  }
475  }
476  return;
477  }
478 
479  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
480  ++bin, buffer_ptr += row_size) {
481  fill_empty_key(
482  buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
483  initColumnsPerRow(query_mem_desc_fixedup,
484  &buffer_ptr[col_base_off],
485  init_vals,
486  agg_bitmap_size,
487  tdigest_deferred);
488  }
489  }
490 }
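// Row initialization sketch: when no slot needs a per-row COUNT DISTINCT bitmap or
// APPROX_MEDIAN t-digest and g_optimize_row_initialization is enabled, a single sample
// row is initialized once and memcpy'd into every bin (with fill_empty_key writing the
// empty-key sentinel for non-keyless layouts). Otherwise each bin is initialized slot by
// slot through initColumnsPerRow.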
491 
492 namespace {
493 
494 template <typename T>
495 int8_t* initColumnarBuffer(T* buffer_ptr, const T init_val, const uint32_t entry_count) {
496  static_assert(sizeof(T) <= sizeof(int64_t), "Unsupported template type");
497  for (uint32_t i = 0; i < entry_count; ++i) {
498  buffer_ptr[i] = init_val;
499  }
500  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
501 }
502 
503 } // namespace
504 
505 void QueryMemoryInitializer::initColumnarGroups(
506  const QueryMemoryDescriptor& query_mem_desc,
507  int64_t* groups_buffer,
508  const std::vector<int64_t>& init_vals,
509  const Executor* executor) {
510  CHECK(groups_buffer);
511  for (const auto target_expr : executor->plan_state_->target_exprs_) {
512  const auto agg_info = get_target_info(target_expr, g_bigint_count);
513  CHECK(!is_distinct_target(agg_info));
514  }
515  const int32_t agg_col_count = query_mem_desc.getSlotCount();
516  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
517 
518  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
519  if (!query_mem_desc.hasKeylessHash()) {
520  const size_t key_count{query_mem_desc.getGroupbyColCount()};
521  for (size_t i = 0; i < key_count; ++i) {
522  buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
523  EMPTY_KEY_64,
524  groups_buffer_entry_count);
525  }
526  }
527 
528  if (query_mem_desc.getQueryDescriptionType() != QueryDescriptionType::Projection) {
529  // initializing all aggregate columns:
530  int32_t init_val_idx = 0;
531  for (int32_t i = 0; i < agg_col_count; ++i) {
532  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
533  CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
534  switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
535  case 1:
536  buffer_ptr = initColumnarBuffer<int8_t>(
537  buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
538  break;
539  case 2:
540  buffer_ptr =
541  initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
542  init_vals[init_val_idx++],
543  groups_buffer_entry_count);
544  break;
545  case 4:
546  buffer_ptr =
547  initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
548  init_vals[init_val_idx++],
549  groups_buffer_entry_count);
550  break;
551  case 8:
552  buffer_ptr =
553  initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
554  init_vals[init_val_idx++],
555  groups_buffer_entry_count);
556  break;
557  case 0:
558  break;
559  default:
560  CHECK(false);
561  }
562 
563  buffer_ptr = align_to_int64(buffer_ptr);
564  }
565  }
566  }
567 }
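// Columnar initialization sketch: for a non-keyless layout each group-by key column is
// written first as entry_count copies of EMPTY_KEY_64; aggregate slots are then written as
// their own contiguous columns using each slot's padded width, with the write pointer
// re-aligned to an 8-byte boundary after every column (align_to_int64).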
568 
569 void QueryMemoryInitializer::initColumnsPerRow(
570  const QueryMemoryDescriptor& query_mem_desc,
571  int8_t* row_ptr,
572  const std::vector<int64_t>& init_vals,
573  const std::vector<int64_t>& bitmap_sizes,
574  const std::vector<bool>& tdigest_deferred) {
575  int8_t* col_ptr = row_ptr;
576  size_t init_vec_idx = 0;
577  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
578  col_ptr += query_mem_desc.getNextColOffInBytesRowOnly(col_ptr, col_idx++)) {
579  const int64_t bm_sz{bitmap_sizes[col_idx]};
580  int64_t init_val{0};
581  if (bm_sz && query_mem_desc.isGroupBy()) {
582  // COUNT DISTINCT / APPROX_COUNT_DISTINCT
583  CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
584  sizeof(int64_t));
585  init_val =
586  bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
587  ++init_vec_idx;
588  } else if (query_mem_desc.isGroupBy() && tdigest_deferred[col_idx]) {
589  // allocate for APPROX_MEDIAN only when slot is used
590  init_val = reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest());
591  ++init_vec_idx;
592  } else {
593  if (query_mem_desc.getPaddedSlotWidthBytes(col_idx) > 0) {
594  CHECK_LT(init_vec_idx, init_vals.size());
595  init_val = init_vals[init_vec_idx++];
596  }
597  }
598  switch (query_mem_desc.getPaddedSlotWidthBytes(col_idx)) {
599  case 1:
600  *col_ptr = static_cast<int8_t>(init_val);
601  break;
602  case 2:
603  *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
604  break;
605  case 4:
606  *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
607  break;
608  case 8:
609  *reinterpret_cast<int64_t*>(col_ptr) = init_val;
610  break;
611  case 0:
612  continue;
613  default:
614  CHECK(false);
615  }
616  }
617 }
618 
619 void QueryMemoryInitializer::allocateCountDistinctGpuMem(
620  const QueryMemoryDescriptor& query_mem_desc) {
621  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
622  return;
623  }
624  CHECK(device_allocator_);
625 
626  size_t total_bytes_per_entry{0};
627  const size_t num_count_distinct_descs =
628  query_mem_desc.getCountDistinctDescriptorsSize();
629  for (size_t i = 0; i < num_count_distinct_descs; i++) {
630  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
631  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
632  continue;
633  }
634  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
635  total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
636  }
637 
638  count_distinct_bitmap_mem_bytes_ =
639  total_bytes_per_entry * query_mem_desc.getEntryCount();
640  count_distinct_bitmap_mem_ = reinterpret_cast<CUdeviceptr>(
641  device_allocator_->alloc(count_distinct_bitmap_mem_bytes_));
642  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(count_distinct_bitmap_mem_),
643  count_distinct_bitmap_mem_bytes_);
644 
645  count_distinct_bitmap_crt_ptr_ = count_distinct_bitmap_host_mem_ =
646  row_set_mem_owner_->allocateCountDistinctBuffer(count_distinct_bitmap_mem_bytes_, thread_idx_);
647 }
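// Device-side count-distinct sketch: a single arena of total_bytes_per_entry * getEntryCount()
// bytes is allocated and zeroed on the GPU, and a host-side buffer of the same size is obtained
// from the RowSetMemoryOwner; allocateCountDistinctBitmap() then hands out slices of the host
// copy via the count_distinct_bitmap_crt_ptr_ bump pointer. For example, a 4 KB padded bitmap
// with 65,536 entries needs 4,096 * 65,536 = 256 MiB on the device.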
648 
649 // deferred is true for group by queries; initGroups will allocate a bitmap
650 // for each group slot
651 std::vector<int64_t> QueryMemoryInitializer::allocateCountDistinctBuffers(
652  const QueryMemoryDescriptor& query_mem_desc,
653  const bool deferred,
654  const Executor* executor) {
655  const size_t agg_col_count{query_mem_desc.getSlotCount()};
656  std::vector<int64_t> agg_bitmap_size(deferred ? agg_col_count : 0);
657 
658  CHECK_GE(agg_col_count, executor->plan_state_->target_exprs_.size());
659  for (size_t target_idx = 0; target_idx < executor->plan_state_->target_exprs_.size();
660  ++target_idx) {
661  const auto target_expr = executor->plan_state_->target_exprs_[target_idx];
662  const auto agg_info = get_target_info(target_expr, g_bigint_count);
663  if (is_distinct_target(agg_info)) {
664  CHECK(agg_info.is_agg &&
665  (agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT));
666  CHECK(!agg_info.sql_type.is_varlen());
667 
668  const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
669  CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);
670 
671  CHECK_EQ(static_cast<size_t>(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)),
672  sizeof(int64_t));
673  const auto& count_distinct_desc =
674  query_mem_desc.getCountDistinctDescriptor(target_idx);
675  CHECK(count_distinct_desc.impl_type_ != CountDistinctImplType::Invalid);
676  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
677  const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
678  if (deferred) {
679  agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
680  } else {
681  init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
682  }
683  } else {
684  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::StdSet);
685  if (deferred) {
686  agg_bitmap_size[agg_col_idx] = -1;
687  } else {
688  init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
689  }
690  }
691  }
692  }
693 
694  return agg_bitmap_size;
695 }
696 
697 int64_t QueryMemoryInitializer::allocateCountDistinctBitmap(const size_t bitmap_byte_sz) {
698  if (count_distinct_bitmap_host_mem_) {
699  CHECK(count_distinct_bitmap_crt_ptr_);
700  auto ptr = count_distinct_bitmap_crt_ptr_;
701  count_distinct_bitmap_crt_ptr_ += bitmap_byte_sz;
702  row_set_mem_owner_->addCountDistinctBuffer(
703  ptr, bitmap_byte_sz, /*physical_buffer=*/false);
704  return reinterpret_cast<int64_t>(ptr);
705  }
706  return reinterpret_cast<int64_t>(
707  row_set_mem_owner_->allocateCountDistinctBuffer(bitmap_byte_sz, thread_idx_));
708 }
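// When a GPU count-distinct arena exists (count_distinct_bitmap_host_mem_ is set), bitmaps are
// carved out of it with a simple bump pointer and registered with the RowSetMemoryOwner with
// physical_buffer == false; otherwise each bitmap is allocated and tracked individually by the
// RowSetMemoryOwner.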
709 
710 int64_t QueryMemoryInitializer::allocateCountDistinctSet() {
711  auto count_distinct_set = new std::set<int64_t>();
712  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
713  return reinterpret_cast<int64_t>(count_distinct_set);
714 }
715 
716 std::vector<bool> QueryMemoryInitializer::allocateTDigests(
717  const QueryMemoryDescriptor& query_mem_desc,
718  const bool deferred,
719  const Executor* executor) {
720  size_t const slot_count = query_mem_desc.getSlotCount();
721  size_t const ntargets = executor->plan_state_->target_exprs_.size();
722  CHECK_GE(slot_count, ntargets);
723  std::vector<bool> tdigest_deferred(deferred ? slot_count : 0);
724 
725  for (size_t target_idx = 0; target_idx < ntargets; ++target_idx) {
726  auto const target_expr = executor->plan_state_->target_exprs_[target_idx];
727  if (auto const agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr)) {
728  if (agg_expr->get_aggtype() == kAPPROX_MEDIAN) {
729  size_t const agg_col_idx =
730  query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
731  CHECK_LT(agg_col_idx, slot_count);
732  CHECK_EQ(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx),
733  static_cast<int8_t>(sizeof(int64_t)));
734  if (deferred) {
735  tdigest_deferred[agg_col_idx] = true;
736  } else {
737  // allocate for APPROX_MEDIAN only when slot is used
738  init_agg_vals_[agg_col_idx] =
739  reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest());
740  }
741  }
742  }
743  }
744  return tdigest_deferred;
745 }
746 
747 #ifdef HAVE_CUDA
748 GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer(
749  const QueryMemoryDescriptor& query_mem_desc,
750  const CUdeviceptr init_agg_vals_dev_ptr,
751  const size_t n,
752  const int device_id,
753  const unsigned block_size_x,
754  const unsigned grid_size_x) {
755  CHECK(device_allocator_);
756  const auto thread_count = block_size_x * grid_size_x;
757  const auto total_buff_size =
758  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
759  CUdeviceptr dev_buffer =
760  reinterpret_cast<CUdeviceptr>(device_allocator_->alloc(total_buff_size));
761 
762  std::vector<CUdeviceptr> dev_buffers(thread_count);
763 
764  for (size_t i = 0; i < thread_count; ++i) {
765  dev_buffers[i] = dev_buffer;
766  }
767 
768  auto dev_ptr = device_allocator_->alloc(thread_count * sizeof(CUdeviceptr));
769  device_allocator_->copyToDevice(dev_ptr,
770  reinterpret_cast<int8_t*>(dev_buffers.data()),
771  thread_count * sizeof(CUdeviceptr));
772 
773  CHECK(query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU));
774 
775  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(dev_buffer),
776  thread_count * sizeof(int64_t));
777 
778  device_allocator_->setDeviceMem(
779  reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
780  (unsigned char)-1,
781  thread_count * n * sizeof(int64_t));
782 
783  init_group_by_buffer_on_device(
784  reinterpret_cast<int64_t*>(
785  dev_buffer + streaming_top_n::get_rows_offset_of_heaps(n, thread_count)),
786  reinterpret_cast<int64_t*>(init_agg_vals_dev_ptr),
787  n * thread_count,
788  query_mem_desc.getGroupbyColCount(),
789  query_mem_desc.getEffectiveKeyWidth(),
790  query_mem_desc.getRowSize() / sizeof(int64_t),
791  query_mem_desc.hasKeylessHash(),
792  1,
793  block_size_x,
794  grid_size_x);
795 
796  return {reinterpret_cast<CUdeviceptr>(dev_ptr), dev_buffer};
797 }
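// Device heap buffer sketch: total_buff_size comes from streaming_top_n::get_heap_size();
// the first thread_count int64s are zeroed node counts, the next thread_count * n int64s are
// filled with 0xff (the heap index area), and the row storage after
// get_rows_offset_of_heaps(n, thread_count) is initialized on the device with
// n * thread_count entries. Every per-thread pointer written to dev_ptr refers to the same
// base buffer, since the heaps are laid out contiguously inside it.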
798 
799 GpuGroupByBuffers QueryMemoryInitializer::createAndInitializeGroupByBufferGpu(
800  const RelAlgExecutionUnit& ra_exe_unit,
801  const QueryMemoryDescriptor& query_mem_desc,
802  const CUdeviceptr init_agg_vals_dev_ptr,
803  const int device_id,
804  const ExecutorDispatchMode dispatch_mode,
805  const unsigned block_size_x,
806  const unsigned grid_size_x,
807  const int8_t warp_size,
808  const bool can_sort_on_gpu,
809  const bool output_columnar,
810  RenderAllocator* render_allocator) {
811  if (query_mem_desc.useStreamingTopN()) {
812  if (render_allocator) {
813  throw StreamingTopNNotSupportedInRenderQuery();
814  }
815  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
816  CHECK(!output_columnar);
817 
818  return prepareTopNHeapsDevBuffer(
819  query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
820  }
821 
822  auto dev_group_by_buffers = create_dev_group_by_buffers(device_allocator_,
823  group_by_buffers_,
824  query_mem_desc,
825  block_size_x,
826  grid_size_x,
827  device_id,
828  dispatch_mode,
829  num_rows_,
830  can_sort_on_gpu,
831  false,
832  ra_exe_unit.use_bump_allocator,
833  render_allocator);
834 
835  if (render_allocator) {
836  CHECK_EQ(size_t(0), render_allocator->getAllocatedSize() % 8);
837  }
838  if (query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU)) {
839  CHECK(!render_allocator);
840 
841  const size_t step{query_mem_desc.threadsShareMemory() ? block_size_x : 1};
842  size_t groups_buffer_size{query_mem_desc.getBufferSizeBytes(
843  ExecutorDeviceType::GPU, dev_group_by_buffers.entry_count)};
844  auto group_by_dev_buffer = dev_group_by_buffers.second;
845  const size_t col_count = query_mem_desc.getSlotCount();
846  int8_t* col_widths_dev_ptr{nullptr};
847  if (output_columnar) {
848  std::vector<int8_t> compact_col_widths(col_count);
849  for (size_t idx = 0; idx < col_count; ++idx) {
850  compact_col_widths[idx] = query_mem_desc.getPaddedSlotWidthBytes(idx);
851  }
852  col_widths_dev_ptr = device_allocator_->alloc(col_count * sizeof(int8_t));
853  device_allocator_->copyToDevice(
854  col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
855  }
856  const int8_t warp_count =
857  query_mem_desc.interleavedBins(ExecutorDeviceType::GPU) ? warp_size : 1;
858  for (size_t i = 0; i < getGroupByBuffersSize(); i += step) {
859  if (output_columnar) {
860  init_columnar_group_by_buffer_on_device(
861  reinterpret_cast<int64_t*>(group_by_dev_buffer),
862  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
863  dev_group_by_buffers.entry_count,
864  query_mem_desc.getGroupbyColCount(),
865  col_count,
866  col_widths_dev_ptr,
867  /*need_padding = */ true,
868  query_mem_desc.hasKeylessHash(),
869  sizeof(int64_t),
870  block_size_x,
871  grid_size_x);
872  } else {
873  init_group_by_buffer_on_device(reinterpret_cast<int64_t*>(group_by_dev_buffer),
874  reinterpret_cast<int64_t*>(init_agg_vals_dev_ptr),
875  dev_group_by_buffers.entry_count,
876  query_mem_desc.getGroupbyColCount(),
877  query_mem_desc.getEffectiveKeyWidth(),
878  query_mem_desc.getRowSize() / sizeof(int64_t),
879  query_mem_desc.hasKeylessHash(),
880  warp_count,
881  block_size_x,
882  grid_size_x);
883  }
884  group_by_dev_buffer += groups_buffer_size;
885  }
886  }
887  return dev_group_by_buffers;
888 }
889 
890 GpuGroupByBuffers QueryMemoryInitializer::setupTableFunctionGpuBuffers(
891  const QueryMemoryDescriptor& query_mem_desc,
892  const int device_id,
893  const unsigned block_size_x,
894  const unsigned grid_size_x) {
895  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
896  CHECK_GT(num_columns, size_t(0));
897 
898  const size_t column_size = num_rows_ * sizeof(int64_t);
899  const size_t groups_buffer_size = num_columns * (column_size == 0 ? 1 : column_size);
900  const size_t mem_size =
901  groups_buffer_size * (query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
902 
903  int8_t* dev_buffers_allocation{nullptr};
904  dev_buffers_allocation = device_allocator_->alloc(mem_size);
905  CHECK(dev_buffers_allocation);
906 
907  CUdeviceptr dev_buffers_mem = reinterpret_cast<CUdeviceptr>(dev_buffers_allocation);
908  const size_t step{block_size_x};
909  const size_t num_ptrs{block_size_x * grid_size_x};
910  std::vector<CUdeviceptr> dev_buffers(num_columns * num_ptrs);
911  auto dev_buffer = dev_buffers_mem;
912  for (size_t i = 0; i < num_ptrs; i += step) {
913  for (size_t j = 0; j < step; j += 1) {
914  for (size_t k = 0; k < num_columns; k++) {
915  dev_buffers[(i + j) * num_columns + k] = dev_buffer + k * column_size;
916  }
917  }
918  if (!query_mem_desc.blocksShareMemory()) {
919  dev_buffer += groups_buffer_size;
920  }
921  }
922 
923  auto dev_ptr = device_allocator_->alloc(num_columns * num_ptrs * sizeof(CUdeviceptr));
924  device_allocator_->copyToDevice(dev_ptr,
925  reinterpret_cast<int8_t*>(dev_buffers.data()),
926  num_columns * num_ptrs * sizeof(CUdeviceptr));
927 
928  return {reinterpret_cast<CUdeviceptr>(dev_ptr), dev_buffers_mem, (size_t)num_rows_};
929 }
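// Pointer table sketch: for each of the block_size_x * grid_size_x GPU threads the table stores
// num_columns pointers, one per output column, each offset by k * column_size from the block's
// base. For example, with 2 columns, block_size_x = 2 and grid_size_x = 2 there are 4 * 2 = 8
// pointers; when blocksShareMemory() is false the base advances by groups_buffer_size after each
// block, otherwise every block writes into the same region.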
930 
931 void QueryMemoryInitializer::copyFromTableFunctionGpuBuffers(
932  Data_Namespace::DataMgr* data_mgr,
933  const QueryMemoryDescriptor& query_mem_desc,
934  const size_t entry_count,
935  const GpuGroupByBuffers& gpu_group_by_buffers,
936  const int device_id,
937  const unsigned block_size_x,
938  const unsigned grid_size_x) {
939  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
940  const size_t column_size = entry_count * sizeof(int64_t);
941  const size_t orig_column_size = gpu_group_by_buffers.entry_count * sizeof(int64_t);
942  int8_t* dev_buffer = reinterpret_cast<int8_t*>(gpu_group_by_buffers.second);
943  int8_t* host_buffer = reinterpret_cast<int8_t*>(group_by_buffers_[0]);
944  CHECK_LE(column_size, orig_column_size);
945  if (orig_column_size == column_size) {
946  copy_from_gpu(data_mgr,
947  host_buffer,
948  reinterpret_cast<CUdeviceptr>(dev_buffer),
949  column_size * num_columns,
950  device_id);
951  } else {
952  for (size_t k = 0; k < num_columns; ++k) {
953  copy_from_gpu(data_mgr,
954  host_buffer,
955  reinterpret_cast<CUdeviceptr>(dev_buffer),
956  column_size,
957  device_id);
958  dev_buffer += orig_column_size;
959  host_buffer += column_size;
960  }
961  }
962 }
963 
964 #endif
965 
966 size_t QueryMemoryInitializer::computeNumberOfBuffers(
967  const QueryMemoryDescriptor& query_mem_desc,
968  const ExecutorDeviceType device_type,
969  const Executor* executor) const {
970  return device_type == ExecutorDeviceType::CPU
971  ? 1
972  : executor->blockSize() *
973  (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
974 }
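// Example counts (illustrative): CPU execution always uses a single buffer; on GPU with a block
// size of 128 and a grid size of 16, blocksShareMemory() == true yields 128 buffers and
// blocksShareMemory() == false yields 128 * 16 = 2048.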
975 
976 namespace {
977 
978 // in-place compaction of output buffer
979 void compact_projection_buffer_for_cpu_columnar(
980  const QueryMemoryDescriptor& query_mem_desc,
981  int8_t* projection_buffer,
982  const size_t projection_count) {
983  // the first column (row indices) remains unchanged.
984  CHECK(projection_count <= query_mem_desc.getEntryCount());
985  constexpr size_t row_index_width = sizeof(int64_t);
986  size_t buffer_offset1{projection_count * row_index_width};
987  // other columns are actual non-lazy columns for the projection:
988  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
989  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
990  auto column_proj_size =
991  projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
992  auto buffer_offset2 = query_mem_desc.getColOffInBytes(i);
993  if (buffer_offset1 + column_proj_size >= buffer_offset2) {
994  // overlapping
995  std::memmove(projection_buffer + buffer_offset1,
996  projection_buffer + buffer_offset2,
997  column_proj_size);
998  } else {
999  std::memcpy(projection_buffer + buffer_offset1,
1000  projection_buffer + buffer_offset2,
1001  column_proj_size);
1002  }
1003  buffer_offset1 += align_to_int64(column_proj_size);
1004  }
1005  }
1006 }
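// Compaction sketch: the columnar projection buffer was laid out for the full entry count, so
// once the real projection_count is known each column is slid toward the front of the buffer to
// sit directly after the previous one (memmove when the ranges overlap, memcpy otherwise),
// keeping each column start aligned to 8 bytes.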
1007 
1008 } // namespace
1009 
1010 void QueryMemoryInitializer::compactProjectionBuffersCpu(
1011  const QueryMemoryDescriptor& query_mem_desc,
1012  const size_t projection_count) {
1013  const auto num_allocated_rows =
1014  std::min(projection_count, query_mem_desc.getEntryCount());
1015 
1016  // copy the results from the main buffer into projection_buffer
1017  compact_projection_buffer_for_cpu_columnar(
1018  query_mem_desc,
1019  reinterpret_cast<int8_t*>(group_by_buffers_[0]),
1020  num_allocated_rows);
1021 
1022  // update the entry count for the result set, and its underlying storage
1023  CHECK(!result_sets_.empty());
1024  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
1025 }
1026 
1027 void QueryMemoryInitializer::compactProjectionBuffersGpu(
1028  const QueryMemoryDescriptor& query_mem_desc,
1029  Data_Namespace::DataMgr* data_mgr,
1030  const GpuGroupByBuffers& gpu_group_by_buffers,
1031  const size_t projection_count,
1032  const int device_id) {
1033  // store total number of allocated rows:
1034  const auto num_allocated_rows =
1035  std::min(projection_count, query_mem_desc.getEntryCount());
1036 
1037  // copy the results from the main buffer into projection_buffer
1038  copy_projection_buffer_from_gpu_columnar(
1039  data_mgr,
1040  gpu_group_by_buffers,
1041  query_mem_desc,
1042  reinterpret_cast<int8_t*>(group_by_buffers_[0]),
1043  num_allocated_rows,
1044  device_id);
1045 
1046  // update the entry count for the result set, and its underlying storage
1047  CHECK(!result_sets_.empty());
1048  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
1049 }
1050 
1051 void QueryMemoryInitializer::copyGroupByBuffersFromGpu(
1052  Data_Namespace::DataMgr* data_mgr,
1053  const QueryMemoryDescriptor& query_mem_desc,
1054  const size_t entry_count,
1055  const GpuGroupByBuffers& gpu_group_by_buffers,
1056  const RelAlgExecutionUnit* ra_exe_unit,
1057  const unsigned block_size_x,
1058  const unsigned grid_size_x,
1059  const int device_id,
1060  const bool prepend_index_buffer) const {
1061  const auto thread_count = block_size_x * grid_size_x;
1062 
1063  size_t total_buff_size{0};
1064  if (ra_exe_unit && query_mem_desc.useStreamingTopN()) {
1065  const size_t n = ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit;
1066  total_buff_size =
1067  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
1068  } else {
1069  total_buff_size =
1070  query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
1071  }
1072  copy_group_by_buffers_from_gpu(data_mgr,
1073  group_by_buffers_,
1074  total_buff_size,
1075  gpu_group_by_buffers.second,
1076  query_mem_desc,
1077  block_size_x,
1078  grid_size_x,
1079  device_id,
1080  prepend_index_buffer);
1081 }
1082 
1083 void QueryMemoryInitializer::applyStreamingTopNOffsetCpu(
1084  const QueryMemoryDescriptor& query_mem_desc,
1085  const RelAlgExecutionUnit& ra_exe_unit) {
1086  CHECK_EQ(group_by_buffers_.size(), size_t(1));
1087 
1088  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
1089  group_by_buffers_[0],
1090  query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
1091  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit,
1092  1);
1093  CHECK_EQ(rows_copy.size(),
1094  query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
1095  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
1096 }
1097 
1098 void QueryMemoryInitializer::applyStreamingTopNOffsetGpu(
1099  Data_Namespace::DataMgr* data_mgr,
1100  const QueryMemoryDescriptor& query_mem_desc,
1101  const GpuGroupByBuffers& gpu_group_by_buffers,
1102  const RelAlgExecutionUnit& ra_exe_unit,
1103  const unsigned total_thread_count,
1104  const int device_id) {
1105 #ifdef HAVE_CUDA
1107 
1108  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
1109  data_mgr,
1110  reinterpret_cast<int64_t*>(gpu_group_by_buffers.second),
1111  ra_exe_unit,
1112  query_mem_desc,
1113  total_thread_count,
1114  device_id);
1115  CHECK_EQ(
1116  rows_copy.size(),
1117  static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
1118  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
1119 #else
1120  UNREACHABLE();
1121 #endif
1122 }