QueryMemoryInitializer.cpp
/*
 * Copyright 2019 OmniSci, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "QueryMemoryInitializer.h"

#include "Execute.h"
#include "GpuInitGroups.h"
#include "GpuMemUtils.h"
#include "ResultSet.h"
#include "Shared/Logger.h"
#include "StreamingTopN.h"

#include <Shared/checked_alloc.h>
namespace {

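// Watchdog guard: estimates the worst-case host memory needed for count-distinct
// bitmaps (bitmap bytes per group times the number of group slots) and rejects the
// query with OutOfHostMemory if the total would reach roughly 2 GB, before any
// allocation is attempted.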
void check_total_bitmap_memory(const QueryMemoryDescriptor& query_mem_desc) {
  const int32_t groups_buffer_entry_count = query_mem_desc.getEntryCount();
  if (g_enable_watchdog) {
    checked_int64_t total_bytes_per_group = 0;
    const size_t num_count_distinct_descs =
        query_mem_desc.getCountDistinctDescriptorsSize();
    for (size_t i = 0; i < num_count_distinct_descs; i++) {
      const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
      if (count_distinct_desc.impl_type_ != CountDistinctImplType::Bitmap) {
        continue;
      }
      total_bytes_per_group += count_distinct_desc.bitmapPaddedSizeBytes();
    }
    int64_t total_bytes{0};
    // Using OutOfHostMemory until we can verify that SlabTooBig would also be properly
    // caught
    try {
      total_bytes =
          static_cast<int64_t>(total_bytes_per_group * groups_buffer_entry_count);
    } catch (...) {
      // Absurd amount of memory, merely computing the number of bits overflows int64_t.
      // Don't bother to report the real amount, this is unlikely to ever happen.
      throw OutOfHostMemory(std::numeric_limits<int64_t>::max() / 8);
    }
    if (total_bytes >= 2 * 1000 * 1000 * 1000L) {
      throw OutOfHostMemory(total_bytes);
    }
  }
}

int64_t* alloc_group_by_buffer(const size_t numBytes,
                               RenderAllocatorMap* render_allocator_map,
                               RowSetMemoryOwner* mem_owner) {
  if (render_allocator_map) {
    // NOTE(adb): If we got here, we are performing an in-situ rendering query and are not
    // using CUDA buffers. Therefore we need to allocate result set storage using CPU
    // memory.
    const auto gpu_idx = 0;  // Only 1 GPU supported in CUDA-disabled rendering mode
    auto render_allocator_ptr = render_allocator_map->getRenderAllocator(gpu_idx);
    return reinterpret_cast<int64_t*>(render_allocator_ptr->alloc(numBytes));
  } else {
    return reinterpret_cast<int64_t*>(mem_owner->allocate(numBytes));
  }
}

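// Returns the common fragment stride if all fragments in `frag_offsets` are evenly
// spaced, int64_t(-1) if the strides differ or there are fewer than two offsets, and
// the int64_t maximum as a sentinel when the stride is zero.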
inline int64_t get_consistent_frag_size(const std::vector<uint64_t>& frag_offsets) {
  if (frag_offsets.size() < 2) {
    return int64_t(-1);
  }
  const auto frag_size = frag_offsets[1] - frag_offsets[0];
  for (size_t i = 2; i < frag_offsets.size(); ++i) {
    const auto curr_size = frag_offsets[i] - frag_offsets[i - 1];
    if (curr_size != frag_size) {
      return int64_t(-1);
    }
  }
  return !frag_size ? std::numeric_limits<int64_t>::max()
                    : static_cast<int64_t>(frag_size);
}

inline std::vector<int64_t> get_consistent_frags_sizes(
    const std::vector<std::vector<uint64_t>>& frag_offsets) {
  if (frag_offsets.empty()) {
    return {};
  }
  std::vector<int64_t> frag_sizes;
  for (size_t tab_idx = 0; tab_idx < frag_offsets[0].size(); ++tab_idx) {
    std::vector<uint64_t> tab_offs;
    for (auto& offsets : frag_offsets) {
      tab_offs.push_back(offsets[tab_idx]);
    }
    frag_sizes.push_back(get_consistent_frag_size(tab_offs));
  }
  return frag_sizes;
}

inline std::vector<int64_t> get_consistent_frags_sizes(
    const std::vector<Analyzer::Expr*>& target_exprs,
    const std::vector<int64_t>& table_frag_sizes) {
  std::vector<int64_t> col_frag_sizes;
  for (auto expr : target_exprs) {
    if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
      if (col_var->get_rte_idx() < 0) {
        CHECK_EQ(-1, col_var->get_rte_idx());
        col_frag_sizes.push_back(int64_t(-1));
      } else {
        col_frag_sizes.push_back(table_frag_sizes[col_var->get_rte_idx()]);
      }
    } else {
      col_frag_sizes.push_back(int64_t(-1));
    }
  }
  return col_frag_sizes;
}

inline std::vector<std::vector<int64_t>> get_col_frag_offsets(
    const std::vector<Analyzer::Expr*>& target_exprs,
    const std::vector<std::vector<uint64_t>>& table_frag_offsets) {
  std::vector<std::vector<int64_t>> col_frag_offsets;
  for (auto& table_offsets : table_frag_offsets) {
    std::vector<int64_t> col_offsets;
    for (auto expr : target_exprs) {
      if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
        if (col_var->get_rte_idx() < 0) {
          CHECK_EQ(-1, col_var->get_rte_idx());
          col_offsets.push_back(int64_t(-1));
        } else {
          CHECK_LT(static_cast<size_t>(col_var->get_rte_idx()), table_offsets.size());
          col_offsets.push_back(
              static_cast<int64_t>(table_offsets[col_var->get_rte_idx()]));
        }
      } else {
        col_offsets.push_back(int64_t(-1));
      }
    }
    col_frag_offsets.push_back(col_offsets);
  }
  return col_frag_offsets;
}

}  // namespace

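// Primary constructor: sets up output storage for a query kernel. It allocates (and,
// unless initialization is deferred to the device, pre-initializes) one group-by buffer
// per thread or block as dictated by the descriptor, wires up count-distinct buffers,
// and creates the ResultSet objects that will own the output.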
QueryMemoryInitializer::QueryMemoryInitializer(
    const RelAlgExecutionUnit& ra_exe_unit,
    const QueryMemoryDescriptor& query_mem_desc,
    const int device_id,
    const ExecutorDeviceType device_type,
    const ExecutorDispatchMode dispatch_mode,
    const bool output_columnar,
    const bool sort_on_gpu,
    const int64_t num_rows,
    const std::vector<std::vector<const int8_t*>>& col_buffers,
    const std::vector<std::vector<uint64_t>>& frag_offsets,
    RenderAllocatorMap* render_allocator_map,
    RenderInfo* render_info,
    std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
    DeviceAllocator* device_allocator,
    const Executor* executor)
    : num_rows_(num_rows)
    , row_set_mem_owner_(row_set_mem_owner)
    , init_agg_vals_(executor->plan_state_->init_agg_vals_)
    , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
    , count_distinct_bitmap_mem_(0)
    , count_distinct_bitmap_mem_bytes_(0)
    , count_distinct_bitmap_crt_ptr_(nullptr)
    , count_distinct_bitmap_host_mem_(nullptr)
    , device_allocator_(device_allocator) {
  CHECK(!sort_on_gpu || output_columnar);

  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
  if (consistent_frag_sizes.empty()) {
    // No fragments in the input, no underlying buffers will be needed.
    return;
  }
  if (!ra_exe_unit.use_bump_allocator) {
    check_total_bitmap_memory(query_mem_desc);
  }
  if (device_type == ExecutorDeviceType::GPU) {
    allocateCountDistinctGpuMem(query_mem_desc);
  }

  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
    allocateCountDistinctBuffers(query_mem_desc, false, executor);
    if (render_info && render_info->useCudaBuffers()) {
      return;
    }
  }

  if (ra_exe_unit.estimator) {
    return;
  }

  const auto thread_count = device_type == ExecutorDeviceType::GPU
                                ? executor->blockSize() * executor->gridSize()
                                : 1;

  size_t group_buffer_size{0};
  if (ra_exe_unit.use_bump_allocator) {
    // For kernel per fragment execution, just allocate a buffer equivalent to the size of
    // the fragment
    if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
      group_buffer_size = num_rows * query_mem_desc.getRowSize();
    } else {
      // otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size
      group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
    }
  } else {
    group_buffer_size =
        query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
  }
  CHECK_GE(group_buffer_size, size_t(0));

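  // When buffers are not lazily initialized on the device and more than one buffer is
  // needed, build a single initialized template on the host and memcpy it into each
  // per-thread buffer below, instead of re-running initialization for every buffer.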
  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
  int64_t* group_by_buffer_template{nullptr};
  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
    group_by_buffer_template =
        reinterpret_cast<int64_t*>(row_set_mem_owner_->allocate(group_buffer_size));
    initGroupByBuffer(group_by_buffer_template,
                      ra_exe_unit,
                      query_mem_desc,
                      device_type,
                      output_columnar,
                      executor);
  }

  if (query_mem_desc.interleavedBins(device_type)) {
    CHECK(query_mem_desc.hasKeylessHash());
  }

  const auto step = device_type == ExecutorDeviceType::GPU &&
                            query_mem_desc.threadsShareMemory() &&
                            query_mem_desc.isGroupBy()
                        ? executor->blockSize()
                        : size_t(1);
  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
                                       query_mem_desc.hasKeylessHash()
                                   ? query_mem_desc.getEntryCount()
                                   : size_t(0);
  const auto actual_group_buffer_size =
      group_buffer_size + index_buffer_qw * sizeof(int64_t);
  CHECK_GE(actual_group_buffer_size, group_buffer_size);

  for (size_t i = 0; i < group_buffers_count; i += step) {
    auto group_by_buffer = alloc_group_by_buffer(
        actual_group_buffer_size, render_allocator_map, row_set_mem_owner_.get());
    if (!query_mem_desc.lazyInitGroups(device_type)) {
      if (group_by_buffer_template) {
        memcpy(group_by_buffer + index_buffer_qw,
               group_by_buffer_template,
               group_buffer_size);
      } else {
        initGroupByBuffer(group_by_buffer + index_buffer_qw,
                          ra_exe_unit,
                          query_mem_desc,
                          device_type,
                          output_columnar,
                          executor);
      }
    }
    group_by_buffers_.push_back(group_by_buffer);
    for (size_t j = 1; j < step; ++j) {
      group_by_buffers_.push_back(nullptr);
    }
    const auto column_frag_offsets =
        get_col_frag_offsets(ra_exe_unit.target_exprs, frag_offsets);
    const auto column_frag_sizes =
        get_consistent_frags_sizes(ra_exe_unit.target_exprs, consistent_frag_sizes);
    result_sets_.emplace_back(
        new ResultSet(target_exprs_to_infos(ra_exe_unit.target_exprs, query_mem_desc),
                      executor->getColLazyFetchInfo(ra_exe_unit.target_exprs),
                      col_buffers,
                      column_frag_offsets,
                      column_frag_sizes,
                      device_type,
                      device_id,
                      query_mem_desc,
                      row_set_mem_owner_,
                      executor));
    result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
                                         executor->plan_state_->init_agg_vals_);
    for (size_t j = 1; j < step; ++j) {
      result_sets_.emplace_back(nullptr);
    }
  }
}

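// Table-function variant: output is always columnar and there is exactly one output
// buffer, so the query is effectively treated as a projection over num_rows entries.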
QueryMemoryInitializer::QueryMemoryInitializer(
    const TableFunctionExecutionUnit& exe_unit,
    const QueryMemoryDescriptor& query_mem_desc,
    const int device_id,
    const ExecutorDeviceType device_type,
    const int64_t num_rows,
    const std::vector<std::vector<const int8_t*>>& col_buffers,
    const std::vector<std::vector<uint64_t>>& frag_offsets,
    std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
    DeviceAllocator* device_allocator,
    const Executor* executor)
    : num_rows_(num_rows)
    , row_set_mem_owner_(row_set_mem_owner)
    , init_agg_vals_(init_agg_val_vec(exe_unit.target_exprs, {}, query_mem_desc))
    , num_buffers_(/*computeNumberOfBuffers(query_mem_desc, device_type, executor)*/ 1)
    , count_distinct_bitmap_mem_(0)
    , count_distinct_bitmap_mem_bytes_(0)
    , count_distinct_bitmap_crt_ptr_(nullptr)
    , count_distinct_bitmap_host_mem_(nullptr)
    , device_allocator_(device_allocator) {
  // Table functions output columnar, basically treat this as a projection

  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
  if (consistent_frag_sizes.empty()) {
    // No fragments in the input, no underlying buffers will be needed.
    return;
  }

  size_t group_buffer_size{0};
  // TODO(adb): this is going to give us an index buffer and then the target buffers. this
  // might not be desirable -- revisit
  group_buffer_size = query_mem_desc.getBufferSizeBytes(device_type, num_rows_);
  CHECK_GE(group_buffer_size, size_t(0));

  const auto index_buffer_qw =
      device_type == ExecutorDeviceType::GPU && query_mem_desc.hasKeylessHash()
          ? query_mem_desc.getEntryCount()
          : size_t(0);
  const auto actual_group_buffer_size =
      group_buffer_size + index_buffer_qw * sizeof(int64_t);
  CHECK_GE(actual_group_buffer_size, group_buffer_size);

  CHECK_EQ(num_buffers_, size_t(1));
  auto group_by_buffer =
      alloc_group_by_buffer(actual_group_buffer_size, nullptr, row_set_mem_owner.get());
  if (!query_mem_desc.lazyInitGroups(device_type)) {
    initColumnarGroups(
        query_mem_desc, group_by_buffer + index_buffer_qw, init_agg_vals_, executor);
  }
  group_by_buffers_.push_back(group_by_buffer);

  const auto column_frag_offsets =
      get_col_frag_offsets(exe_unit.target_exprs, frag_offsets);
  const auto column_frag_sizes =
      get_consistent_frags_sizes(exe_unit.target_exprs, consistent_frag_sizes);
  result_sets_.emplace_back(
      new ResultSet(target_exprs_to_infos(exe_unit.target_exprs, query_mem_desc),
                    {},
                    col_buffers,
                    column_frag_offsets,
                    column_frag_sizes,
                    device_type,
                    device_id,
                    query_mem_desc,
                    row_set_mem_owner_,
                    executor));
  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
                                       init_agg_vals_);
}

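// Dispatches to columnar or row-wise initialization. For streaming top-n queries the
// buffer starts with per-thread node counts and heap indices, so the row section is
// offset accordingly before the groups are initialized.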
void QueryMemoryInitializer::initGroupByBuffer(
    int64_t* buffer,
    const RelAlgExecutionUnit& ra_exe_unit,
    const QueryMemoryDescriptor& query_mem_desc,
    const ExecutorDeviceType device_type,
    const bool output_columnar,
    const Executor* executor) {
  if (output_columnar) {
    initColumnarGroups(query_mem_desc, buffer, init_agg_vals_, executor);
  } else {
    auto rows_ptr = buffer;
    auto actual_entry_count = query_mem_desc.getEntryCount();
    const auto thread_count = device_type == ExecutorDeviceType::GPU
                                  ? executor->blockSize() * executor->gridSize()
                                  : 1;
    auto warp_size =
        query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
    if (query_mem_desc.useStreamingTopN()) {
      const auto node_count_size = thread_count * sizeof(int64_t);
      memset(rows_ptr, 0, node_count_size);
      const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
      const auto rows_offset = streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
      memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
      rows_ptr += rows_offset / sizeof(int64_t);
      actual_entry_count = n * thread_count;
      warp_size = 1;
    }
    initGroups(query_mem_desc,
               rows_ptr,
               init_agg_vals_,
               actual_entry_count,
               warp_size,
               executor);
  }
}

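// Row-wise initialization: writes empty keys (unless the hash is keyless) and the
// per-column initial aggregate values for every bin, honoring warp interleaving when
// bins are interleaved.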
void QueryMemoryInitializer::initGroups(const QueryMemoryDescriptor& query_mem_desc,
                                        int64_t* groups_buffer,
                                        const std::vector<int64_t>& init_vals,
                                        const int32_t groups_buffer_entry_count,
                                        const size_t warp_size,
                                        const Executor* executor) {
  const size_t key_count{query_mem_desc.getGroupbyColCount()};
  const size_t row_size{query_mem_desc.getRowSize()};
  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};

  auto agg_bitmap_size = allocateCountDistinctBuffers(query_mem_desc, true, executor);
  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);

  const auto query_mem_desc_fixedup =
      ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);

  if (query_mem_desc.hasKeylessHash()) {
    CHECK(warp_size >= 1);
    CHECK(key_count == 1 || warp_size == 1);
    for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
      for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
           ++bin, buffer_ptr += row_size) {
        initColumnPerRow(query_mem_desc_fixedup,
                         &buffer_ptr[col_base_off],
                         bin,
                         init_vals,
                         agg_bitmap_size);
      }
    }
    return;
  }

  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
       ++bin, buffer_ptr += row_size) {
    fill_empty_key(buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
    initColumnPerRow(query_mem_desc_fixedup,
                     &buffer_ptr[col_base_off],
                     bin,
                     init_vals,
                     agg_bitmap_size);
  }
}

namespace {

template <typename T>
int8_t* initColumnarBuffer(T* buffer_ptr, const T init_val, const uint32_t entry_count) {
  static_assert(sizeof(T) <= sizeof(int64_t), "Unsupported template type");
  for (uint32_t i = 0; i < entry_count; ++i) {
    buffer_ptr[i] = init_val;
  }
  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
}

}  // namespace

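// Columnar initialization: fills each key column with EMPTY_KEY_64, then (for
// non-projection queries) writes the initial aggregate value for every slot, column by
// column, padding each column out to an 8-byte boundary.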
void QueryMemoryInitializer::initColumnarGroups(
    const QueryMemoryDescriptor& query_mem_desc,
    int64_t* groups_buffer,
    const std::vector<int64_t>& init_vals,
    const Executor* executor) {
  CHECK(groups_buffer);
  for (const auto target_expr : executor->plan_state_->target_exprs_) {
    const auto agg_info = get_target_info(target_expr, g_bigint_count);
    CHECK(!is_distinct_target(agg_info));
  }
  const int32_t agg_col_count = query_mem_desc.getSlotCount();
  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);

  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
  if (!query_mem_desc.hasKeylessHash()) {
    const size_t key_count{query_mem_desc.getGroupbyColCount()};
    for (size_t i = 0; i < key_count; ++i) {
      buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                               EMPTY_KEY_64,
                                               groups_buffer_entry_count);
    }
  }

  if (query_mem_desc.getQueryDescriptionType() != QueryDescriptionType::Projection) {
    // initializing all aggregate columns:
    int32_t init_val_idx = 0;
    for (int32_t i = 0; i < agg_col_count; ++i) {
      if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
        CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
        switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
          case 1:
            buffer_ptr = initColumnarBuffer<int8_t>(
                buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
            break;
          case 2:
            buffer_ptr =
                initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
                                            init_vals[init_val_idx++],
                                            groups_buffer_entry_count);
            break;
          case 4:
            buffer_ptr =
                initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
                                            init_vals[init_val_idx++],
                                            groups_buffer_entry_count);
            break;
          case 8:
            buffer_ptr =
                initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                            init_vals[init_val_idx++],
                                            groups_buffer_entry_count);
            break;
          case 0:
            break;
          default:
            CHECK(false);
        }

        buffer_ptr = align_to_int64(buffer_ptr);
      }
    }
  }
}

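// Initializes a single row: each slot gets its initial aggregate value, except
// count-distinct slots in group-by queries, which receive a freshly allocated bitmap or
// std::set whose address is stored in the slot.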
void QueryMemoryInitializer::initColumnPerRow(const QueryMemoryDescriptor& query_mem_desc,
                                              int8_t* row_ptr,
                                              const size_t bin,
                                              const std::vector<int64_t>& init_vals,
                                              const std::vector<ssize_t>& bitmap_sizes) {
  int8_t* col_ptr = row_ptr;
  size_t init_vec_idx = 0;
  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
       col_ptr += query_mem_desc.getNextColOffInBytes(col_ptr, bin, col_idx++)) {
    const ssize_t bm_sz{bitmap_sizes[col_idx]};
    int64_t init_val{0};
    if (!bm_sz || !query_mem_desc.isGroupBy()) {
      if (query_mem_desc.getPaddedSlotWidthBytes(col_idx) > 0) {
        CHECK_LT(init_vec_idx, init_vals.size());
        init_val = init_vals[init_vec_idx++];
      }
    } else {
      CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
               sizeof(int64_t));
      init_val =
          bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
      ++init_vec_idx;
    }
    switch (query_mem_desc.getPaddedSlotWidthBytes(col_idx)) {
      case 1:
        *col_ptr = static_cast<int8_t>(init_val);
        break;
      case 2:
        *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
        break;
      case 4:
        *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
        break;
      case 8:
        *reinterpret_cast<int64_t*>(col_ptr) = init_val;
        break;
      case 0:
        continue;
      default:
        CHECK(false);
    }
  }
}

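// Allocates one contiguous device buffer holding the count-distinct bitmaps for every
// entry, zeroes it, and mirrors it with a host-side buffer owned by the
// RowSetMemoryOwner.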
void QueryMemoryInitializer::allocateCountDistinctGpuMem(
    const QueryMemoryDescriptor& query_mem_desc) {
  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
    return;
  }

  size_t total_bytes_per_entry{0};
  const size_t num_count_distinct_descs =
      query_mem_desc.getCountDistinctDescriptorsSize();
  for (size_t i = 0; i < num_count_distinct_descs; i++) {
    const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
    if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
      continue;
    }
    CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
    total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
  }

  count_distinct_bitmap_mem_bytes_ =
      total_bytes_per_entry * query_mem_desc.getEntryCount();
  count_distinct_bitmap_mem_ = reinterpret_cast<CUdeviceptr>(
      device_allocator_->alloc(count_distinct_bitmap_mem_bytes_));
  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(count_distinct_bitmap_mem_),
                                   count_distinct_bitmap_mem_bytes_);

  // TODO(adb): use allocator
  count_distinct_bitmap_host_mem_ =
      static_cast<int8_t*>(checked_malloc(count_distinct_bitmap_mem_bytes_));
  row_set_mem_owner_->addCountDistinctBuffer(
      count_distinct_bitmap_host_mem_, count_distinct_bitmap_mem_bytes_, true);
}

// deferred is true for group by queries; initGroups will allocate a bitmap
// for each group slot
std::vector<ssize_t> QueryMemoryInitializer::allocateCountDistinctBuffers(
    const QueryMemoryDescriptor& query_mem_desc,
    const bool deferred,
    const Executor* executor) {
  const size_t agg_col_count{query_mem_desc.getSlotCount()};
  std::vector<ssize_t> agg_bitmap_size(deferred ? agg_col_count : 0);

  CHECK_GE(agg_col_count, executor->plan_state_->target_exprs_.size());
  for (size_t target_idx = 0; target_idx < executor->plan_state_->target_exprs_.size();
       ++target_idx) {
    const auto target_expr = executor->plan_state_->target_exprs_[target_idx];
    const auto agg_info = get_target_info(target_expr, g_bigint_count);
    if (is_distinct_target(agg_info)) {
      CHECK(agg_info.is_agg &&
            (agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT));
      CHECK(!agg_info.sql_type.is_varlen());

      const auto agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
      CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);

      CHECK_EQ(static_cast<size_t>(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)),
               sizeof(int64_t));
      const auto& count_distinct_desc =
          query_mem_desc.getCountDistinctDescriptor(target_idx);
      CHECK(count_distinct_desc.impl_type_ != CountDistinctImplType::Invalid);
      if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
        const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
        if (deferred) {
          agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
        } else {
          init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
        }
      } else {
        CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::StdSet);
        if (deferred) {
          agg_bitmap_size[agg_col_idx] = -1;
        } else {
          init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
        }
      }
    }
  }

  return agg_bitmap_size;
}

int64_t QueryMemoryInitializer::allocateCountDistinctBitmap(const size_t bitmap_byte_sz) {
  if (count_distinct_bitmap_host_mem_) {
    CHECK(count_distinct_bitmap_crt_ptr_);
    auto ptr = count_distinct_bitmap_crt_ptr_;
    count_distinct_bitmap_crt_ptr_ += bitmap_byte_sz;
    row_set_mem_owner_->addCountDistinctBuffer(ptr, bitmap_byte_sz, false);
    return reinterpret_cast<int64_t>(ptr);
  }
  auto count_distinct_buffer = static_cast<int8_t*>(checked_calloc(bitmap_byte_sz, 1));
  row_set_mem_owner_->addCountDistinctBuffer(count_distinct_buffer, bitmap_byte_sz, true);
  return reinterpret_cast<int64_t>(count_distinct_buffer);
}

int64_t QueryMemoryInitializer::allocateCountDistinctSet() {
  auto count_distinct_set = new std::set<int64_t>();
  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
  return reinterpret_cast<int64_t>(count_distinct_set);
}

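// Builds the device-side layout used by the streaming top-n path: per-thread node
// counts (zeroed), followed by heap index slots (filled with -1), followed by the row
// storage, which is initialized like a regular row-wise group-by buffer.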
#ifdef HAVE_CUDA
GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer(
    const QueryMemoryDescriptor& query_mem_desc,
    const CUdeviceptr init_agg_vals_dev_ptr,
    const size_t n,
    const int device_id,
    const unsigned block_size_x,
    const unsigned grid_size_x) {
  const auto thread_count = block_size_x * grid_size_x;
  const auto total_buff_size =
      streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
  CUdeviceptr dev_buffer =
      reinterpret_cast<CUdeviceptr>(device_allocator_->alloc(total_buff_size));

  std::vector<CUdeviceptr> dev_buffers(thread_count);

  for (size_t i = 0; i < thread_count; ++i) {
    dev_buffers[i] = dev_buffer;
  }

  auto dev_ptr = device_allocator_->alloc(thread_count * sizeof(CUdeviceptr));
  device_allocator_->copyToDevice(dev_ptr,
                                  reinterpret_cast<int8_t*>(dev_buffers.data()),
                                  thread_count * sizeof(CUdeviceptr));

  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(dev_buffer),
                                   thread_count * sizeof(int64_t));

  device_allocator_->setDeviceMem(
      reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
      (unsigned char)-1,
      thread_count * n * sizeof(int64_t));

  init_group_by_buffer_on_device(
      reinterpret_cast<int64_t*>(
          dev_buffer + streaming_top_n::get_rows_offset_of_heaps(n, thread_count)),
      reinterpret_cast<int64_t*>(init_agg_vals_dev_ptr),
      n * thread_count,
      query_mem_desc.getGroupbyColCount(),
      query_mem_desc.getEffectiveKeyWidth(),
      query_mem_desc.getRowSize() / sizeof(int64_t),
      query_mem_desc.hasKeylessHash(),
      1,
      block_size_x,
      grid_size_x);

  return {reinterpret_cast<CUdeviceptr>(dev_ptr), dev_buffer};
}

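// Creates the device group-by buffers for a kernel launch. Streaming top-n queries get
// the dedicated heap layout above; otherwise buffers come from
// create_dev_group_by_buffers and, when lazy initialization is enabled, are initialized
// directly on the device (columnar or row-wise).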
GpuGroupByBuffers QueryMemoryInitializer::createAndInitializeGroupByBufferGpu(
    const RelAlgExecutionUnit& ra_exe_unit,
    const QueryMemoryDescriptor& query_mem_desc,
    const CUdeviceptr init_agg_vals_dev_ptr,
    const int device_id,
    const ExecutorDispatchMode dispatch_mode,
    const unsigned block_size_x,
    const unsigned grid_size_x,
    const int8_t warp_size,
    const bool can_sort_on_gpu,
    const bool output_columnar,
    RenderAllocator* render_allocator) {
  if (query_mem_desc.useStreamingTopN()) {
    if (render_allocator) {
      throw StreamingTopNNotSupportedInRenderQuery();
    }
    const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
    CHECK(!output_columnar);

    return prepareTopNHeapsDevBuffer(
        query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
  }

  auto dev_group_by_buffers = create_dev_group_by_buffers(device_allocator_,
                                                          group_by_buffers_,
                                                          query_mem_desc,
                                                          block_size_x,
                                                          grid_size_x,
                                                          device_id,
                                                          dispatch_mode,
                                                          num_rows_,
                                                          can_sort_on_gpu,
                                                          false,
                                                          ra_exe_unit.use_bump_allocator,
                                                          render_allocator);

  if (render_allocator) {
    CHECK_EQ(size_t(0), render_allocator->getAllocatedSize() % 8);
  }
  if (query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU)) {
    CHECK(!render_allocator);

    const size_t step{query_mem_desc.threadsShareMemory() ? block_size_x : 1};
    size_t groups_buffer_size{query_mem_desc.getBufferSizeBytes(
        ExecutorDeviceType::GPU, dev_group_by_buffers.entry_count)};
    auto group_by_dev_buffer = dev_group_by_buffers.second;
    const size_t col_count = query_mem_desc.getSlotCount();
    int8_t* col_widths_dev_ptr{nullptr};
    if (output_columnar) {
      std::vector<int8_t> compact_col_widths(col_count);
      for (size_t idx = 0; idx < col_count; ++idx) {
        compact_col_widths[idx] = query_mem_desc.getPaddedSlotWidthBytes(idx);
      }
      col_widths_dev_ptr = device_allocator_->alloc(col_count * sizeof(int8_t));
      device_allocator_->copyToDevice(
          col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
    }
    const int8_t warp_count =
        query_mem_desc.interleavedBins(ExecutorDeviceType::GPU) ? warp_size : 1;
    for (size_t i = 0; i < getGroupByBuffersSize(); i += step) {
      if (output_columnar) {
        init_columnar_group_by_buffer_on_device(
            reinterpret_cast<int64_t*>(group_by_dev_buffer),
            reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
            dev_group_by_buffers.entry_count,
            query_mem_desc.getGroupbyColCount(),
            col_count,
            col_widths_dev_ptr,
            /*need_padding = */ true,
            query_mem_desc.hasKeylessHash(),
            sizeof(int64_t),
            block_size_x,
            grid_size_x);
      } else {
        init_group_by_buffer_on_device(reinterpret_cast<int64_t*>(group_by_dev_buffer),
                                       reinterpret_cast<int64_t*>(init_agg_vals_dev_ptr),
                                       dev_group_by_buffers.entry_count,
                                       query_mem_desc.getGroupbyColCount(),
                                       query_mem_desc.getEffectiveKeyWidth(),
                                       query_mem_desc.getRowSize() / sizeof(int64_t),
                                       query_mem_desc.hasKeylessHash(),
                                       warp_count,
                                       block_size_x,
                                       grid_size_x);
      }
      group_by_dev_buffer += groups_buffer_size;
    }
  }
  return dev_group_by_buffers;
}

GpuGroupByBuffers QueryMemoryInitializer::setupTableFunctionGpuBuffers(
    const QueryMemoryDescriptor& query_mem_desc,
    const int device_id,
    const unsigned block_size_x,
    const unsigned grid_size_x) {
  return create_dev_group_by_buffers(device_allocator_,
                                     group_by_buffers_,
                                     query_mem_desc,
                                     block_size_x,
                                     grid_size_x,
                                     device_id,
                                     ExecutorDispatchMode::KernelPerFragment,
                                     num_rows_,
                                     false,
                                     false,
                                     false,
                                     nullptr);
}

#endif

size_t QueryMemoryInitializer::computeNumberOfBuffers(
    const QueryMemoryDescriptor& query_mem_desc,
    const ExecutorDeviceType device_type,
    const Executor* executor) const {
  return device_type == ExecutorDeviceType::CPU
             ? 1
             : executor->blockSize() *
                   (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
}

namespace {

// in-place compaction of output buffer
void compact_projection_buffer_for_cpu_columnar(
    const QueryMemoryDescriptor& query_mem_desc,
    int8_t* projection_buffer,
    const size_t projection_count) {
  // the first column (row indices) remains unchanged.
  CHECK(projection_count <= query_mem_desc.getEntryCount());
  constexpr size_t row_index_width = sizeof(int64_t);
  size_t buffer_offset1{projection_count * row_index_width};
  // other columns are actual non-lazy columns for the projection:
  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
    if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
      auto column_proj_size =
          projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
      auto buffer_offset2 = query_mem_desc.getColOffInBytes(i);
      if (buffer_offset1 + column_proj_size >= buffer_offset2) {
        // overlapping
        std::memmove(projection_buffer + buffer_offset1,
                     projection_buffer + buffer_offset2,
                     column_proj_size);
      } else {
        std::memcpy(projection_buffer + buffer_offset1,
                    projection_buffer + buffer_offset2,
                    column_proj_size);
      }
      buffer_offset1 += align_to_int64(column_proj_size);
    }
  }
}

}  // namespace

void QueryMemoryInitializer::compactProjectionBuffersCpu(
    const QueryMemoryDescriptor& query_mem_desc,
    const size_t projection_count) {
  const auto num_allocated_rows =
      std::min(projection_count, query_mem_desc.getEntryCount());

  // copy the results from the main buffer into projection_buffer
  compact_projection_buffer_for_cpu_columnar(
      query_mem_desc,
      reinterpret_cast<int8_t*>(group_by_buffers_[0]),
      num_allocated_rows);

  // update the entry count for the result set, and its underlying storage
  CHECK(!result_sets_.empty());
  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
}

void QueryMemoryInitializer::compactProjectionBuffersGpu(
    const QueryMemoryDescriptor& query_mem_desc,
    Data_Namespace::DataMgr* data_mgr,
    const GpuGroupByBuffers& gpu_group_by_buffers,
    const size_t projection_count,
    const int device_id) {
  // store total number of allocated rows:
  const auto num_allocated_rows =
      std::min(projection_count, query_mem_desc.getEntryCount());

  // copy the results from the main buffer into projection_buffer
  copy_projection_buffer_from_gpu_columnar(
      data_mgr,
      gpu_group_by_buffers,
      query_mem_desc,
      reinterpret_cast<int8_t*>(group_by_buffers_[0]),
      num_allocated_rows,
      device_id);

  // update the entry count for the result set, and its underlying storage
  CHECK(!result_sets_.empty());
  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
}

void QueryMemoryInitializer::copyGroupByBuffersFromGpu(
    Data_Namespace::DataMgr* data_mgr,
    const QueryMemoryDescriptor& query_mem_desc,
    const size_t entry_count,
    const GpuGroupByBuffers& gpu_group_by_buffers,
    const RelAlgExecutionUnit* ra_exe_unit,
    const unsigned block_size_x,
    const unsigned grid_size_x,
    const int device_id,
    const bool prepend_index_buffer) const {
  const auto thread_count = block_size_x * grid_size_x;

  size_t total_buff_size{0};
  if (ra_exe_unit && query_mem_desc.useStreamingTopN()) {
    const size_t n = ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit;
    total_buff_size =
        streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
  } else {
    total_buff_size =
        query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
  }
  copy_group_by_buffers_from_gpu(data_mgr,
                                 group_by_buffers_,
                                 total_buff_size,
                                 gpu_group_by_buffers.second,
                                 query_mem_desc,
                                 block_size_x,
                                 grid_size_x,
                                 device_id,
                                 prepend_index_buffer);
}

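// The streaming top-n heaps hold more slots than the final result; these helpers
// collapse the per-thread heaps into the first group-by buffer so the ResultSet sees a
// dense row buffer (directly on the CPU, or after picking the top rows back from the
// GPU heaps).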
void QueryMemoryInitializer::applyStreamingTopNOffsetCpu(
    const QueryMemoryDescriptor& query_mem_desc,
    const RelAlgExecutionUnit& ra_exe_unit) {
  CHECK_EQ(group_by_buffers_.size(), size_t(1));

  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
      group_by_buffers_[0],
      query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
      ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit,
      1);
  CHECK_EQ(rows_copy.size(),
           query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
}

void QueryMemoryInitializer::applyStreamingTopNOffsetGpu(
    Data_Namespace::DataMgr* data_mgr,
    const QueryMemoryDescriptor& query_mem_desc,
    const GpuGroupByBuffers& gpu_group_by_buffers,
    const RelAlgExecutionUnit& ra_exe_unit,
    const unsigned total_thread_count,
    const int device_id) {
#ifdef HAVE_CUDA
  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
      data_mgr,
      reinterpret_cast<int64_t*>(gpu_group_by_buffers.second),
      ra_exe_unit,
      query_mem_desc,
      total_thread_count,
      device_id);
  CHECK_EQ(
      rows_copy.size(),
      static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
#else
  UNREACHABLE();
#endif
}