QueryMemoryInitializer.cpp
/*
 * Copyright 2019 OmniSci, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "QueryMemoryInitializer.h"

#include "Execute.h"
#include "GpuInitGroups.h"
#include "GpuMemUtils.h"
#include "ResultSet.h"
#include "Shared/Logger.h"
#include "StreamingTopN.h"

#include <Shared/checked_alloc.h>

namespace {

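// Watchdog check: estimates the total count-distinct bitmap memory required across
// all group slots and throws OutOfHostMemory early instead of attempting the
// allocation.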
void check_total_bitmap_memory(const QueryMemoryDescriptor& query_mem_desc) {
  const int32_t groups_buffer_entry_count = query_mem_desc.getEntryCount();
  if (g_enable_watchdog) {
    checked_int64_t total_bytes_per_group = 0;
    const size_t num_count_distinct_descs =
        query_mem_desc.getCountDistinctDescriptorsSize();
    for (size_t i = 0; i < num_count_distinct_descs; i++) {
      const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
      if (count_distinct_desc.impl_type_ != CountDistinctImplType::Bitmap) {
        continue;
      }
      total_bytes_per_group += count_distinct_desc.bitmapPaddedSizeBytes();
    }
    int64_t total_bytes{0};
    // Using OutOfHostMemory until we can verify that SlabTooBig would also be properly
    // caught
    try {
      total_bytes =
          static_cast<int64_t>(total_bytes_per_group * groups_buffer_entry_count);
    } catch (...) {
      // Absurd amount of memory, merely computing the number of bits overflows int64_t.
      // Don't bother to report the real amount, this is unlikely to ever happen.
      throw OutOfHostMemory(std::numeric_limits<int64_t>::max() / 8);
    }
    if (total_bytes >= 2 * 1000 * 1000 * 1000L) {
      throw OutOfHostMemory(total_bytes);
    }
  }
}

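// Allocates group-by output storage either through the render allocator (in-situ
// rendering without CUDA buffers) or from checked host memory.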
int64_t* alloc_group_by_buffer(const size_t numBytes,
                               RenderAllocatorMap* render_allocator_map) {
  if (render_allocator_map) {
    // NOTE(adb): If we got here, we are performing an in-situ rendering query and are
    // not using CUDA buffers. Therefore we need to allocate result set storage using
    // CPU memory.
    const auto gpu_idx = 0;  // Only 1 GPU supported in CUDA-disabled rendering mode
    auto render_allocator_ptr = render_allocator_map->getRenderAllocator(gpu_idx);
    return reinterpret_cast<int64_t*>(render_allocator_ptr->alloc(numBytes));
  } else {
    return reinterpret_cast<int64_t*>(checked_malloc(numBytes));
  }
}

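// Returns the common fragment stride if all fragments are equally sized, -1 when the
// sizes differ, and int64_t max for a zero stride.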
inline int64_t get_consistent_frag_size(const std::vector<uint64_t>& frag_offsets) {
  if (frag_offsets.size() < 2) {
    return int64_t(-1);
  }
  const auto frag_size = frag_offsets[1] - frag_offsets[0];
  for (size_t i = 2; i < frag_offsets.size(); ++i) {
    const auto curr_size = frag_offsets[i] - frag_offsets[i - 1];
    if (curr_size != frag_size) {
      return int64_t(-1);
    }
  }
  return !frag_size ? std::numeric_limits<int64_t>::max()
                    : static_cast<int64_t>(frag_size);
}

inline std::vector<int64_t> get_consistent_frags_sizes(
    const std::vector<std::vector<uint64_t>>& frag_offsets) {
  if (frag_offsets.empty()) {
    return {};
  }
  std::vector<int64_t> frag_sizes;
  for (size_t tab_idx = 0; tab_idx < frag_offsets[0].size(); ++tab_idx) {
    std::vector<uint64_t> tab_offs;
    for (auto& offsets : frag_offsets) {
      tab_offs.push_back(offsets[tab_idx]);
    }
    frag_sizes.push_back(get_consistent_frag_size(tab_offs));
  }
  return frag_sizes;
}

inline std::vector<int64_t> get_consistent_frags_sizes(
    const std::vector<Analyzer::Expr*>& target_exprs,
    const std::vector<int64_t>& table_frag_sizes) {
  std::vector<int64_t> col_frag_sizes;
  for (auto expr : target_exprs) {
    if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
      if (col_var->get_rte_idx() < 0) {
        CHECK_EQ(-1, col_var->get_rte_idx());
        col_frag_sizes.push_back(int64_t(-1));
      } else {
        col_frag_sizes.push_back(table_frag_sizes[col_var->get_rte_idx()]);
      }
    } else {
      col_frag_sizes.push_back(int64_t(-1));
    }
  }
  return col_frag_sizes;
}

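// Per-fragment frag offsets for every target expression; non-column targets and
// column variables with a negative range-table index get a sentinel of -1.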
inline std::vector<std::vector<int64_t>> get_col_frag_offsets(
    const std::vector<Analyzer::Expr*>& target_exprs,
    const std::vector<std::vector<uint64_t>>& table_frag_offsets) {
  std::vector<std::vector<int64_t>> col_frag_offsets;
  for (auto& table_offsets : table_frag_offsets) {
    std::vector<int64_t> col_offsets;
    for (auto expr : target_exprs) {
      if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
        if (col_var->get_rte_idx() < 0) {
          CHECK_EQ(-1, col_var->get_rte_idx());
          col_offsets.push_back(int64_t(-1));
        } else {
          CHECK_LT(static_cast<size_t>(col_var->get_rte_idx()), table_offsets.size());
          col_offsets.push_back(
              static_cast<int64_t>(table_offsets[col_var->get_rte_idx()]));
        }
      } else {
        col_offsets.push_back(int64_t(-1));
      }
    }
    col_frag_offsets.push_back(col_offsets);
  }
  return col_frag_offsets;
}

}  // namespace
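
// Sets up output buffers for a regular execution unit: allocates count-distinct
// storage, builds a fully initialized group-by buffer template when groups are not
// lazily initialized on the device, and creates one ResultSet per group buffer.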
151 
153  const RelAlgExecutionUnit& ra_exe_unit,
155  const int device_id,
156  const ExecutorDeviceType device_type,
157  const ExecutorDispatchMode dispatch_mode,
158  const bool output_columnar,
159  const bool sort_on_gpu,
160  const int64_t num_rows,
161  const std::vector<std::vector<const int8_t*>>& col_buffers,
162  const std::vector<std::vector<uint64_t>>& frag_offsets,
163  RenderAllocatorMap* render_allocator_map,
164  RenderInfo* render_info,
165  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
166  DeviceAllocator* device_allocator,
167  const Executor* executor)
168  : num_rows_(num_rows)
169  , row_set_mem_owner_(row_set_mem_owner)
170  , init_agg_vals_(executor->plan_state_->init_agg_vals_)
171  , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
176  , device_allocator_(device_allocator) {
  CHECK(!sort_on_gpu || output_columnar);

  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
  if (consistent_frag_sizes.empty()) {
    // No fragments in the input, no underlying buffers will be needed.
    return;
  }
  if (!ra_exe_unit.use_bump_allocator) {
    check_total_bitmap_memory(query_mem_desc);
  }
  if (device_type == ExecutorDeviceType::GPU) {
    allocateCountDistinctGpuMem(query_mem_desc);
  }

  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
    allocateCountDistinctBuffers(query_mem_desc, false, executor);
    if (render_info && render_info->useCudaBuffers()) {
      return;
    }
  }

  if (ra_exe_unit.estimator) {
    return;
  }

  const auto thread_count = device_type == ExecutorDeviceType::GPU
                                ? executor->blockSize() * executor->gridSize()
                                : 1;

  size_t group_buffer_size{0};
  if (ra_exe_unit.use_bump_allocator) {
    // For kernel per fragment execution, just allocate a buffer equivalent to the size
    // of the fragment
    if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
      group_buffer_size = num_rows * query_mem_desc.getRowSize();
    } else {
      // otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size
      group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
    }
  } else {
    group_buffer_size =
        query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
  }
  CHECK_GE(group_buffer_size, size_t(0));

  std::unique_ptr<int64_t, CheckedAllocDeleter> group_by_buffer_template;
  if (!query_mem_desc.lazyInitGroups(device_type)) {
    group_by_buffer_template.reset(
        static_cast<int64_t*>(checked_malloc(group_buffer_size)));

    if (output_columnar) {
      initColumnarGroups(
          query_mem_desc, group_by_buffer_template.get(), init_agg_vals_, executor);
    } else {
      auto rows_ptr = group_by_buffer_template.get();
      auto actual_entry_count = query_mem_desc.getEntryCount();
      auto warp_size =
          query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
      if (use_streaming_top_n(ra_exe_unit, query_mem_desc.didOutputColumnar())) {
        const auto node_count_size = thread_count * sizeof(int64_t);
        memset(rows_ptr, 0, node_count_size);
        const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
        const auto rows_offset =
            streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
        memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
        rows_ptr += rows_offset / sizeof(int64_t);
        actual_entry_count = n * thread_count;
        warp_size = 1;
      }
      initGroups(query_mem_desc,
                 rows_ptr,
                 init_agg_vals_,
                 actual_entry_count,
                 warp_size,
                 executor);
    }
  }

  if (query_mem_desc.interleavedBins(device_type)) {
    CHECK(query_mem_desc.hasKeylessHash());
  }

  const auto step = device_type == ExecutorDeviceType::GPU &&
                            query_mem_desc.threadsShareMemory() &&
                            query_mem_desc.isGroupBy()
                        ? executor->blockSize()
                        : size_t(1);
  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
                                       query_mem_desc.hasKeylessHash()
                                   ? query_mem_desc.getEntryCount()
                                   : size_t(0);
  const auto actual_group_buffer_size =
      group_buffer_size + index_buffer_qw * sizeof(int64_t);
  CHECK_GE(actual_group_buffer_size, group_buffer_size);
  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;

  for (size_t i = 0; i < group_buffers_count; i += step) {
    auto group_by_buffer =
        alloc_group_by_buffer(actual_group_buffer_size, render_allocator_map);
    if (!query_mem_desc.lazyInitGroups(device_type)) {
      CHECK(group_by_buffer_template);
      memcpy(group_by_buffer + index_buffer_qw,
             group_by_buffer_template.get(),
             group_buffer_size);
    }
    if (!render_allocator_map) {
      row_set_mem_owner_->addGroupByBuffer(group_by_buffer);
    }
    group_by_buffers_.push_back(group_by_buffer);
    for (size_t j = 1; j < step; ++j) {
      group_by_buffers_.push_back(nullptr);
    }
    const auto column_frag_offsets =
        get_col_frag_offsets(ra_exe_unit.target_exprs, frag_offsets);
    const auto column_frag_sizes =
        get_consistent_frags_sizes(ra_exe_unit.target_exprs, consistent_frag_sizes);
    result_sets_.emplace_back(
        new ResultSet(target_exprs_to_infos(ra_exe_unit.target_exprs, query_mem_desc),
                      executor->getColLazyFetchInfo(ra_exe_unit.target_exprs),
                      col_buffers,
                      column_frag_offsets,
                      column_frag_sizes,
                      device_type,
                      device_id,
                      ResultSet::fixupQueryMemoryDescriptor(query_mem_desc),
                      row_set_mem_owner_,
                      executor));
    result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
                                         executor->plan_state_->init_agg_vals_);
    for (size_t j = 1; j < step; ++j) {
      result_sets_.emplace_back(nullptr);
    }
  }
}

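// Table-function variant: produces a single columnar (projection-style) output
// buffer and does not go through the render allocator or count-distinct paths.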
QueryMemoryInitializer::QueryMemoryInitializer(
    const TableFunctionExecutionUnit& exe_unit,
    const QueryMemoryDescriptor& query_mem_desc,
    const int device_id,
    const ExecutorDeviceType device_type,
    const int64_t num_rows,
    const std::vector<std::vector<const int8_t*>>& col_buffers,
    const std::vector<std::vector<uint64_t>>& frag_offsets,
    std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
    DeviceAllocator* device_allocator,
    const Executor* executor)
    : num_rows_(num_rows)
    , row_set_mem_owner_(row_set_mem_owner)
    , init_agg_vals_(init_agg_val_vec(exe_unit.target_exprs, {}, query_mem_desc))
    , num_buffers_(/*computeNumberOfBuffers(query_mem_desc, device_type, executor)*/ 1)
    , count_distinct_bitmap_mem_(0)
    , count_distinct_bitmap_mem_bytes_(0)
    , count_distinct_bitmap_crt_ptr_(nullptr)
    , count_distinct_bitmap_host_mem_(nullptr)
    , device_allocator_(device_allocator) {
  // Table functions output columnar results, so treat this as a projection

  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
  if (consistent_frag_sizes.empty()) {
    // No fragments in the input, no underlying buffers will be needed.
    return;
  }

  size_t group_buffer_size{0};
  // TODO(adb): this is going to give us an index buffer and then the target buffers.
  // This might not be desirable -- revisit.
  group_buffer_size = query_mem_desc.getBufferSizeBytes(device_type, num_rows_);
  CHECK_GE(group_buffer_size, size_t(0));

  std::unique_ptr<int64_t, CheckedAllocDeleter> group_by_buffer_template;
  if (!query_mem_desc.lazyInitGroups(device_type)) {
    group_by_buffer_template.reset(
        static_cast<int64_t*>(checked_malloc(group_buffer_size)));
    initColumnarGroups(
        query_mem_desc, group_by_buffer_template.get(), init_agg_vals_, executor);
  }

  const auto index_buffer_qw =
      device_type == ExecutorDeviceType::GPU && query_mem_desc.hasKeylessHash()
          ? query_mem_desc.getEntryCount()
          : size_t(0);
  const auto actual_group_buffer_size =
      group_buffer_size + index_buffer_qw * sizeof(int64_t);
  CHECK_GE(actual_group_buffer_size, group_buffer_size);

  CHECK_EQ(num_buffers_, size_t(1));
  auto group_by_buffer = alloc_group_by_buffer(actual_group_buffer_size, nullptr);
  if (!query_mem_desc.lazyInitGroups(device_type)) {
    memcpy(group_by_buffer + index_buffer_qw,
           group_by_buffer_template.get(),
           group_buffer_size);
  }
  group_by_buffers_.push_back(group_by_buffer);
  row_set_mem_owner_->addGroupByBuffer(group_by_buffer);

  const auto column_frag_offsets =
      get_col_frag_offsets(exe_unit.target_exprs, frag_offsets);
  const auto column_frag_sizes =
      get_consistent_frags_sizes(exe_unit.target_exprs, consistent_frag_sizes);
  result_sets_.emplace_back(
      new ResultSet(target_exprs_to_infos(exe_unit.target_exprs, query_mem_desc),
                    {},
                    col_buffers,
                    column_frag_offsets,
                    column_frag_sizes,
                    device_type,
                    device_id,
                    ResultSet::fixupQueryMemoryDescriptor(query_mem_desc),
                    row_set_mem_owner_,
                    executor));
  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
                                       init_agg_vals_);
}

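// Row-wise initialization of a group-by buffer: writes empty keys (unless the hash
// is keyless) and the initial aggregate value for every column of every bin.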
void QueryMemoryInitializer::initGroups(const QueryMemoryDescriptor& query_mem_desc,
                                        int64_t* groups_buffer,
                                        const std::vector<int64_t>& init_vals,
                                        const int32_t groups_buffer_entry_count,
                                        const size_t warp_size,
                                        const Executor* executor) {
  const size_t key_count{query_mem_desc.getGroupbyColCount()};
  const size_t row_size{query_mem_desc.getRowSize()};
  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};

  auto agg_bitmap_size = allocateCountDistinctBuffers(query_mem_desc, true, executor);
  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);

  const auto query_mem_desc_fixedup =
      ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);

  if (query_mem_desc.hasKeylessHash()) {
    CHECK(warp_size >= 1);
    CHECK(key_count == 1);
    for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
      for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
           ++bin, buffer_ptr += row_size) {
        initColumnPerRow(query_mem_desc_fixedup,
                         &buffer_ptr[col_base_off],
                         bin,
                         init_vals,
                         agg_bitmap_size);
      }
    }
    return;
  }

  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
       ++bin, buffer_ptr += row_size) {
    fill_empty_key(buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
    initColumnPerRow(query_mem_desc_fixedup,
                     &buffer_ptr[col_base_off],
                     bin,
                     init_vals,
                     agg_bitmap_size);
  }
}

namespace {

template <typename T>
int8_t* initColumnarBuffer(T* buffer_ptr, const T init_val, const uint32_t entry_count) {
  static_assert(sizeof(T) <= sizeof(int64_t), "Unsupported template type");
  for (uint32_t i = 0; i < entry_count; ++i) {
    buffer_ptr[i] = init_val;
  }
  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
}

}  // namespace

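// Columnar initialization: key columns are filled with EMPTY_KEY_64, then each
// aggregate column gets its initial value, with every column aligned to 8 bytes.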
void QueryMemoryInitializer::initColumnarGroups(
    const QueryMemoryDescriptor& query_mem_desc,
    int64_t* groups_buffer,
    const std::vector<int64_t>& init_vals,
    const Executor* executor) {
  CHECK(groups_buffer);
  for (const auto target_expr : executor->plan_state_->target_exprs_) {
    const auto agg_info = get_target_info(target_expr, g_bigint_count);
    CHECK(!is_distinct_target(agg_info));
  }
  const int32_t agg_col_count = query_mem_desc.getSlotCount();
  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);

  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
  if (!query_mem_desc.hasKeylessHash()) {
    const size_t key_count{query_mem_desc.getGroupbyColCount()};
    for (size_t i = 0; i < key_count; ++i) {
      buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                               EMPTY_KEY_64,
                                               groups_buffer_entry_count);
    }
  }
  // initializing all aggregate columns:
  int32_t init_val_idx = 0;
  for (int32_t i = 0; i < agg_col_count; ++i) {
    if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
      CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
      switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
        case 1:
          buffer_ptr = initColumnarBuffer<int8_t>(
              buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
          break;
        case 2:
          buffer_ptr = initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
                                                   init_vals[init_val_idx++],
                                                   groups_buffer_entry_count);
          break;
        case 4:
          buffer_ptr = initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
                                                   init_vals[init_val_idx++],
                                                   groups_buffer_entry_count);
          break;
        case 8:
          buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                                   init_vals[init_val_idx++],
                                                   groups_buffer_entry_count);
          break;
        case 0:
          break;
        default:
          CHECK(false);
      }

      buffer_ptr = align_to_int64(buffer_ptr);
    }
  }
}

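// Initializes the aggregate slots of a single row; count-distinct slots receive a
// freshly allocated bitmap or std::set pointer instead of a plain init value.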
void QueryMemoryInitializer::initColumnPerRow(const QueryMemoryDescriptor& query_mem_desc,
                                              int8_t* row_ptr,
                                              const size_t bin,
                                              const std::vector<int64_t>& init_vals,
                                              const std::vector<ssize_t>& bitmap_sizes) {
  int8_t* col_ptr = row_ptr;
  size_t init_vec_idx = 0;
  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
       col_ptr += query_mem_desc.getNextColOffInBytes(col_ptr, bin, col_idx++)) {
    const ssize_t bm_sz{bitmap_sizes[col_idx]};
    int64_t init_val{0};
    if (!bm_sz || !query_mem_desc.isGroupBy()) {
      if (query_mem_desc.getPaddedSlotWidthBytes(col_idx) > 0) {
        CHECK_LT(init_vec_idx, init_vals.size());
        init_val = init_vals[init_vec_idx++];
      }
    } else {
      CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
               sizeof(int64_t));
      init_val =
          bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
      ++init_vec_idx;
    }
    switch (query_mem_desc.getPaddedSlotWidthBytes(col_idx)) {
      case 1:
        *col_ptr = static_cast<int8_t>(init_val);
        break;
      case 2:
        *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
        break;
      case 4:
        *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
        break;
      case 8:
        *reinterpret_cast<int64_t*>(col_ptr) = init_val;
        break;
      case 0:
        continue;
      default:
        CHECK(false);
    }
  }
}

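// Reserves a single device allocation that holds the count-distinct bitmaps of all
// entries, plus a host-side staging buffer owned by the RowSetMemoryOwner.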
void QueryMemoryInitializer::allocateCountDistinctGpuMem(
    const QueryMemoryDescriptor& query_mem_desc) {
  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
    return;
  }

  size_t total_bytes_per_entry{0};
  const size_t num_count_distinct_descs =
      query_mem_desc.getCountDistinctDescriptorsSize();
  for (size_t i = 0; i < num_count_distinct_descs; i++) {
    const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
    if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
      continue;
    }
    CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
    total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
  }

  count_distinct_bitmap_mem_bytes_ =
      total_bytes_per_entry * query_mem_desc.getEntryCount();
  count_distinct_bitmap_mem_ = reinterpret_cast<CUdeviceptr>(
      device_allocator_->alloc(count_distinct_bitmap_mem_bytes_));
  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(count_distinct_bitmap_mem_),
                                   count_distinct_bitmap_mem_bytes_);

  count_distinct_bitmap_crt_ptr_ = count_distinct_bitmap_host_mem_ =
      static_cast<int8_t*>(checked_malloc(count_distinct_bitmap_mem_bytes_));
  row_set_mem_owner_->addCountDistinctBuffer(
      count_distinct_bitmap_host_mem_, count_distinct_bitmap_mem_bytes_, true);
}

// deferred is true for group by queries; initGroups will allocate a bitmap
// for each group slot
std::vector<ssize_t> QueryMemoryInitializer::allocateCountDistinctBuffers(
    const QueryMemoryDescriptor& query_mem_desc,
    const bool deferred,
    const Executor* executor) {
  const size_t agg_col_count{query_mem_desc.getSlotCount()};
  std::vector<ssize_t> agg_bitmap_size(deferred ? agg_col_count : 0);

  CHECK_GE(agg_col_count, executor->plan_state_->target_exprs_.size());
  for (size_t target_idx = 0; target_idx < executor->plan_state_->target_exprs_.size();
       ++target_idx) {
    const auto target_expr = executor->plan_state_->target_exprs_[target_idx];
    const auto agg_info = get_target_info(target_expr, g_bigint_count);
    if (is_distinct_target(agg_info)) {
      CHECK(agg_info.is_agg &&
            (agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT));
      CHECK(!agg_info.sql_type.is_varlen());

      const auto agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
      CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);

      CHECK_EQ(static_cast<size_t>(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)),
               sizeof(int64_t));
      const auto& count_distinct_desc =
          query_mem_desc.getCountDistinctDescriptor(target_idx);
      CHECK(count_distinct_desc.impl_type_ != CountDistinctImplType::Invalid);
      if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
        const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
        if (deferred) {
          agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
        } else {
          init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
        }
      } else {
        CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::StdSet);
        if (deferred) {
          agg_bitmap_size[agg_col_idx] = -1;
        } else {
          init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
        }
      }
    }
  }

  return agg_bitmap_size;
}

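// Hands out one bitmap, either carved from the pre-allocated host staging buffer
// (GPU path) or as a dedicated zero-initialized allocation.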
int64_t QueryMemoryInitializer::allocateCountDistinctBitmap(const size_t bitmap_byte_sz) {
  if (count_distinct_bitmap_host_mem_) {
    CHECK(count_distinct_bitmap_crt_ptr_);
    auto ptr = count_distinct_bitmap_crt_ptr_;
    count_distinct_bitmap_crt_ptr_ += bitmap_byte_sz;
    row_set_mem_owner_->addCountDistinctBuffer(ptr, bitmap_byte_sz, false);
    return reinterpret_cast<int64_t>(ptr);
  }
  auto count_distinct_buffer = static_cast<int8_t*>(checked_calloc(bitmap_byte_sz, 1));
  row_set_mem_owner_->addCountDistinctBuffer(count_distinct_buffer, bitmap_byte_sz, true);
  return reinterpret_cast<int64_t>(count_distinct_buffer);
}

int64_t QueryMemoryInitializer::allocateCountDistinctSet() {
  auto count_distinct_set = new std::set<int64_t>();
  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
  return reinterpret_cast<int64_t>(count_distinct_set);
}

#ifdef HAVE_CUDA
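// Lays out the device-side streaming top-n heaps: zeroed node counts, heap slots set
// to -1, and the row section initialized on the device.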
GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer(
    const QueryMemoryDescriptor& query_mem_desc,
    const CUdeviceptr init_agg_vals_dev_ptr,
    const size_t n,
    const int device_id,
    const unsigned block_size_x,
    const unsigned grid_size_x) {
  const auto thread_count = block_size_x * grid_size_x;
  const auto total_buff_size =
      streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
  CUdeviceptr dev_buffer =
      reinterpret_cast<CUdeviceptr>(device_allocator_->alloc(total_buff_size));

  std::vector<CUdeviceptr> dev_buffers(thread_count);

  for (size_t i = 0; i < thread_count; ++i) {
    dev_buffers[i] = dev_buffer;
  }

  auto dev_ptr = device_allocator_->alloc(thread_count * sizeof(CUdeviceptr));
  device_allocator_->copyToDevice(dev_ptr,
                                  reinterpret_cast<int8_t*>(dev_buffers.data()),
                                  thread_count * sizeof(CUdeviceptr));

  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(dev_buffer),
                                   thread_count * sizeof(int64_t));

  device_allocator_->setDeviceMem(
      reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
      (unsigned char)-1,
      thread_count * n * sizeof(int64_t));

  init_group_by_buffer_on_device(
      reinterpret_cast<int64_t*>(
          dev_buffer + streaming_top_n::get_rows_offset_of_heaps(n, thread_count)),
      reinterpret_cast<int64_t*>(init_agg_vals_dev_ptr),
      n * thread_count,
      query_mem_desc.getGroupbyColCount(),
      query_mem_desc.getEffectiveKeyWidth(),
      query_mem_desc.getRowSize() / sizeof(int64_t),
      query_mem_desc.hasKeylessHash(),
      1,
      block_size_x,
      grid_size_x);

  return {reinterpret_cast<CUdeviceptr>(dev_ptr), dev_buffer};
}

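// Creates the device group-by buffers, delegating to the streaming top-n heap setup
// when applicable, and initializes them on the device if groups are lazily
// initialized.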
GpuGroupByBuffers QueryMemoryInitializer::createAndInitializeGroupByBufferGpu(
    const RelAlgExecutionUnit& ra_exe_unit,
    const QueryMemoryDescriptor& query_mem_desc,
    const CUdeviceptr init_agg_vals_dev_ptr,
    const int device_id,
    const ExecutorDispatchMode dispatch_mode,
    const unsigned block_size_x,
    const unsigned grid_size_x,
    const int8_t warp_size,
    const bool can_sort_on_gpu,
    const bool output_columnar,
    RenderAllocator* render_allocator) {
  if (use_streaming_top_n(ra_exe_unit, query_mem_desc.didOutputColumnar())) {
    if (render_allocator) {
      throw StreamingTopNNotSupportedInRenderQuery();
    }
    const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
    CHECK(!output_columnar);

    return prepareTopNHeapsDevBuffer(
        query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
  }

  auto dev_group_by_buffers = create_dev_group_by_buffers(device_allocator_,
                                                          group_by_buffers_,
                                                          query_mem_desc,
                                                          block_size_x,
                                                          grid_size_x,
                                                          device_id,
                                                          dispatch_mode,
                                                          num_rows_,
                                                          can_sort_on_gpu,
                                                          false,
                                                          ra_exe_unit.use_bump_allocator,
                                                          render_allocator);

  if (render_allocator) {
    CHECK_EQ(size_t(0), render_allocator->getAllocatedSize() % 8);
  }
  if (query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU)) {
    CHECK(!render_allocator);

    const size_t step{query_mem_desc.threadsShareMemory() ? block_size_x : 1};
    size_t groups_buffer_size{query_mem_desc.getBufferSizeBytes(
        ExecutorDeviceType::GPU, dev_group_by_buffers.entry_count)};
    auto group_by_dev_buffer = dev_group_by_buffers.second;
    const size_t col_count = query_mem_desc.getSlotCount();
    int8_t* col_widths_dev_ptr{nullptr};
    if (output_columnar) {
      std::vector<int8_t> compact_col_widths(col_count);
      for (size_t idx = 0; idx < col_count; ++idx) {
        compact_col_widths[idx] = query_mem_desc.getPaddedSlotWidthBytes(idx);
      }
      col_widths_dev_ptr = device_allocator_->alloc(col_count * sizeof(int8_t));
      device_allocator_->copyToDevice(
          col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
    }
    const int8_t warp_count =
        query_mem_desc.interleavedBins(ExecutorDeviceType::GPU) ? warp_size : 1;
    for (size_t i = 0; i < getGroupByBuffersSize(); i += step) {
      if (output_columnar) {
        init_columnar_group_by_buffer_on_device(
            reinterpret_cast<int64_t*>(group_by_dev_buffer),
            reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
            dev_group_by_buffers.entry_count,
            query_mem_desc.getGroupbyColCount(),
            col_count,
            col_widths_dev_ptr,
            /*need_padding = */ true,
            query_mem_desc.hasKeylessHash(),
            sizeof(int64_t),
            block_size_x,
            grid_size_x);
      } else {
        init_group_by_buffer_on_device(reinterpret_cast<int64_t*>(group_by_dev_buffer),
                                       reinterpret_cast<int64_t*>(init_agg_vals_dev_ptr),
                                       dev_group_by_buffers.entry_count,
                                       query_mem_desc.getGroupbyColCount(),
                                       query_mem_desc.getEffectiveKeyWidth(),
                                       query_mem_desc.getRowSize() / sizeof(int64_t),
                                       query_mem_desc.hasKeylessHash(),
                                       warp_count,
                                       block_size_x,
                                       grid_size_x);
      }
      group_by_dev_buffer += groups_buffer_size;
    }
  }
  return dev_group_by_buffers;
}

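// Device output buffers for table functions; no sorting, host-side initialization,
// or bump allocator involved.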
GpuGroupByBuffers QueryMemoryInitializer::setupTableFunctionGpuBuffers(
    const QueryMemoryDescriptor& query_mem_desc,
    const int device_id,
    const unsigned block_size_x,
    const unsigned grid_size_x) {
  return create_dev_group_by_buffers(device_allocator_,
                                     group_by_buffers_,
                                     query_mem_desc,
                                     block_size_x,
                                     grid_size_x,
                                     device_id,
                                     ExecutorDispatchMode::KernelPerFragment,
                                     num_rows_,
                                     false,
                                     false,
                                     false,
                                     nullptr);
}

#endif

size_t QueryMemoryInitializer::computeNumberOfBuffers(
    const QueryMemoryDescriptor& query_mem_desc,
    const ExecutorDeviceType device_type,
    const Executor* executor) const {
  return device_type == ExecutorDeviceType::CPU
             ? 1
             : executor->blockSize() *
                   (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
}

namespace {

// in-place compaction of output buffer
void compact_projection_buffer_for_cpu_columnar(
    const QueryMemoryDescriptor& query_mem_desc,
    int8_t* projection_buffer,
    const size_t projection_count) {
  // the first column (row indices) remains unchanged.
  CHECK(projection_count <= query_mem_desc.getEntryCount());
  constexpr size_t row_index_width = sizeof(int64_t);
  size_t buffer_offset1{projection_count * row_index_width};
  // other columns are actual non-lazy columns for the projection:
  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
    if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
      auto column_proj_size =
          projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
      auto buffer_offset2 = query_mem_desc.getColOffInBytes(i);
      if (buffer_offset1 + column_proj_size >= buffer_offset2) {
        // overlapping
        std::memmove(projection_buffer + buffer_offset1,
                     projection_buffer + buffer_offset2,
                     column_proj_size);
      } else {
        std::memcpy(projection_buffer + buffer_offset1,
                    projection_buffer + buffer_offset2,
                    column_proj_size);
      }
      buffer_offset1 += align_to_int64(column_proj_size);
    }
  }
}

}  // namespace

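// Shrinks the columnar projection output to the actual row count and propagates the
// new entry count to the ResultSet.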
void QueryMemoryInitializer::compactProjectionBuffersCpu(
    const QueryMemoryDescriptor& query_mem_desc,
    const size_t projection_count) {
  const auto num_allocated_rows =
      std::min(projection_count, query_mem_desc.getEntryCount());

  // copy the results from the main buffer into projection_buffer
  compact_projection_buffer_for_cpu_columnar(
      query_mem_desc,
      reinterpret_cast<int8_t*>(group_by_buffers_[0]),
      num_allocated_rows);

  // update the entry count for the result set, and its underlying storage
  CHECK(!result_sets_.empty());
  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
}

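// GPU counterpart: copies the compacted projection columns back from the device
// before updating the ResultSet entry count.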
void QueryMemoryInitializer::compactProjectionBuffersGpu(
    const QueryMemoryDescriptor& query_mem_desc,
    Data_Namespace::DataMgr* data_mgr,
    const GpuGroupByBuffers& gpu_group_by_buffers,
    const size_t projection_count,
    const int device_id) {
  // store total number of allocated rows:
  const auto num_allocated_rows =
      std::min(projection_count, query_mem_desc.getEntryCount());

  // copy the results from the main buffer into projection_buffer
  copy_projection_buffer_from_gpu_columnar(
      data_mgr,
      gpu_group_by_buffers,
      query_mem_desc,
      reinterpret_cast<int8_t*>(group_by_buffers_[0]),
      num_allocated_rows,
      device_id);

  // update the entry count for the result set, and its underlying storage
  CHECK(!result_sets_.empty());
  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
}

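// Copies the group-by buffers back from the device; for streaming top-n queries the
// transfer size is the heap size rather than the group-by buffer size.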
void QueryMemoryInitializer::copyGroupByBuffersFromGpu(
    Data_Namespace::DataMgr* data_mgr,
    const QueryMemoryDescriptor& query_mem_desc,
    const size_t entry_count,
    const GpuGroupByBuffers& gpu_group_by_buffers,
    const RelAlgExecutionUnit* ra_exe_unit,
    const unsigned block_size_x,
    const unsigned grid_size_x,
    const int device_id,
    const bool prepend_index_buffer) const {
  const auto thread_count = block_size_x * grid_size_x;

  size_t total_buff_size{0};
  if (ra_exe_unit &&
      use_streaming_top_n(*ra_exe_unit, query_mem_desc.didOutputColumnar())) {
    const size_t n = ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit;
    total_buff_size =
        streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
  } else {
    total_buff_size =
        query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
  }
  copy_group_by_buffers_from_gpu(data_mgr,
                                 group_by_buffers_,
                                 total_buff_size,
                                 gpu_group_by_buffers.second,
                                 query_mem_desc,
                                 block_size_x,
                                 grid_size_x,
                                 device_id,
                                 prepend_index_buffer);
}

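// Converts the CPU streaming top-n heaps into plain result rows in place.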
void QueryMemoryInitializer::applyStreamingTopNOffsetCpu(
    const QueryMemoryDescriptor& query_mem_desc,
    const RelAlgExecutionUnit& ra_exe_unit) {
  CHECK_EQ(group_by_buffers_.size(), size_t(1));

  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
      group_by_buffers_[0],
      query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
      ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit,
      1);
  CHECK_EQ(rows_copy.size(),
           query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
}

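// Same conversion for device-side heaps; only available in CUDA builds.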
void QueryMemoryInitializer::applyStreamingTopNOffsetGpu(
    Data_Namespace::DataMgr* data_mgr,
    const QueryMemoryDescriptor& query_mem_desc,
    const GpuGroupByBuffers& gpu_group_by_buffers,
    const RelAlgExecutionUnit& ra_exe_unit,
    const unsigned total_thread_count,
    const int device_id) {
#ifdef HAVE_CUDA

  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
      data_mgr,
      reinterpret_cast<int64_t*>(gpu_group_by_buffers.second),
      ra_exe_unit,
      query_mem_desc,
      total_thread_count,
      device_id);
  CHECK_EQ(
      rows_copy.size(),
      static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
#else
  UNREACHABLE();
#endif
}