OmniSciDB  72c90bc290
QueryMemoryInitializer.cpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "QueryMemoryInitializer.h"
18 #include "Execute.h"
19 #include "GpuInitGroups.h"
20 #include "Logger/Logger.h"
23 #include "Shared/checked_alloc.h"
24 #include "StreamingTopN.h"
25 #include "Utils/FlatBuffer.h"
26 
27 // 8 GB, the limit of perfect hash group by under normal conditions
28 int64_t g_bitmap_memory_limit{8LL * 1000 * 1000 * 1000};
29 
30 namespace {
31 
32 void check_total_bitmap_memory(const QueryMemoryDescriptor& query_mem_desc) {
33  const size_t groups_buffer_entry_count = query_mem_desc.getEntryCount();
34  checked_int64_t total_bytes_per_group = 0;
35  const size_t num_count_distinct_descs =
36  query_mem_desc.getCountDistinctDescriptorsSize();
37  for (size_t i = 0; i < num_count_distinct_descs; i++) {
38  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
39  if (count_distinct_desc.impl_type_ != CountDistinctImplType::Bitmap) {
40  continue;
41  }
42  total_bytes_per_group += count_distinct_desc.bitmapPaddedSizeBytes();
43  }
44  int64_t total_bytes{0};
45  // Using OutOfHostMemory until we can verify that SlabTooBig would also be properly
46  // caught
47  try {
48  total_bytes = static_cast<int64_t>(total_bytes_per_group * groups_buffer_entry_count);
49  } catch (...) {
50  // Absurd amount of memory, merely computing the number of bits overflows int64_t.
51  // Don't bother to report the real amount, this is unlikely to ever happen.
52  throw OutOfHostMemory(std::numeric_limits<int64_t>::max() / 8);
53  }
54  if (total_bytes >= g_bitmap_memory_limit) {
55  throw OutOfHostMemory(total_bytes);
56  }
57 }
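// A minimal standalone sketch of the check above: the per-group cost is the sum of the
// padded bitmap sizes of all COUNT DISTINCT bitmap descriptors, and the total is that
// cost times the group-by entry count, which must stay below g_bitmap_memory_limit.
// The helper name and inputs are hypothetical stand-ins (assumes <cstdint>, <cstddef>
// and <vector> from the standard library).
inline bool example_bitmap_memory_fits(const std::vector<int64_t>& bitmap_padded_bytes,
                                       const size_t entry_count,
                                       const int64_t limit = g_bitmap_memory_limit) {
  int64_t bytes_per_group = 0;
  for (const int64_t sz : bitmap_padded_bytes) {
    bytes_per_group += sz;  // one bitmap per COUNT DISTINCT target slot
  }
  // the real code throws OutOfHostMemory instead of returning false
  return bytes_per_group * static_cast<int64_t>(entry_count) < limit;
}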
58 
59 std::pair<int64_t*, bool> alloc_group_by_buffer(
60  const size_t numBytes,
61  RenderAllocatorMap* render_allocator_map,
62  const size_t thread_idx,
63  RowSetMemoryOwner* mem_owner,
64  const bool reuse_existing_buffer_for_thread) {
65  if (render_allocator_map) {
66  // NOTE(adb): If we got here, we are performing an in-situ rendering query and are not
67  // using CUDA buffers. Therefore we need to allocate result set storage using CPU
68  // memory.
69  const auto gpu_idx = 0; // Only 1 GPU supported in CUDA-disabled rendering mode
70  auto render_allocator_ptr = render_allocator_map->getRenderAllocator(gpu_idx);
71  return std::make_pair(
72  reinterpret_cast<int64_t*>(render_allocator_ptr->alloc(numBytes)), false);
73  } else if (reuse_existing_buffer_for_thread) {
74  return mem_owner->allocateCachedGroupByBuffer(numBytes, thread_idx);
75  }
76  return std::make_pair(
77  reinterpret_cast<int64_t*>(mem_owner->allocate(numBytes, thread_idx)), false);
78 }
79 
80 inline int64_t get_consistent_frag_size(const std::vector<uint64_t>& frag_offsets) {
81  if (frag_offsets.size() < 2) {
82  return int64_t(-1);
83  }
84  const auto frag_size = frag_offsets[1] - frag_offsets[0];
85  for (size_t i = 2; i < frag_offsets.size(); ++i) {
86  const auto curr_size = frag_offsets[i] - frag_offsets[i - 1];
87  if (curr_size != frag_size) {
88  return int64_t(-1);
89  }
90  }
91  return !frag_size ? std::numeric_limits<int64_t>::max()
92  : static_cast<int64_t>(frag_size);
93 }
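// A small self-check of get_consistent_frag_size above, as a sketch with hypothetical
// example data (assumes <cassert> and <vector> from the standard library): a uniform
// stride is reported as-is, mixed strides and fewer than two offsets yield -1.
inline void example_consistent_frag_size() {
  const std::vector<uint64_t> uniform{0, 100, 200, 300};  // stride 100 throughout
  const std::vector<uint64_t> mixed{0, 100, 250};         // strides 100 and 150
  assert(get_consistent_frag_size(uniform) == int64_t(100));
  assert(get_consistent_frag_size(mixed) == int64_t(-1));
  assert(get_consistent_frag_size({}) == int64_t(-1));    // not enough offsets
}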
94 
95 inline std::vector<int64_t> get_consistent_frags_sizes(
96  const std::vector<std::vector<uint64_t>>& frag_offsets) {
97  if (frag_offsets.empty()) {
98  return {};
99  }
100  std::vector<int64_t> frag_sizes;
101  for (size_t tab_idx = 0; tab_idx < frag_offsets[0].size(); ++tab_idx) {
102  std::vector<uint64_t> tab_offs;
103  for (auto& offsets : frag_offsets) {
104  tab_offs.push_back(offsets[tab_idx]);
105  }
106  frag_sizes.push_back(get_consistent_frag_size(tab_offs));
107  }
108  return frag_sizes;
109 }
110 
111 inline std::vector<int64_t> get_consistent_frags_sizes(
112  const std::vector<Analyzer::Expr*>& target_exprs,
113  const std::vector<int64_t>& table_frag_sizes) {
114  std::vector<int64_t> col_frag_sizes;
115  for (auto expr : target_exprs) {
116  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
117  if (col_var->get_rte_idx() < 0) {
118  CHECK_EQ(-1, col_var->get_rte_idx());
119  col_frag_sizes.push_back(int64_t(-1));
120  } else {
121  col_frag_sizes.push_back(table_frag_sizes[col_var->get_rte_idx()]);
122  }
123  } else {
124  col_frag_sizes.push_back(int64_t(-1));
125  }
126  }
127  return col_frag_sizes;
128 }
129 
130 inline std::vector<std::vector<int64_t>> get_col_frag_offsets(
131  const std::vector<Analyzer::Expr*>& target_exprs,
132  const std::vector<std::vector<uint64_t>>& table_frag_offsets) {
133  std::vector<std::vector<int64_t>> col_frag_offsets;
134  for (auto& table_offsets : table_frag_offsets) {
135  std::vector<int64_t> col_offsets;
136  for (auto expr : target_exprs) {
137  if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
138  if (col_var->get_rte_idx() < 0) {
139  CHECK_EQ(-1, col_var->get_rte_idx());
140  col_offsets.push_back(int64_t(-1));
141  } else {
142  CHECK_LT(static_cast<size_t>(col_var->get_rte_idx()), table_offsets.size());
143  col_offsets.push_back(
144  static_cast<int64_t>(table_offsets[col_var->get_rte_idx()]));
145  }
146  } else {
147  col_offsets.push_back(int64_t(-1));
148  }
149  }
150  col_frag_offsets.push_back(col_offsets);
151  }
152  return col_frag_offsets;
153 }
154 
155 // Return the RelAlg input index of outer_table_id based on ra_exe_unit.input_descs.
156 // Used by UNION queries to get the target_exprs corresponding to the current subquery.
157 int get_input_idx(RelAlgExecutionUnit const& ra_exe_unit,
158  const shared::TableKey& outer_table_key) {
159  auto match_table_key = [=](auto& desc) {
160  return outer_table_key == desc.getTableKey();
161  };
162  auto& input_descs = ra_exe_unit.input_descs;
163  auto itr = std::find_if(input_descs.begin(), input_descs.end(), match_table_key);
164  return itr == input_descs.end() ? 0 : itr->getNestLevel();
165 }
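// The lookup above is a plain find_if over the input descriptors: the descriptor whose
// table key matches the outer table contributes its nest level, and a miss falls back
// to 0. For a UNION query over two inputs this decides whether target_exprs_union or
// target_exprs is used in the constructor below. The same pattern with hypothetical
// stand-in types (assumes <algorithm> and <vector> from the standard library):
struct ExampleInputDesc {
  int table_id;    // stand-in for shared::TableKey
  int nest_level;  // stand-in for InputDescriptor::getNestLevel()
};

inline int example_get_input_idx(const std::vector<ExampleInputDesc>& input_descs,
                                 const int outer_table_id) {
  auto match = [=](const ExampleInputDesc& desc) {
    return desc.table_id == outer_table_id;
  };
  auto itr = std::find_if(input_descs.begin(), input_descs.end(), match);
  return itr == input_descs.end() ? 0 : itr->nest_level;
}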
166 
167 void check_count_distinct_expr_metadata(const QueryMemoryDescriptor& query_mem_desc,
168  const RelAlgExecutionUnit& ra_exe_unit) {
169  const size_t agg_col_count{query_mem_desc.getSlotCount()};
170  CHECK_GE(agg_col_count, ra_exe_unit.target_exprs.size());
171  for (size_t target_idx = 0; target_idx < ra_exe_unit.target_exprs.size();
172  ++target_idx) {
173  const auto target_expr = ra_exe_unit.target_exprs[target_idx];
174  const auto agg_info = get_target_info(target_expr, g_bigint_count);
175  if (is_distinct_target(agg_info)) {
176  CHECK(agg_info.is_agg &&
177  (agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kCOUNT_IF ||
178  agg_info.agg_kind == kAPPROX_COUNT_DISTINCT));
179  CHECK(!agg_info.sql_type.is_varlen());
180  const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
181  CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);
182  CHECK_EQ(static_cast<size_t>(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)),
183  sizeof(int64_t));
184  const auto& count_distinct_desc =
185  query_mem_desc.getCountDistinctDescriptor(target_idx);
186  CHECK(count_distinct_desc.impl_type_ != CountDistinctImplType::Invalid);
187  }
188  }
189 }
190 
191 QueryMemoryInitializer::TargetAggOpsMetadata collect_target_expr_metadata(
192  const QueryMemoryDescriptor& query_mem_desc,
193  const RelAlgExecutionUnit& ra_exe_unit) {
194  QueryMemoryInitializer::TargetAggOpsMetadata agg_op_metadata;
195  if (!query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
196  agg_op_metadata.has_count_distinct = true;
197  }
198  std::for_each(
199  ra_exe_unit.target_exprs.begin(),
200  ra_exe_unit.target_exprs.end(),
201  [&agg_op_metadata](const Analyzer::Expr* expr) {
202  if (auto const* agg_expr = dynamic_cast<Analyzer::AggExpr const*>(expr)) {
203  if (agg_expr->get_aggtype() == kMODE) {
204  agg_op_metadata.has_mode = true;
205  } else if (agg_expr->get_aggtype() == kAPPROX_QUANTILE) {
206  agg_op_metadata.has_tdigest = true;
207  }
208  }
209  });
210  return agg_op_metadata;
211 }
212 
213 } // namespace
214 
215 // Row-based execution constructor
216 QueryMemoryInitializer::QueryMemoryInitializer(
217  const RelAlgExecutionUnit& ra_exe_unit,
218  const QueryMemoryDescriptor& query_mem_desc,
219  const int device_id,
220  const ExecutorDeviceType device_type,
221  const ExecutorDispatchMode dispatch_mode,
222  const bool output_columnar,
223  const bool sort_on_gpu,
224  const shared::TableKey& outer_table_key,
225  const int64_t num_rows,
226  const std::vector<std::vector<const int8_t*>>& col_buffers,
227  const std::vector<std::vector<uint64_t>>& frag_offsets,
228  RenderAllocatorMap* render_allocator_map,
229  RenderInfo* render_info,
230  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
231  DeviceAllocator* device_allocator,
232  const size_t thread_idx,
233  const Executor* executor)
234  : num_rows_(num_rows)
235  , row_set_mem_owner_(row_set_mem_owner)
236  , init_agg_vals_(executor->plan_state_->init_agg_vals_)
237  , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
244  , device_allocator_(device_allocator)
245  , thread_idx_(thread_idx) {
246  CHECK(!sort_on_gpu || output_columnar);
247  executor->logSystemCPUMemoryStatus("Before Query Memory Initialization", thread_idx);
248 
249  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
250  if (consistent_frag_sizes.empty()) {
251  // No fragments in the input, no underlying buffers will be needed.
252  return;
253  }
254 
255  TargetAggOpsMetadata agg_op_metadata =
256  collect_target_expr_metadata(query_mem_desc, ra_exe_unit);
257  if (agg_op_metadata.has_count_distinct) {
258  check_count_distinct_expr_metadata(query_mem_desc, ra_exe_unit);
259  if (!ra_exe_unit.use_bump_allocator) {
260  check_total_bitmap_memory(query_mem_desc);
261  }
262  if (device_type == ExecutorDeviceType::GPU) {
263  allocateCountDistinctGpuMem(query_mem_desc);
264  }
265  }
266 
267  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
268  if (agg_op_metadata.has_count_distinct) {
269  allocateCountDistinctBuffers(query_mem_desc, ra_exe_unit);
270  }
271  if (agg_op_metadata.has_mode) {
272  allocateModeBuffer(query_mem_desc, ra_exe_unit);
273  }
274  if (agg_op_metadata.has_tdigest) {
275  allocateTDigestsBuffer(query_mem_desc, ra_exe_unit);
276  }
277  if (render_info && render_info->useCudaBuffers()) {
278  return;
279  }
280  }
281 
282  if (query_mem_desc.isGroupBy()) {
283  if (agg_op_metadata.has_count_distinct) {
284  agg_op_metadata.count_distinct_buf_size =
285  calculateCountDistinctBufferSize(query_mem_desc, ra_exe_unit);
286  }
287  if (agg_op_metadata.has_mode) {
288  agg_op_metadata.mode_index_set =
289  initializeModeIndexSet(query_mem_desc, ra_exe_unit);
290  }
291  if (agg_op_metadata.has_tdigest) {
292  agg_op_metadata.qualtile_params =
293  initializeQuantileParams(query_mem_desc, ra_exe_unit);
294  }
295  }
296 
297  if (ra_exe_unit.estimator) {
298  return;
299  }
300 
301  const auto thread_count = device_type == ExecutorDeviceType::GPU
302  ? executor->blockSize() * executor->gridSize()
303  : 1;
304 
305  size_t group_buffer_size{0};
306  if (ra_exe_unit.use_bump_allocator) {
307  // For kernel per fragment execution, just allocate a buffer equivalent to the size of
308  // the fragment
309  if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
310  group_buffer_size = num_rows * query_mem_desc.getRowSize();
311  } else {
312  // otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size
313  group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
314  }
315  } else {
316  group_buffer_size =
317  query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
318  }
319  CHECK_GE(group_buffer_size, size_t(0));
320 
321  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
322  int64_t* group_by_buffer_template{nullptr};
323  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
324  group_by_buffer_template = reinterpret_cast<int64_t*>(
325  row_set_mem_owner_->allocate(group_buffer_size, thread_idx_));
326  initGroupByBuffer(group_by_buffer_template,
327  ra_exe_unit,
328  query_mem_desc,
329  agg_op_metadata,
330  device_type,
331  output_columnar,
332  executor);
333  }
334 
335  if (query_mem_desc.interleavedBins(device_type)) {
336  CHECK(query_mem_desc.hasKeylessHash());
337  }
338 
339  const auto step = device_type == ExecutorDeviceType::GPU &&
340  query_mem_desc.threadsShareMemory() &&
341  query_mem_desc.isGroupBy()
342  ? executor->blockSize()
343  : size_t(1);
344  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
345  query_mem_desc.hasKeylessHash()
346  ? query_mem_desc.getEntryCount()
347  : size_t(0);
348  const auto actual_group_buffer_size =
349  group_buffer_size + index_buffer_qw * sizeof(int64_t);
350  CHECK_GE(actual_group_buffer_size, group_buffer_size);
351 
352  if (query_mem_desc.hasVarlenOutput()) {
353  const auto varlen_buffer_elem_size_opt = query_mem_desc.varlenOutputBufferElemSize();
354  CHECK(varlen_buffer_elem_size_opt); // TODO(adb): relax
355  auto const varlen_buffer_sz =
356  query_mem_desc.getEntryCount() * varlen_buffer_elem_size_opt.value();
357  auto varlen_output_buffer =
358  reinterpret_cast<int64_t*>(row_set_mem_owner_->allocate(varlen_buffer_sz));
359  num_buffers_ += 1;
360  group_by_buffers_.push_back(varlen_output_buffer);
361  }
362 
363  if (query_mem_desc.threadsCanReuseGroupByBuffers()) {
364  // Sanity checks, intra-thread buffer reuse should only
365  // occur on CPU for group-by queries, which also means
366  // that only one group-by buffer should be allocated
367  // (multiple-buffer allocation only occurs for GPU)
368  CHECK(device_type == ExecutorDeviceType::CPU);
369  CHECK(query_mem_desc.isGroupBy());
370  CHECK_EQ(group_buffers_count, size_t(1));
371  }
372 
373  // Group-by buffer reuse assumes 1 group-by-buffer per query step
374  // Multiple group-by-buffers should only be used on GPU,
375  // whereas buffer reuse is only done on CPU
376  CHECK(group_buffers_count <= 1 || !query_mem_desc.threadsCanReuseGroupByBuffers());
377  for (size_t i = 0; i < group_buffers_count; i += step) {
378  auto group_by_info =
379  alloc_group_by_buffer(actual_group_buffer_size,
380  render_allocator_map,
381  thread_idx_,
382  row_set_mem_owner_.get(),
383  query_mem_desc.threadsCanReuseGroupByBuffers());
384 
385  auto group_by_buffer = group_by_info.first;
386  const bool was_cached = group_by_info.second;
387  if (!was_cached) {
388  if (!query_mem_desc.lazyInitGroups(device_type)) {
389  if (group_by_buffer_template) {
390  memcpy(group_by_buffer + index_buffer_qw,
391  group_by_buffer_template,
392  group_buffer_size);
393  } else {
394  initGroupByBuffer(group_by_buffer + index_buffer_qw,
395  ra_exe_unit,
396  query_mem_desc,
397  agg_op_metadata,
398  device_type,
399  output_columnar,
400  executor);
401  }
402  }
403  }
404 
405  size_t old_size = group_by_buffers_.size();
406  group_by_buffers_.resize(old_size + std::max(size_t(1), step), nullptr);
407  group_by_buffers_[old_size] = group_by_buffer;
408 
409  const bool use_target_exprs_union =
410  ra_exe_unit.union_all && get_input_idx(ra_exe_unit, outer_table_key);
411  const auto& target_exprs = use_target_exprs_union ? ra_exe_unit.target_exprs_union
412  : ra_exe_unit.target_exprs;
413  const auto column_frag_offsets = get_col_frag_offsets(target_exprs, frag_offsets);
414  const auto column_frag_sizes =
415  get_consistent_frags_sizes(target_exprs, consistent_frag_sizes);
416 
417  old_size = result_sets_.size();
418  result_sets_.resize(old_size + std::max(size_t(1), step));
419  result_sets_[old_size] =
420  std::make_unique<ResultSet>(target_exprs_to_infos(target_exprs, query_mem_desc),
421  executor->getColLazyFetchInfo(target_exprs),
422  col_buffers,
423  column_frag_offsets,
424  column_frag_sizes,
425  device_type,
426  device_id,
427  thread_idx,
430  executor->blockSize(),
431  executor->gridSize());
432  result_sets_[old_size]->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
433  executor->plan_state_->init_agg_vals_,
435  }
436 }
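// A sketch of the buffer-size bookkeeping used by the constructor above, reduced to the
// bare arithmetic with hypothetical parameter names (assumes <cstddef> and <cstdint>):
// when sorting keyless-hash results on GPU an index buffer of one int64_t per entry is
// prepended, so the allocation is the group buffer plus that prefix. On GPU with
// threadsShareMemory() the allocation loop above also advances in steps of blockSize(),
// so one buffer serves a whole block.
inline size_t example_actual_group_buffer_bytes(const size_t group_buffer_bytes,
                                                const bool gpu_sort_keyless_hash,
                                                const size_t entry_count) {
  const size_t index_buffer_qw = gpu_sort_keyless_hash ? entry_count : size_t(0);
  return group_buffer_bytes + index_buffer_qw * sizeof(int64_t);
}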
437 
438 // Table functions execution constructor
439 QueryMemoryInitializer::QueryMemoryInitializer(
440  const TableFunctionExecutionUnit& exe_unit,
441  const QueryMemoryDescriptor& query_mem_desc,
442  const int device_id,
443  const ExecutorDeviceType device_type,
444  const int64_t num_rows,
445  const std::vector<std::vector<const int8_t*>>& col_buffers,
446  const std::vector<std::vector<uint64_t>>& frag_offsets,
447  std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
448  DeviceAllocator* device_allocator,
449  const Executor* executor)
450  : num_rows_(num_rows)
451  , row_set_mem_owner_(row_set_mem_owner)
452  , init_agg_vals_(init_agg_val_vec(exe_unit.target_exprs, {}, query_mem_desc))
453  , num_buffers_(1)
460  , device_allocator_(device_allocator)
461  , thread_idx_(0) {
462  // Table functions output columnar, basically treat this as a projection
463  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
464  if (consistent_frag_sizes.empty()) {
465  // No fragments in the input, no underlying buffers will be needed.
466  return;
467  }
468 
469  const size_t num_columns =
470  query_mem_desc.getBufferColSlotCount(); // shouldn't we use getColCount() ???
471  size_t total_group_by_buffer_size{0};
472  for (size_t i = 0; i < num_columns; ++i) {
473  auto ti = exe_unit.target_exprs[i]->get_type_info();
474  if (ti.usesFlatBuffer()) {
475  // See TableFunctionManager.h for info regarding flatbuffer
476  // memory management.
477  auto slot_idx = query_mem_desc.getSlotIndexForSingleSlotCol(i);
478  CHECK(query_mem_desc.checkSlotUsesFlatBufferFormat(slot_idx));
479  checked_int64_t flatbuffer_size = query_mem_desc.getFlatBufferSize(slot_idx);
480  try {
481  total_group_by_buffer_size = align_to_int64(
482  static_cast<int64_t>(total_group_by_buffer_size + flatbuffer_size));
483  } catch (...) {
484  throw OutOfHostMemory(std::numeric_limits<int64_t>::max() / 8);
485  }
486  } else {
487  const checked_int64_t col_width = ti.get_size();
488  try {
489  const checked_int64_t group_buffer_size = col_width * num_rows_;
490  total_group_by_buffer_size = align_to_int64(
491  static_cast<int64_t>(group_buffer_size + total_group_by_buffer_size));
492  } catch (...) {
493  throw OutOfHostMemory(std::numeric_limits<int64_t>::max() / 8);
494  }
495  }
496  }
497 
498 #ifdef __SANITIZE_ADDRESS__
499  // AddressSanitizer will reject allocation sizes above 1 TiB
500 #define MAX_BUFFER_SIZE 0x10000000000ll
501 #else
502  // otherwise, we'll set the limit to 16 TiB, feel free to increase
503  // the limit if needed
504 #define MAX_BUFFER_SIZE 0x100000000000ll
505 #endif
506 
507  if (total_group_by_buffer_size >= MAX_BUFFER_SIZE) {
508  throw OutOfHostMemory(total_group_by_buffer_size);
509  }
510 
511  CHECK_EQ(num_buffers_, size_t(1));
512  auto group_by_buffer = alloc_group_by_buffer(total_group_by_buffer_size,
513  nullptr,
514  thread_idx_,
515  row_set_mem_owner.get(),
516  false)
517  .first;
518  group_by_buffers_.push_back(group_by_buffer);
519 
520  const auto column_frag_offsets =
521  get_col_frag_offsets(exe_unit.target_exprs, frag_offsets);
522  const auto column_frag_sizes =
523  get_consistent_frags_sizes(exe_unit.target_exprs, consistent_frag_sizes);
524  result_sets_.emplace_back(
525  new ResultSet(target_exprs_to_infos(exe_unit.target_exprs, query_mem_desc),
526  /*col_lazy_fetch_info=*/{},
527  col_buffers,
528  column_frag_offsets,
529  column_frag_sizes,
530  device_type,
531  device_id,
532  -1, /*thread_idx*/
534  row_set_mem_owner_,
535  executor->blockSize(),
536  executor->gridSize()));
537  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
538  init_agg_vals_);
539 }
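// A sketch of the sizing loop in the table-function constructor above: output columns
// are laid out back to back, each column's end rounded up to an 8-byte boundary, and a
// FlatBuffer-backed column would contribute its flatbuffer size instead of
// width * num_rows. Hypothetical helper and inputs (assumes <cstddef> and <vector>):
inline size_t example_table_function_output_bytes(const std::vector<size_t>& col_widths,
                                                  const size_t num_rows) {
  const auto align8 = [](const size_t n) { return (n + 7) & ~size_t(7); };
  size_t total = 0;
  for (const size_t width : col_widths) {
    total = align8(total + width * num_rows);  // next column starts at the new total
  }
  return total;
}
// e.g. widths {8, 4, 2} with 10 rows: 80, then 120, then 144 bytes in total
// (the 20-byte column is padded to 24).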
540 
541 void QueryMemoryInitializer::initGroupByBuffer(
542  int64_t* buffer,
543  const RelAlgExecutionUnit& ra_exe_unit,
544  const QueryMemoryDescriptor& query_mem_desc,
545  TargetAggOpsMetadata& agg_op_metadata,
546  const ExecutorDeviceType device_type,
547  const bool output_columnar,
548  const Executor* executor) {
549  if (output_columnar) {
550  initColumnarGroups(query_mem_desc, buffer, init_agg_vals_, executor, ra_exe_unit);
551  } else {
552  auto rows_ptr = buffer;
553  auto actual_entry_count = query_mem_desc.getEntryCount();
554  const auto thread_count = device_type == ExecutorDeviceType::GPU
555  ? executor->blockSize() * executor->gridSize()
556  : 1;
557  auto warp_size =
558  query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
559  if (query_mem_desc.useStreamingTopN()) {
560  const auto node_count_size = thread_count * sizeof(int64_t);
561  memset(rows_ptr, 0, node_count_size);
562  const auto n =
563  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit.value_or(0);
564  const auto rows_offset = streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
565  memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
566  rows_ptr += rows_offset / sizeof(int64_t);
567  actual_entry_count = n * thread_count;
568  warp_size = 1;
569  }
570  initRowGroups(query_mem_desc,
571  rows_ptr,
572  init_agg_vals_,
573  agg_op_metadata,
574  actual_entry_count,
575  warp_size,
576  executor,
577  ra_exe_unit);
578  }
579 }
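// A sketch of the streaming top-N branch above: the front of the buffer holds one
// int64_t node counter per thread (zeroed), the heap index area up to rows_offset is
// filled with -1, and only after that do the n * thread_count row slots begin.
// rows_offset is taken as a given here, as the real code obtains it from
// streaming_top_n::get_rows_offset_of_heaps (assumes <cstring>, <cstdint>, <cstddef>).
inline int64_t* example_prepare_top_n_heaps(int64_t* buffer,
                                            const size_t thread_count,
                                            const size_t n,
                                            const size_t rows_offset,
                                            size_t& entry_count_out) {
  const size_t node_count_bytes = thread_count * sizeof(int64_t);
  memset(buffer, 0, node_count_bytes);                                // heap sizes
  memset(buffer + thread_count, -1, rows_offset - node_count_bytes);  // heap slots
  entry_count_out = n * thread_count;  // rows initialized by initRowGroups afterwards
  return buffer + rows_offset / sizeof(int64_t);                      // first row slot
}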
580 
581 void QueryMemoryInitializer::initRowGroups(const QueryMemoryDescriptor& query_mem_desc,
582  int64_t* groups_buffer,
583  const std::vector<int64_t>& init_vals,
584  TargetAggOpsMetadata& agg_op_metadata,
585  const int32_t groups_buffer_entry_count,
586  const size_t warp_size,
587  const Executor* executor,
588  const RelAlgExecutionUnit& ra_exe_unit) {
589  const size_t key_count{query_mem_desc.getGroupbyColCount()};
590  const size_t row_size{query_mem_desc.getRowSize()};
591  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};
592 
593  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
594  const auto query_mem_desc_fixedup =
595  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);
596  // not COUNT DISTINCT / APPROX_COUNT_DISTINCT / APPROX_QUANTILE
597  // we use the default implementation in those agg ops
598  auto const key_sz = query_mem_desc.getEffectiveKeyWidth();
599  if (!(agg_op_metadata.has_count_distinct || agg_op_metadata.has_mode ||
600  agg_op_metadata.has_tdigest) &&
602  std::vector<int8_t> sample_row(row_size - col_base_off);
603  auto const num_available_cpu_threads =
604  std::min(query_mem_desc.getAvailableCpuThreads(),
605  static_cast<size_t>(std::max(cpu_threads(), 1)));
606  tbb::task_arena initialization_arena(num_available_cpu_threads);
607 
608  initColumnsPerRow(
609  query_mem_desc_fixedup, sample_row.data(), init_vals, agg_op_metadata);
610 
611  if (query_mem_desc.hasKeylessHash()) {
612  CHECK(warp_size >= 1);
613  CHECK(key_count == 1 || warp_size == 1);
614  initialization_arena.execute([&] {
615  tbb::parallel_for(
616  tbb::blocked_range<size_t>(0, groups_buffer_entry_count * warp_size),
617  [&](const tbb::blocked_range<size_t>& r) {
618  auto cur_row_buf = buffer_ptr + (row_size * r.begin());
619  for (size_t i = r.begin(); i != r.end(); ++i, cur_row_buf += row_size) {
620  memcpy(cur_row_buf + col_base_off, sample_row.data(), sample_row.size());
621  }
622  });
623  });
624  return;
625  }
626  initialization_arena.execute([&] {
627  tbb::parallel_for(
628  tbb::blocked_range<size_t>(0, groups_buffer_entry_count),
629  [&](const tbb::blocked_range<size_t>& r) {
630  auto cur_row_buf = buffer_ptr + (row_size * r.begin());
631  for (size_t i = r.begin(); i != r.end(); ++i, cur_row_buf += row_size) {
632  memcpy(cur_row_buf + col_base_off, sample_row.data(), sample_row.size());
633  result_set::fill_empty_key(cur_row_buf, key_count, key_sz);
634  }
635  });
636  });
637  } else {
638  // todo(yoonmin): allow parallelization of `initColumnsPerRow`
639  if (query_mem_desc.hasKeylessHash()) {
640  CHECK(warp_size >= 1);
641  CHECK(key_count == 1 || warp_size == 1);
642  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
643  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
644  ++bin, buffer_ptr += row_size) {
645  initColumnsPerRow(query_mem_desc_fixedup,
646  &buffer_ptr[col_base_off],
647  init_vals,
648  agg_op_metadata);
649  }
650  }
651  return;
652  }
653 
654  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
655  ++bin, buffer_ptr += row_size) {
656  result_set::fill_empty_key(
657  buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
658  initColumnsPerRow(
659  query_mem_desc_fixedup, &buffer_ptr[col_base_off], init_vals, agg_op_metadata);
660  }
661  }
662 }
663 
664 namespace {
665 
666 template <typename T>
667 int8_t* initColumnarBuffer(T* buffer_ptr, const T init_val, const uint32_t entry_count) {
668  static_assert(sizeof(T) <= sizeof(int64_t), "Unsupported template type");
669  for (uint32_t i = 0; i < entry_count; ++i) {
670  buffer_ptr[i] = init_val;
671  }
672  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
673 }
674 
675 } // namespace
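// A short usage sketch for initColumnarBuffer above: it fills entry_count slots of
// width sizeof(T) with init_val and returns the first byte past them, which is how
// initColumnarGroups below walks one packed column after the next. Hypothetical
// example (assumes <cstdint> and <vector>):
inline void example_init_columnar_buffer() {
  std::vector<int64_t> storage(4, 0);  // 32 bytes of backing store
  auto* buffer_ptr = reinterpret_cast<int8_t*>(storage.data());
  // fill four int32_t slots with -1; buffer_ptr then points 16 bytes past storage.data()
  buffer_ptr = initColumnarBuffer<int32_t>(
      reinterpret_cast<int32_t*>(buffer_ptr), int32_t(-1), 4);
  (void)buffer_ptr;  // silence unused-variable warnings in this sketch
}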
676 
677 void QueryMemoryInitializer::initColumnarGroups(
678  const QueryMemoryDescriptor& query_mem_desc,
679  int64_t* groups_buffer,
680  const std::vector<int64_t>& init_vals,
681  const Executor* executor,
682  const RelAlgExecutionUnit& ra_exe_unit) {
683  CHECK(groups_buffer);
684 
685  for (const auto target_expr : ra_exe_unit.target_exprs) {
686  const auto agg_info = get_target_info(target_expr, g_bigint_count);
687  CHECK(!is_distinct_target(agg_info));
688  }
689  const int32_t agg_col_count = query_mem_desc.getSlotCount();
690  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
691 
692  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
693  if (!query_mem_desc.hasKeylessHash()) {
694  const size_t key_count{query_mem_desc.getGroupbyColCount()};
695  for (size_t i = 0; i < key_count; ++i) {
696  buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
697  EMPTY_KEY_64,
698  groups_buffer_entry_count);
699  }
700  }
701 
702  if (query_mem_desc.getQueryDescriptionType() != QueryDescriptionType::Projection) {
703  // initializing all aggregate columns:
704  int32_t init_val_idx = 0;
705  for (int32_t i = 0; i < agg_col_count; ++i) {
706  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
707  CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
708  switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
709  case 1:
710  buffer_ptr = initColumnarBuffer<int8_t>(
711  buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
712  break;
713  case 2:
714  buffer_ptr =
715  initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
716  init_vals[init_val_idx++],
717  groups_buffer_entry_count);
718  break;
719  case 4:
720  buffer_ptr =
721  initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
722  init_vals[init_val_idx++],
723  groups_buffer_entry_count);
724  break;
725  case 8:
726  buffer_ptr =
727  initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
728  init_vals[init_val_idx++],
729  groups_buffer_entry_count);
730  break;
731  case 0:
732  break;
733  default:
734  CHECK(false);
735  }
736 
737  buffer_ptr = align_to_int64(buffer_ptr);
738  }
739  }
740  }
741 }
742 
743 void QueryMemoryInitializer::initColumnsPerRow(
744  const QueryMemoryDescriptor& query_mem_desc,
745  int8_t* row_ptr,
746  const std::vector<int64_t>& init_vals,
747  const TargetAggOpsMetadata& agg_op_metadata) {
748  int8_t* col_ptr = row_ptr;
749  size_t init_vec_idx = 0;
750  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
751  col_ptr += query_mem_desc.getNextColOffInBytesRowOnly(col_ptr, col_idx++)) {
752  int64_t init_val{0};
753  if (query_mem_desc.isGroupBy()) {
754  if (agg_op_metadata.has_count_distinct) {
755  // COUNT DISTINCT / APPROX_COUNT_DISTINCT
756  // create a data structure for count_distinct operator per entries
757  const int64_t bm_sz{agg_op_metadata.count_distinct_buf_size[col_idx]};
758  if (bm_sz) {
759  CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
760  sizeof(int64_t));
761  init_val =
762  bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
763  CHECK_NE(init_val, 0);
764  ++init_vec_idx;
765  }
766  } else if (agg_op_metadata.has_tdigest &&
767  agg_op_metadata.qualtile_params[col_idx]) {
768  auto const q = *agg_op_metadata.qualtile_params[col_idx];
769  // allocate for APPROX_QUANTILE only when slot is used
770  init_val = reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest(q));
771  CHECK_NE(init_val, 0);
772  ++init_vec_idx;
773  } else if (agg_op_metadata.has_mode &&
774  agg_op_metadata.mode_index_set.count(col_idx)) {
775  init_val = reinterpret_cast<int64_t>(row_set_mem_owner_->allocateMode());
776  CHECK_NE(init_val, 0);
777  ++init_vec_idx;
778  }
779  }
780  auto const col_slot_width = query_mem_desc.getPaddedSlotWidthBytes(col_idx);
781  if (init_val == 0 && col_slot_width > 0) {
782  CHECK_LT(init_vec_idx, init_vals.size());
783  init_val = init_vals[init_vec_idx++];
784  }
785  switch (col_slot_width) {
786  case 1:
787  *col_ptr = static_cast<int8_t>(init_val);
788  break;
789  case 2:
790  *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
791  break;
792  case 4:
793  *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
794  break;
795  case 8:
796  *reinterpret_cast<int64_t*>(col_ptr) = init_val;
797  break;
798  case 0:
799  continue;
800  default:
801  CHECK(false);
802  }
803  }
804 }
805 
806 void QueryMemoryInitializer::allocateCountDistinctGpuMem(
807  const QueryMemoryDescriptor& query_mem_desc) {
808  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
809  return;
810  }
812 
813  size_t total_bytes_per_entry{0};
814  const size_t num_count_distinct_descs =
815  query_mem_desc.getCountDistinctDescriptorsSize();
816  for (size_t i = 0; i < num_count_distinct_descs; i++) {
817  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
818  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
819  continue;
820  }
821  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
822  total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
823  }
824 
826  total_bytes_per_entry * query_mem_desc.getEntryCount();
830  reinterpret_cast<int8_t*>(count_distinct_bitmap_device_mem_ptr_),
834 }
835 
836 std::vector<int64_t> QueryMemoryInitializer::calculateCountDistinctBufferSize(
837  const QueryMemoryDescriptor& query_mem_desc,
838  const RelAlgExecutionUnit& ra_exe_unit) const {
839  const size_t agg_col_count{query_mem_desc.getSlotCount()};
840  std::vector<int64_t> agg_bitmap_size(agg_col_count);
841  for (size_t target_idx = 0; target_idx < ra_exe_unit.target_exprs.size();
842  ++target_idx) {
843  const auto target_expr = ra_exe_unit.target_exprs[target_idx];
844  const auto agg_info = get_target_info(target_expr, g_bigint_count);
845  if (is_distinct_target(agg_info)) {
846  const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
847  const auto& count_distinct_desc =
848  query_mem_desc.getCountDistinctDescriptor(target_idx);
849  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
850  const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
851  agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
852  } else {
853  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::UnorderedSet);
854  agg_bitmap_size[agg_col_idx] = -1;
855  }
856  }
857  }
858  return agg_bitmap_size;
859 }
860 
861 void QueryMemoryInitializer::allocateCountDistinctBuffers(
862  const QueryMemoryDescriptor& query_mem_desc,
863  const RelAlgExecutionUnit& ra_exe_unit) {
864  for (size_t target_idx = 0; target_idx < ra_exe_unit.target_exprs.size();
865  ++target_idx) {
866  const auto target_expr = ra_exe_unit.target_exprs[target_idx];
867  const auto agg_info = get_target_info(target_expr, g_bigint_count);
868  if (is_distinct_target(agg_info)) {
869  const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
870  const auto& count_distinct_desc =
871  query_mem_desc.getCountDistinctDescriptor(target_idx);
872  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
873  const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
874  init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
875  } else {
876  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::UnorderedSet);
877  init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
878  }
879  }
880  }
881 }
882 
883 int64_t QueryMemoryInitializer::allocateCountDistinctBitmap(const size_t bitmap_byte_sz) {
884  if (count_distinct_bitmap_host_mem_ptr_) {
885  CHECK(count_distinct_bitmap_host_crt_ptr_);
886  auto ptr = count_distinct_bitmap_host_crt_ptr_;
887  count_distinct_bitmap_host_crt_ptr_ += bitmap_byte_sz;
888  row_set_mem_owner_->addCountDistinctBuffer(
889  ptr, bitmap_byte_sz, /*physical_buffer=*/false);
890  return reinterpret_cast<int64_t>(ptr);
891  }
892  return reinterpret_cast<int64_t>(
893  row_set_mem_owner_->allocateCountDistinctBuffer(bitmap_byte_sz, thread_idx_));
894 }
895 
896 int64_t QueryMemoryInitializer::allocateCountDistinctSet() {
897  auto count_distinct_set = new CountDistinctSet();
898  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
899  return reinterpret_cast<int64_t>(count_distinct_set);
900 }
901 
902 namespace {
903 
904 void eachAggregateTargetIdxOfType(
905  std::vector<Analyzer::Expr*> const& target_exprs,
906  SQLAgg const agg_type,
907  std::function<void(Analyzer::AggExpr const*, size_t)> lambda) {
908  for (size_t target_idx = 0; target_idx < target_exprs.size(); ++target_idx) {
909  auto const target_expr = target_exprs[target_idx];
910  if (auto const* agg_expr = dynamic_cast<Analyzer::AggExpr const*>(target_expr)) {
911  if (agg_expr->get_aggtype() == agg_type) {
912  lambda(agg_expr, target_idx);
913  }
914  }
915  }
916 }
917 
918 } // namespace
919 
920 QueryMemoryInitializer::ModeIndexSet QueryMemoryInitializer::initializeModeIndexSet(
921  const QueryMemoryDescriptor& query_mem_desc,
922  const RelAlgExecutionUnit& ra_exe_unit) {
923  size_t const slot_count = query_mem_desc.getSlotCount();
924  CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count);
925  ModeIndexSet mode_index_set;
926  eachAggregateTargetIdxOfType(
927  ra_exe_unit.target_exprs,
928  kMODE,
929  [&](Analyzer::AggExpr const*, size_t const target_idx) {
930  size_t const agg_col_idx =
931  query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
932  CHECK_LT(agg_col_idx, slot_count);
933  mode_index_set.emplace(agg_col_idx);
934  });
935  return mode_index_set;
936 }
937 
938 void QueryMemoryInitializer::allocateModeBuffer(
939  const QueryMemoryDescriptor& query_mem_desc,
940  const RelAlgExecutionUnit& ra_exe_unit) {
941  size_t const slot_count = query_mem_desc.getSlotCount();
942  CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count);
943  eachAggregateTargetIdxOfType(
944  ra_exe_unit.target_exprs,
945  kMODE,
946  [&](Analyzer::AggExpr const*, size_t const target_idx) {
947  size_t const agg_col_idx =
948  query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
949  CHECK_LT(agg_col_idx, slot_count);
950  AggMode* agg_mode = row_set_mem_owner_->allocateMode();
951  init_agg_vals_[agg_col_idx] = reinterpret_cast<int64_t>(agg_mode);
952  });
953 }
954 
955 std::vector<QueryMemoryInitializer::QuantileParam>
956 QueryMemoryInitializer::initializeQuantileParams(
957  const QueryMemoryDescriptor& query_mem_desc,
958  const RelAlgExecutionUnit& ra_exe_unit) {
959  size_t const slot_count = query_mem_desc.getSlotCount();
960  CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count);
961  std::vector<QuantileParam> quantile_params(slot_count);
962  eachAggregateTargetIdxOfType(
963  ra_exe_unit.target_exprs,
964  kAPPROX_QUANTILE,
965  [&](Analyzer::AggExpr const* const agg_expr, size_t const target_idx) {
966  size_t const agg_col_idx =
967  query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
968  CHECK_LT(agg_col_idx, slot_count);
969  CHECK_EQ(static_cast<int8_t>(sizeof(int64_t)),
970  query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx));
971  auto const q_expr =
972  dynamic_cast<Analyzer::Constant const*>(agg_expr->get_arg1().get());
973  CHECK(q_expr);
974  quantile_params[agg_col_idx] = q_expr->get_constval().doubleval;
975  });
976  return quantile_params;
977 }
978 
979 void QueryMemoryInitializer::allocateTDigestsBuffer(
980  const QueryMemoryDescriptor& query_mem_desc,
981  const RelAlgExecutionUnit& ra_exe_unit) {
982  size_t const slot_count = query_mem_desc.getSlotCount();
983  CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count);
984  eachAggregateTargetIdxOfType(
985  ra_exe_unit.target_exprs,
986  kAPPROX_QUANTILE,
987  [&](Analyzer::AggExpr const* const agg_expr, size_t const target_idx) {
988  size_t const agg_col_idx =
989  query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
990  CHECK_LT(agg_col_idx, slot_count);
991  CHECK_EQ(static_cast<int8_t>(sizeof(int64_t)),
992  query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx));
993  auto const q_expr =
994  dynamic_cast<Analyzer::Constant const*>(agg_expr->get_arg1().get());
995  CHECK(q_expr);
996  auto const q = q_expr->get_constval().doubleval;
997  // allocate for APPROX_QUANTILE only when slot is used
998  init_agg_vals_[agg_col_idx] =
999  reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest(q));
1000  });
1001 }
1002 
1003 GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer(
1004  const QueryMemoryDescriptor& query_mem_desc,
1005  const int8_t* init_agg_vals_dev_ptr,
1006  const size_t n,
1007  const int device_id,
1008  const unsigned block_size_x,
1009  const unsigned grid_size_x) {
1010 #ifdef HAVE_CUDA
1012  const auto thread_count = block_size_x * grid_size_x;
1013  const auto total_buff_size =
1014  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
1015  int8_t* dev_buffer = device_allocator_->alloc(total_buff_size);
1016 
1017  std::vector<int8_t*> dev_buffers(thread_count);
1018 
1019  for (size_t i = 0; i < thread_count; ++i) {
1020  dev_buffers[i] = dev_buffer;
1021  }
1022 
1023  auto dev_ptr = device_allocator_->alloc(thread_count * sizeof(int8_t*));
1024  device_allocator_->copyToDevice(
1025  dev_ptr, dev_buffers.data(), thread_count * sizeof(int8_t*));
1026 
1027  CHECK(query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU));
1028 
1029  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(dev_buffer),
1030  thread_count * sizeof(int64_t));
1031 
1033  reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
1034  (unsigned char)-1,
1035  thread_count * n * sizeof(int64_t));
1036 
1037  init_group_by_buffer_on_device(
1038  reinterpret_cast<int64_t*>(
1039  dev_buffer + streaming_top_n::get_rows_offset_of_heaps(n, thread_count)),
1040  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
1041  n * thread_count,
1042  query_mem_desc.getGroupbyColCount(),
1043  query_mem_desc.getEffectiveKeyWidth(),
1044  query_mem_desc.getRowSize() / sizeof(int64_t),
1045  query_mem_desc.hasKeylessHash(),
1046  1,
1047  block_size_x,
1048  grid_size_x);
1049 
1050  return {dev_ptr, dev_buffer};
1051 #else
1052  UNREACHABLE();
1053  return {};
1054 #endif
1055 }
1056 
1057 GpuGroupByBuffers QueryMemoryInitializer::createAndInitializeGroupByBufferGpu(
1058  const RelAlgExecutionUnit& ra_exe_unit,
1059  const QueryMemoryDescriptor& query_mem_desc,
1060  const int8_t* init_agg_vals_dev_ptr,
1061  const int device_id,
1062  const ExecutorDispatchMode dispatch_mode,
1063  const unsigned block_size_x,
1064  const unsigned grid_size_x,
1065  const int8_t warp_size,
1066  const bool can_sort_on_gpu,
1067  const bool output_columnar,
1068  RenderAllocator* render_allocator) {
1069 #ifdef HAVE_CUDA
1070  if (query_mem_desc.useStreamingTopN()) {
1071  if (render_allocator) {
1073  }
1074  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit.value_or(0);
1075  CHECK(!output_columnar);
1076 
1077  return prepareTopNHeapsDevBuffer(
1078  query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
1079  }
1080 
1081  auto dev_group_by_buffers =
1082  create_dev_group_by_buffers(device_allocator_,
1083  group_by_buffers_,
1084  query_mem_desc,
1085  block_size_x,
1086  grid_size_x,
1087  device_id,
1088  dispatch_mode,
1089  num_rows_,
1090  can_sort_on_gpu,
1091  false,
1092  ra_exe_unit.use_bump_allocator,
1093  query_mem_desc.hasVarlenOutput(),
1094  render_allocator);
1095  if (query_mem_desc.hasVarlenOutput()) {
1096  CHECK(dev_group_by_buffers.varlen_output_buffer);
1097  varlen_output_buffer_ =
1098  reinterpret_cast<CUdeviceptr>(dev_group_by_buffers.varlen_output_buffer);
1099  CHECK(query_mem_desc.varlenOutputBufferElemSize());
1100  const size_t varlen_output_buf_bytes =
1101  query_mem_desc.getEntryCount() *
1102  query_mem_desc.varlenOutputBufferElemSize().value();
1103  varlen_output_buffer_host_ptr_ =
1104  row_set_mem_owner_->allocate(varlen_output_buf_bytes, thread_idx_);
1106  varlen_output_info_->gpu_start_address = static_cast<int64_t>(varlen_output_buffer_);
1108  }
1109  if (render_allocator) {
1110  CHECK_EQ(size_t(0), render_allocator->getAllocatedSize() % 8);
1111  }
1112  if (query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU)) {
1113  CHECK(!render_allocator);
1114 
1115  const size_t step{query_mem_desc.threadsShareMemory() ? block_size_x : 1};
1116  size_t groups_buffer_size{query_mem_desc.getBufferSizeBytes(
1117  ExecutorDeviceType::GPU, dev_group_by_buffers.entry_count)};
1118  auto group_by_dev_buffer = dev_group_by_buffers.data;
1119  const size_t col_count = query_mem_desc.getSlotCount();
1120  int8_t* col_widths_dev_ptr{nullptr};
1121  if (output_columnar) {
1122  std::vector<int8_t> compact_col_widths(col_count);
1123  for (size_t idx = 0; idx < col_count; ++idx) {
1124  compact_col_widths[idx] = query_mem_desc.getPaddedSlotWidthBytes(idx);
1125  }
1126  col_widths_dev_ptr = device_allocator_->alloc(col_count * sizeof(int8_t));
1127  device_allocator_->copyToDevice(
1128  col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
1129  }
1130  const int8_t warp_count =
1131  query_mem_desc.interleavedBins(ExecutorDeviceType::GPU) ? warp_size : 1;
1132  const auto num_group_by_buffers =
1133  getGroupByBuffersSize() - (query_mem_desc.hasVarlenOutput() ? 1 : 0);
1134  for (size_t i = 0; i < num_group_by_buffers; i += step) {
1135  if (output_columnar) {
1136  init_columnar_group_by_buffer_on_device(
1137  reinterpret_cast<int64_t*>(group_by_dev_buffer),
1138  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
1139  dev_group_by_buffers.entry_count,
1140  query_mem_desc.getGroupbyColCount(),
1141  col_count,
1142  col_widths_dev_ptr,
1143  /*need_padding = */ true,
1144  query_mem_desc.hasKeylessHash(),
1145  sizeof(int64_t),
1146  block_size_x,
1147  grid_size_x);
1148  } else {
1149  init_group_by_buffer_on_device(
1150  reinterpret_cast<int64_t*>(group_by_dev_buffer),
1151  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
1152  dev_group_by_buffers.entry_count,
1153  query_mem_desc.getGroupbyColCount(),
1154  query_mem_desc.getEffectiveKeyWidth(),
1155  query_mem_desc.getRowSize() / sizeof(int64_t),
1156  query_mem_desc.hasKeylessHash(),
1157  warp_count,
1158  block_size_x,
1159  grid_size_x);
1160  }
1161  group_by_dev_buffer += groups_buffer_size;
1162  }
1163  }
1164  return dev_group_by_buffers;
1165 #else
1166  UNREACHABLE();
1167  return {};
1168 #endif
1169 }
1170 
1171 GpuGroupByBuffers QueryMemoryInitializer::setupTableFunctionGpuBuffers(
1172  const QueryMemoryDescriptor& query_mem_desc,
1173  const int device_id,
1174  const unsigned block_size_x,
1175  const unsigned grid_size_x,
1176  const bool zero_initialize_buffers) {
1177  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
1178  CHECK_GT(num_columns, size_t(0));
1179  size_t total_group_by_buffer_size{0};
1180  const auto col_slot_context = query_mem_desc.getColSlotContext();
1181 
1182  std::vector<size_t> col_byte_offsets;
1183  col_byte_offsets.reserve(num_columns);
1184 
1185  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
1186  const size_t col_width = col_slot_context.getSlotInfo(col_idx).logical_size;
1187  size_t group_buffer_size = num_rows_ * col_width;
1188  col_byte_offsets.emplace_back(total_group_by_buffer_size);
1189  total_group_by_buffer_size =
1190  align_to_int64(total_group_by_buffer_size + group_buffer_size);
1191  }
1192 
1193  int8_t* dev_buffers_allocation{nullptr};
1194  dev_buffers_allocation = device_allocator_->alloc(total_group_by_buffer_size);
1195  CHECK(dev_buffers_allocation);
1196  if (zero_initialize_buffers) {
1197  device_allocator_->zeroDeviceMem(dev_buffers_allocation, total_group_by_buffer_size);
1198  }
1199 
1200  auto dev_buffers_mem = dev_buffers_allocation;
1201  std::vector<int8_t*> dev_buffers(num_columns);
1202  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
1203  dev_buffers[col_idx] = dev_buffers_allocation + col_byte_offsets[col_idx];
1204  }
1205  auto dev_ptrs = device_allocator_->alloc(num_columns * sizeof(CUdeviceptr));
1206  device_allocator_->copyToDevice(
1207  dev_ptrs, dev_buffers.data(), num_columns * sizeof(CUdeviceptr));
1208 
1209  return {dev_ptrs, dev_buffers_mem, (size_t)num_rows_};
1210 }
1211 
1213  Data_Namespace::DataMgr* data_mgr,
1214  const QueryMemoryDescriptor& query_mem_desc,
1215  const size_t entry_count,
1216  const GpuGroupByBuffers& gpu_group_by_buffers,
1217  const int device_id,
1218  const unsigned block_size_x,
1219  const unsigned grid_size_x) {
1220  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
1221 
1222  int8_t* dev_buffer = gpu_group_by_buffers.data;
1223  int8_t* host_buffer = reinterpret_cast<int8_t*>(group_by_buffers_[0]);
1224 
1225  const size_t original_entry_count = gpu_group_by_buffers.entry_count;
1226  CHECK_LE(entry_count, original_entry_count);
1227  size_t output_device_col_offset{0};
1228  size_t output_host_col_offset{0};
1229 
1230  const auto col_slot_context = query_mem_desc.getColSlotContext();
1231 
1232  auto allocator = std::make_unique<CudaAllocator>(
1233  data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));
1234 
1235  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
1236  const size_t col_width = col_slot_context.getSlotInfo(col_idx).logical_size;
1237  const size_t output_device_col_size = original_entry_count * col_width;
1238  const size_t output_host_col_size = entry_count * col_width;
1239  allocator->copyFromDevice(host_buffer + output_host_col_offset,
1240  dev_buffer + output_device_col_offset,
1241  output_host_col_size);
1242  output_device_col_offset =
1243  align_to_int64(output_device_col_offset + output_device_col_size);
1244  output_host_col_offset =
1245  align_to_int64(output_host_col_offset + output_host_col_size);
1246  }
1247 }
1248 
1249 size_t QueryMemoryInitializer::computeNumberOfBuffers(
1250  const QueryMemoryDescriptor& query_mem_desc,
1251  const ExecutorDeviceType device_type,
1252  const Executor* executor) const {
1253  return device_type == ExecutorDeviceType::CPU
1254  ? 1
1255  : executor->blockSize() *
1256  (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
1257 }
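// The buffer count above in isolation: 1 on CPU; on GPU, blockSize() buffers when all
// blocks share one memory arena (blocksShareMemory()), otherwise blockSize() *
// gridSize(). A sketch with hypothetical launch dimensions (assumes <cstddef>):
inline size_t example_number_of_buffers(const bool is_cpu,
                                        const bool blocks_share_memory,
                                        const size_t block_size,
                                        const size_t grid_size) {
  return is_cpu ? size_t(1)
                : block_size * (blocks_share_memory ? size_t(1) : grid_size);
}
// e.g. block_size 128, grid_size 16: 128 buffers if blocks share memory, 2048 if not.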
1258 
1259 namespace {
1260 
1261 // in-place compaction of output buffer
1262 void compact_projection_buffer_for_cpu_columnar(
1263  const QueryMemoryDescriptor& query_mem_desc,
1264  int8_t* projection_buffer,
1265  const size_t projection_count) {
1266  // the first column (row indices) remains unchanged.
1267  CHECK(projection_count <= query_mem_desc.getEntryCount());
1268  constexpr size_t row_index_width = sizeof(int64_t);
1269  size_t buffer_offset1{projection_count * row_index_width};
1270  // other columns are actual non-lazy columns for the projection:
1271  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
1272  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
1273  auto column_proj_size =
1274  projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
1275  auto buffer_offset2 = query_mem_desc.getColOffInBytes(i);
1276  if (buffer_offset1 + column_proj_size >= buffer_offset2) {
1277  // overlapping
1278  std::memmove(projection_buffer + buffer_offset1,
1279  projection_buffer + buffer_offset2,
1280  column_proj_size);
1281  } else {
1282  std::memcpy(projection_buffer + buffer_offset1,
1283  projection_buffer + buffer_offset2,
1284  column_proj_size);
1285  }
1286  buffer_offset1 += align_to_int64(column_proj_size);
1287  }
1288  }
1289 }
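// A worked example of the compaction above, with hypothetical numbers: say
// entry_count = 1000 but only projection_count = 10 rows matched. The 8-byte row-index
// column stays where it is, so the first payload column is moved down to byte
// 10 * 8 = 80, and every following column is packed right behind the previous one,
// each packed column's size rounded up to an 8-byte boundary. memmove handles the
// overlapping case, memcpy the disjoint one. The packed-offset arithmetic in isolation
// (hypothetical helper, assumes <cstddef>):
inline size_t example_next_packed_offset(const size_t prev_packed_offset,
                                         const size_t prev_column_bytes) {
  // mirrors buffer_offset1 += align_to_int64(column_proj_size) in the loop above
  return prev_packed_offset + ((prev_column_bytes + 7) & ~size_t(7));
}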
1290 
1291 } // namespace
1292 
1293 void QueryMemoryInitializer::compactProjectionBuffersCpu(
1294  const QueryMemoryDescriptor& query_mem_desc,
1295  const size_t projection_count) {
1296  const auto num_allocated_rows =
1297  std::min(projection_count, query_mem_desc.getEntryCount());
1298  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1299 
1300  // copy the results from the main buffer into projection_buffer
1301  compact_projection_buffer_for_cpu_columnar(
1302  query_mem_desc,
1303  reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
1304  num_allocated_rows);
1305 
1306  // update the entry count for the result set, and its underlying storage
1307  CHECK(!result_sets_.empty());
1308  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
1309 }
1310 
1311 void QueryMemoryInitializer::compactProjectionBuffersGpu(
1312  const QueryMemoryDescriptor& query_mem_desc,
1313  Data_Namespace::DataMgr* data_mgr,
1314  const GpuGroupByBuffers& gpu_group_by_buffers,
1315  const size_t projection_count,
1316  const int device_id) {
1317  // store total number of allocated rows:
1318  const auto num_allocated_rows =
1319  std::min(projection_count, query_mem_desc.getEntryCount());
1320 
1321  // copy the results from the main buffer into projection_buffer
1322  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1324  data_mgr,
1325  gpu_group_by_buffers,
1326  query_mem_desc,
1327  reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
1328  num_allocated_rows,
1329  device_id);
1330 
1331  // update the entry count for the result set, and its underlying storage
1332  CHECK(!result_sets_.empty());
1333  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
1334 }
1335 
1336 void QueryMemoryInitializer::copyGroupByBuffersFromGpu(
1337  DeviceAllocator& device_allocator,
1338  const QueryMemoryDescriptor& query_mem_desc,
1339  const size_t entry_count,
1340  const GpuGroupByBuffers& gpu_group_by_buffers,
1341  const RelAlgExecutionUnit* ra_exe_unit,
1342  const unsigned block_size_x,
1343  const unsigned grid_size_x,
1344  const int device_id,
1345  const bool prepend_index_buffer) const {
1346  const auto thread_count = block_size_x * grid_size_x;
1347 
1348  size_t total_buff_size{0};
1349  if (ra_exe_unit && query_mem_desc.useStreamingTopN()) {
1350  const size_t n =
1351  ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit.value_or(0);
1352  total_buff_size =
1353  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
1354  } else {
1355  total_buff_size =
1356  query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
1357  }
1358  copy_group_by_buffers_from_gpu(device_allocator,
1359  group_by_buffers_,
1360  total_buff_size,
1361  gpu_group_by_buffers.data,
1362  query_mem_desc,
1363  block_size_x,
1364  grid_size_x,
1365  device_id,
1366  prepend_index_buffer,
1367  query_mem_desc.hasVarlenOutput());
1368 }
1369 
1370 void QueryMemoryInitializer::applyStreamingTopNOffsetCpu(
1371  const QueryMemoryDescriptor& query_mem_desc,
1372  const RelAlgExecutionUnit& ra_exe_unit) {
1373  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1374  CHECK_EQ(group_by_buffers_.size(), buffer_start_idx + 1);
1375 
1376  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
1377  group_by_buffers_[buffer_start_idx],
1378  query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
1379  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit.value_or(0),
1380  1);
1381  CHECK_EQ(rows_copy.size(),
1382  query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
1383  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
1384 }
1385 
1386 void QueryMemoryInitializer::applyStreamingTopNOffsetGpu(
1387  Data_Namespace::DataMgr* data_mgr,
1388  const QueryMemoryDescriptor& query_mem_desc,
1389  const GpuGroupByBuffers& gpu_group_by_buffers,
1390  const RelAlgExecutionUnit& ra_exe_unit,
1391  const unsigned total_thread_count,
1392  const int device_id) {
1393 #ifdef HAVE_CUDA
1395  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1396 
1397  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
1398  data_mgr,
1399  reinterpret_cast<int64_t*>(gpu_group_by_buffers.data),
1400  ra_exe_unit,
1401  query_mem_desc,
1402  total_thread_count,
1403  device_id);
1404  CHECK_EQ(
1405  rows_copy.size(),
1406  static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
1407  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
1408 #else
1409  UNREACHABLE();
1410 #endif
1411 }
1412 
1413 std::shared_ptr<VarlenOutputInfo> QueryMemoryInitializer::getVarlenOutputInfo() {
1414  if (varlen_output_info_) {
1415  return varlen_output_info_;
1416  }
1417 
1418  // shared_ptr so that both the ResultSet and QMI can hold on to the varlen info object
1419  // and update it as needed
1420  varlen_output_info_ = std::make_shared<VarlenOutputInfo>(VarlenOutputInfo{
1421  static_cast<int64_t>(varlen_output_buffer_), varlen_output_buffer_host_ptr_});
1422  return varlen_output_info_;
1423 }
GpuGroupByBuffers setupTableFunctionGpuBuffers(const QueryMemoryDescriptor &query_mem_desc, const int device_id, const unsigned block_size_x, const unsigned grid_size_x, const bool zero_initialize_buffers)
ModeIndexSet initializeModeIndexSet(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
std::vector< Analyzer::Expr * > target_exprs
SQLAgg
Definition: sqldefs.h:73
#define CHECK_EQ(x, y)
Definition: Logger.h:301
size_t getBufferSizeBytes(const RelAlgExecutionUnit &ra_exe_unit, const unsigned thread_count, const ExecutorDeviceType device_type) const
GpuGroupByBuffers create_dev_group_by_buffers(DeviceAllocator *device_allocator, const std::vector< int64_t * > &group_by_buffers, const QueryMemoryDescriptor &query_mem_desc, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const ExecutorDispatchMode dispatch_mode, const int64_t num_input_rows, const bool prepend_index_buffer, const bool always_init_group_by_on_host, const bool use_bump_allocator, const bool has_varlen_output, Allocator *insitu_allocator)
Definition: GpuMemUtils.cpp:70
RenderAllocator * getRenderAllocator(size_t device_id)
robin_hood::unordered_set< int64_t > CountDistinctSet
Definition: CountDistinct.h:35
bool countDistinctDescriptorsLogicallyEmpty() const
bool useCudaBuffers() const
Definition: RenderInfo.cpp:54
CUdeviceptr count_distinct_bitmap_device_mem_ptr_
#define EMPTY_KEY_64
int8_t logical_size
GpuGroupByBuffers prepareTopNHeapsDevBuffer(const QueryMemoryDescriptor &query_mem_desc, const int8_t *init_agg_vals_dev_ptr, const size_t n, const int device_id, const unsigned block_size_x, const unsigned grid_size_x)
QueryMemoryInitializer::TargetAggOpsMetadata collect_target_expr_metadata(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
GpuGroupByBuffers createAndInitializeGroupByBufferGpu(const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int8_t *init_agg_vals_dev_ptr, const int device_id, const ExecutorDispatchMode dispatch_mode, const unsigned block_size_x, const unsigned grid_size_x, const int8_t warp_size, const bool can_sort_on_gpu, const bool output_columnar, RenderAllocator *render_allocator)
boost::multiprecision::number< boost::multiprecision::cpp_int_backend< 64, 64, boost::multiprecision::signed_magnitude, boost::multiprecision::checked, void >> checked_int64_t
void compact_projection_buffer_for_cpu_columnar(const QueryMemoryDescriptor &query_mem_desc, int8_t *projection_buffer, const size_t projection_count)
DeviceAllocator * device_allocator_
size_t getAvailableCpuThreads() const
count_distinct_bitmap_host_mem_ptr_(nullptr)
void sort_on_gpu(int64_t *val_buff, int32_t *idx_buff, const uint64_t entry_count, const bool desc, const uint32_t chosen_bytes, ThrustAllocator &alloc, const int device_id)
const std::optional< bool > union_all
Streaming Top N algorithm.
size_t get_rows_offset_of_heaps(const size_t n, const size_t thread_count)
void allocateCountDistinctBuffers(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
QueryMemoryInitializer(const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int device_id, const ExecutorDeviceType device_type, const ExecutorDispatchMode dispatch_mode, const bool output_columnar, const bool sort_on_gpu, const shared::TableKey &outer_table_key, const int64_t num_rows, const std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, RenderAllocatorMap *render_allocator_map, RenderInfo *render_info, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, DeviceAllocator *gpu_allocator, const size_t thread_idx, const Executor *executor)
unsigned long long CUdeviceptr
Definition: nocuda.h:28
int8_t * allocate(const size_t num_bytes, const size_t thread_idx=0) override
std::vector< InputDescriptor > input_descs
#define UNREACHABLE()
Definition: Logger.h:338
#define CHECK_GE(x, y)
Definition: Logger.h:306
void init_columnar_group_by_buffer_on_device(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t agg_col_count, const int8_t *col_sizes, const bool need_padding, const bool keyless, const int8_t key_size, const size_t block_size_x, const size_t grid_size_x)
varlen_output_buffer_(0)
count_distinct_bitmap_device_mem_ptr_(0)
void check_total_bitmap_memory(const QueryMemoryDescriptor &query_mem_desc)
virtual int8_t * alloc(const size_t num_bytes)=0
size_t getEffectiveKeyWidth() const
void eachAggregateTargetIdxOfType(std::vector< Analyzer::Expr * > const &target_exprs, SQLAgg const agg_type, std::function< void(Analyzer::AggExpr const *, size_t)> lambda)
num_buffers_(1)
#define CHECK_GT(x, y)
Definition: Logger.h:305
int8_t * initColumnarBuffer(T *buffer_ptr, const T init_val, const uint32_t entry_count)
count_distinct_bitmap_mem_size_(0)
TargetInfo get_target_info(const Analyzer::Expr *target_expr, const bool bigint_count)
Definition: TargetInfo.h:92
ExecutorDeviceType
size_t computeNumberOfBuffers(const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const Executor *executor) const
std::vector< QuantileParam > initializeQuantileParams(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
std::pair< int64_t *, bool > allocateCachedGroupByBuffer(const size_t num_bytes, const size_t thread_idx)
void check_count_distinct_expr_metadata(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
varlen_output_buffer_host_ptr_(nullptr)
void init_group_by_buffer_on_device(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t key_width, const uint32_t row_size_quad, const bool keyless, const int8_t warp_size, const size_t block_size_x, const size_t grid_size_x)
const SlotSize & getSlotInfo(const size_t slot_idx) const
std::vector< Analyzer::Expr * > target_exprs_union
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
ExecutorDispatchMode
void compactProjectionBuffersGpu(const QueryMemoryDescriptor &query_mem_desc, Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_group_by_buffers, const size_t projection_count, const int device_id)
std::pair< int64_t *, bool > alloc_group_by_buffer(const size_t numBytes, RenderAllocatorMap *render_allocator_map, const size_t thread_idx, RowSetMemoryOwner *mem_owner, const bool reuse_existing_buffer_for_thread)
virtual void copyToDevice(void *device_dst, const void *host_src, const size_t num_bytes) const =0
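The allocator interface referenced on this page (alloc, zeroDeviceMem, copyToDevice, setDeviceMem) supports the usual allocate/clear/upload pattern. A usage sketch, written as a template so it does not assume a header path; the helper name is hypothetical:

#include <cstddef>
#include <cstdint>

template <typename Allocator>
int8_t* upload_to_device_sketch(Allocator& device_allocator,
                                const void* host_src,
                                const size_t num_bytes) {
  int8_t* device_ptr = device_allocator.alloc(num_bytes);          // raw device allocation
  device_allocator.zeroDeviceMem(device_ptr, num_bytes);           // clear before use
  device_allocator.copyToDevice(device_ptr, host_src, num_bytes);  // host -> device copy
  return device_ptr;
}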
std::vector< int64_t > calculateCountDistinctBufferSize(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit) const
std::vector< int64_t > init_agg_vals_
size_t getGroupbyColCount() const
#define CHECK_NE(x, y)
Definition: Logger.h:302
void applyStreamingTopNOffsetCpu(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
void fill_empty_key(void *key_ptr, const size_t key_count, const size_t key_width)
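fill_empty_key marks key_count group-by key slots of the given width as empty. A conceptual sketch under the assumption of 4- or 8-byte keys; the sentinel values here are hypothetical, as the engine defines its own empty-key constants:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <limits>

void fill_empty_key_sketch(void* key_ptr, const size_t key_count, const size_t key_width) {
  if (key_width == sizeof(int64_t)) {
    auto* keys = static_cast<int64_t*>(key_ptr);
    std::fill(keys, keys + key_count, std::numeric_limits<int64_t>::max());  // hypothetical sentinel
  } else {
    auto* keys = static_cast<int32_t*>(key_ptr);
    std::fill(keys, keys + key_count, std::numeric_limits<int32_t>::max());  // hypothetical sentinel
  }
}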
virtual void zeroDeviceMem(int8_t *device_ptr, const size_t num_bytes) const =0
bool lazyInitGroups(const ExecutorDeviceType) const
bool threadsCanReuseGroupByBuffers() const
std::optional< size_t > limit
count_distinct_bitmap_host_crt_ptr_(nullptr)
bool g_bigint_count
int64_t g_bitmap_memory_limit
size_t g_max_memory_allocation_size
Definition: Execute.cpp:124
size_t getAllocatedSize() const
bool is_distinct_target(const TargetInfo &target_info)
Definition: TargetInfo.h:102
const int8_t getPaddedSlotWidthBytes(const size_t slot_idx) const
const std::shared_ptr< Analyzer::Estimator > estimator
void initColumnarGroups(const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const Executor *executor, const RelAlgExecutionUnit &ra_exe_unit)
size_t getCountDistinctDescriptorsSize() const
QueryDescriptionType getQueryDescriptionType() const
void compactProjectionBuffersCpu(const QueryMemoryDescriptor &query_mem_desc, const size_t projection_count)
std::shared_ptr< Analyzer::Expr > get_arg1() const
Definition: Analyzer.h:1333
std::vector< int64_t * > group_by_buffers_
void initColumnsPerRow(const QueryMemoryDescriptor &query_mem_desc, int8_t *row_ptr, const std::vector< int64_t > &init_vals, const TargetAggOpsMetadata &agg_op_metadata)
const CountDistinctDescriptor & getCountDistinctDescriptor(const size_t idx) const
void copyGroupByBuffersFromGpu(DeviceAllocator &device_allocator, const QueryMemoryDescriptor &query_mem_desc, const size_t entry_count, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit *ra_exe_unit, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer) const
std::optional< size_t > varlenOutputBufferElemSize() const
#define CHECK_LT(x, y)
Definition: Logger.h:303
std::shared_ptr< VarlenOutputInfo > getVarlenOutputInfo()
#define CHECK_LE(x, y)
Definition: Logger.h:304
void initGroupByBuffer(int64_t *buffer, const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, TargetAggOpsMetadata &agg_expr_metadata, const ExecutorDeviceType device_type, const bool output_columnar, const Executor *executor)
std::vector< int8_t > get_rows_copy_from_heaps(const int64_t *heaps, const size_t heaps_size, const size_t n, const size_t thread_count)
size_t getNextColOffInBytesRowOnly(const int8_t *col_ptr, const size_t col_idx) const
void allocateTDigestsBuffer(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
void allocateModeBuffer(const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
robin_hood::unordered_set< size_t > ModeIndexSet
std::vector< int64_t > get_consistent_frags_sizes(const std::vector< std::vector< uint64_t >> &frag_offsets)
static QueryMemoryDescriptor fixupQueryMemoryDescriptor(const QueryMemoryDescriptor &)
Definition: ResultSet.cpp:766
#define MAX_BUFFER_SIZE
CUstream getQueryEngineCudaStreamForDevice(int device_num)
Definition: QueryEngine.cpp:7
void parallel_for(const blocked_range< Int > &range, const Body &body, const Partitioner &p=Partitioner())
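The parallel_for declaration above mirrors the TBB-style blocked_range interface. A usage sketch written against TBB's own equivalents (the project wrapper's namespace is not shown on this page) that initializes a buffer in parallel chunks:

#include <tbb/blocked_range.h>
#include <tbb/parallel_for.h>
#include <cstddef>
#include <cstdint>
#include <vector>

void init_entries_parallel(std::vector<int64_t>& buffer, const int64_t init_val) {
  tbb::parallel_for(tbb::blocked_range<size_t>(0, buffer.size()),
                    [&](const tbb::blocked_range<size_t>& r) {
                      for (size_t i = r.begin(); i != r.end(); ++i) {
                        buffer[i] = init_val;  // each chunk writes a disjoint slice
                      }
                    });
}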
size_t get_heap_size(const size_t row_size, const size_t n, const size_t thread_count)
device_allocator_(device_allocator)
bool interleavedBins(const ExecutorDeviceType) const
const ColSlotContext & getColSlotContext() const
#define CHECK(condition)
Definition: Logger.h:291
void copyFromTableFunctionGpuBuffers(Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const size_t entry_count, const GpuGroupByBuffers &gpu_group_by_buffers, const int device_id, const unsigned block_size_x, const unsigned grid_size_x)
void applyStreamingTopNOffsetGpu(Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit &ra_exe_unit, const unsigned total_thread_count, const int device_id)
const auto getGroupByBuffersSize() const
std::vector< TargetInfo > target_exprs_to_infos(const std::vector< Analyzer::Expr * > &targets, const QueryMemoryDescriptor &query_mem_desc)
void copy_projection_buffer_from_gpu_columnar(Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_group_by_buffers, const QueryMemoryDescriptor &query_mem_desc, int8_t *projection_buffer, const size_t projection_count, const int device_id)
bool g_optimize_row_initialization
Definition: Execute.cpp:104
std::shared_ptr< VarlenOutputInfo > varlen_output_info_
constexpr double n
Definition: Utm.h:38
int64_t get_consistent_frag_size(const std::vector< uint64_t > &frag_offsets)
int cpu_threads()
Definition: thread_count.h:25
std::vector< int64_t > init_agg_val_vec(const std::vector< TargetInfo > &targets, const QueryMemoryDescriptor &query_mem_desc)
const int8_t getSlotIndexForSingleSlotCol(const size_t col_idx) const
const int8_t getLogicalSlotWidthBytes(const size_t slot_idx) const
size_t getColOffInBytes(const size_t col_idx) const
void copy_group_by_buffers_from_gpu(DeviceAllocator &device_allocator, const std::vector< int64_t * > &group_by_buffers, const size_t groups_buffer_size, const int8_t *group_by_dev_buffers_mem, const QueryMemoryDescriptor &query_mem_desc, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer, const bool has_varlen_output)
std::vector< std::unique_ptr< ResultSet > > result_sets_
void initRowGroups(const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, TargetAggOpsMetadata &agg_expr_metadata, const int32_t groups_buffer_entry_count, const size_t warp_size, const Executor *executor, const RelAlgExecutionUnit &ra_exe_unit)
virtual void setDeviceMem(int8_t *device_ptr, unsigned char uc, const size_t num_bytes) const =0
void allocateCountDistinctGpuMem(const QueryMemoryDescriptor &query_mem_desc)
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)
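align_to_int64 rounds a size or address up to the next 8-byte boundary. A minimal constexpr equivalent of that rounding:

#include <cstdint>

template <typename T>
constexpr T align_to_int64_sketch(T addr) {
  constexpr T mask = static_cast<T>(sizeof(int64_t) - 1);  // 0x7
  return (addr + mask) & ~mask;                            // round up to a multiple of 8
}

static_assert(align_to_int64_sketch<uint64_t>(13) == 16, "13 rounds up to 16");
static_assert(align_to_int64_sketch<uint64_t>(16) == 16, "already aligned values are unchanged");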
int64_t allocateCountDistinctBitmap(const size_t bitmap_byte_sz)
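allocateCountDistinctBitmap takes the byte size of one bitmap-based COUNT DISTINCT slot. A standalone conceptual sketch (not the engine's code) of how such a bitmap counts distinct values:

#include <bit>
#include <cstddef>
#include <cstdint>
#include <vector>

struct BitmapCountDistinctSketch {
  explicit BitmapCountDistinctSketch(const size_t bitmap_byte_sz) : bits_(bitmap_byte_sz, 0) {}

  void add(const uint64_t val) {
    const uint64_t bit = val % (bits_.size() * 8);             // assumes values map into the bitmap range
    bits_[bit >> 3] |= static_cast<uint8_t>(1u << (bit & 7));  // set the bit for this value
  }

  size_t count() const {
    size_t n = 0;
    for (const uint8_t byte : bits_) {
      n += std::popcount(static_cast<unsigned>(byte));  // distinct count = number of set bits
    }
    return n;
  }

  std::vector<uint8_t> bits_;
};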
int get_input_idx(RelAlgExecutionUnit const &ra_exe_unit, const shared::TableKey &outer_table_key)
std::vector< std::vector< int64_t > > get_col_frag_offsets(const std::vector< Analyzer::Expr * > &target_exprs, const std::vector< std::vector< uint64_t >> &table_frag_offsets)