OmniSciDB  29e35f4d58
QueryMemoryInitializer Class Reference

#include <QueryMemoryInitializer.h>

+ Collaboration diagram for QueryMemoryInitializer:

Public Member Functions

 QueryMemoryInitializer (const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int device_id, const ExecutorDeviceType device_type, const ExecutorDispatchMode dispatch_mode, const bool output_columnar, const bool sort_on_gpu, const int64_t num_rows, const std::vector< std::vector< const int8_t *>> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, RenderAllocatorMap *render_allocator_map, RenderInfo *render_info, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, DeviceAllocator *gpu_allocator, const Executor *executor)
 
 QueryMemoryInitializer (const TableFunctionExecutionUnit &exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int device_id, const ExecutorDeviceType device_type, const int64_t num_rows, const std::vector< std::vector< const int8_t *>> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, DeviceAllocator *device_allocator, const Executor *executor)
 
const auto getCountDistinctBitmapPtr () const
 
const auto getCountDistinctHostPtr () const
 
const auto getCountDistinctBitmapBytes () const
 
ResultSetgetResultSet (const size_t index) const
 
std::unique_ptr< ResultSetgetResultSetOwned (const size_t index)
 
void resetResultSet (const size_t index)
 
int64_t getAggInitValForIndex (const size_t index) const
 
const auto getGroupByBuffersPtr ()
 
const auto getGroupByBuffersSize () const
 
const auto getNumBuffers () const
 
void copyGroupByBuffersFromGpu (Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const size_t entry_count, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit *ra_exe_unit, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer) const
 

Private Member Functions

void initGroups (const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const int32_t groups_buffer_entry_count, const size_t warp_size, const Executor *executor)
 
void initColumnarGroups (const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const Executor *executor)
 
void initColumnPerRow (const QueryMemoryDescriptor &query_mem_desc, int8_t *row_ptr, const size_t bin, const std::vector< int64_t > &init_vals, const std::vector< ssize_t > &bitmap_sizes)
 
void allocateCountDistinctGpuMem (const QueryMemoryDescriptor &query_mem_desc)
 
std::vector< ssize_t > allocateCountDistinctBuffers (const QueryMemoryDescriptor &query_mem_desc, const bool deferred, const Executor *executor)
 
int64_t allocateCountDistinctBitmap (const size_t bitmap_byte_sz)
 
int64_t allocateCountDistinctSet ()
 
size_t computeNumberOfBuffers (const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const Executor *executor) const
 
void compactProjectionBuffersCpu (const QueryMemoryDescriptor &query_mem_desc, const size_t projection_count)
 
void compactProjectionBuffersGpu (const QueryMemoryDescriptor &query_mem_desc, Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_group_by_buffers, const size_t projection_count, const int device_id)
 
void applyStreamingTopNOffsetCpu (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
 
void applyStreamingTopNOffsetGpu (Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit &ra_exe_unit, const unsigned total_thread_count, const int device_id)
 

Private Attributes

const int64_t num_rows_
 
std::shared_ptr< RowSetMemoryOwnerrow_set_mem_owner_
 
std::vector< std::unique_ptr< ResultSet > > result_sets_
 
std::vector< int64_t > init_agg_vals_
 
const size_t num_buffers_
 
std::vector< int64_t * > group_by_buffers_
 
CUdeviceptr count_distinct_bitmap_mem_
 
size_t count_distinct_bitmap_mem_bytes_
 
int8_t * count_distinct_bitmap_crt_ptr_
 
int8_t * count_distinct_bitmap_host_mem_
 
DeviceAllocatordevice_allocator_ {nullptr}
 

Friends

class Executor
 
class QueryExecutionContext
 

Detailed Description

Definition at line 35 of file QueryMemoryInitializer.h.

Constructor & Destructor Documentation

◆ QueryMemoryInitializer() [1/2]

QueryMemoryInitializer::QueryMemoryInitializer ( const RelAlgExecutionUnit ra_exe_unit,
const QueryMemoryDescriptor query_mem_desc,
const int  device_id,
const ExecutorDeviceType  device_type,
const ExecutorDispatchMode  dispatch_mode,
const bool  output_columnar,
const bool  sort_on_gpu,
const int64_t  num_rows,
const std::vector< std::vector< const int8_t *>> &  col_buffers,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
RenderAllocatorMap render_allocator_map,
RenderInfo render_info,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner,
DeviceAllocator gpu_allocator,
const Executor executor 
)

Definition at line 152 of file QueryMemoryInitializer.cpp.

References anonymous_namespace{QueryMemoryInitializer.cpp}::alloc_group_by_buffer(), allocateCountDistinctBuffers(), allocateCountDistinctGpuMem(), CHECK, CHECK_GE, anonymous_namespace{QueryMemoryInitializer.cpp}::check_total_bitmap_memory(), checked_malloc(), QueryMemoryDescriptor::didOutputColumnar(), RelAlgExecutionUnit::estimator, ResultSet::fixupQueryMemoryDescriptor(), g_max_memory_allocation_size, anonymous_namespace{QueryMemoryInitializer.cpp}::get_col_frag_offsets(), anonymous_namespace{QueryMemoryInitializer.cpp}::get_consistent_frags_sizes(), streaming_top_n::get_rows_offset_of_heaps(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), GPU, group_by_buffers_, QueryMemoryDescriptor::hasKeylessHash(), init_agg_vals_, initColumnarGroups(), initGroups(), QueryMemoryDescriptor::interleavedBins(), QueryMemoryDescriptor::isGroupBy(), KernelPerFragment, QueryMemoryDescriptor::lazyInitGroups(), SortInfo::limit, num_buffers_, SortInfo::offset, result_sets_, row_set_mem_owner_, RelAlgExecutionUnit::sort_info, RelAlgExecutionUnit::target_exprs, target_exprs_to_infos(), QueryMemoryDescriptor::threadsShareMemory(), RelAlgExecutionUnit::use_bump_allocator, use_streaming_top_n(), RenderInfo::useCudaBuffers(), and warp_size.

169  , row_set_mem_owner_(row_set_mem_owner)
170  , init_agg_vals_(executor->plan_state_->init_agg_vals_)
171  , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
176  , device_allocator_(device_allocator) {
177  CHECK(!sort_on_gpu || output_columnar);
178 
179  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
180  if (consistent_frag_sizes.empty()) {
181  // No fragments in the input, no underlying buffers will be needed.
182  return;
183  }
184  if (!ra_exe_unit.use_bump_allocator) {
185  check_total_bitmap_memory(query_mem_desc);
186  }
187  if (device_type == ExecutorDeviceType::GPU) {
188  allocateCountDistinctGpuMem(query_mem_desc);
189  }
190 
191  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
192  allocateCountDistinctBuffers(query_mem_desc, false, executor);
193  if (render_info && render_info->useCudaBuffers()) {
194  return;
195  }
196  }
197 
198  if (ra_exe_unit.estimator) {
199  return;
200  }
201 
202  const auto thread_count = device_type == ExecutorDeviceType::GPU
203  ? executor->blockSize() * executor->gridSize()
204  : 1;
205 
206  size_t group_buffer_size{0};
207  if (ra_exe_unit.use_bump_allocator) {
208  // For kernel per fragment execution, just allocate a buffer equivalent to the size of
209  // the fragment
210  if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
211  group_buffer_size = num_rows * query_mem_desc.getRowSize();
212  } else {
213  // otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size
214  group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
215  }
216  } else {
217  group_buffer_size =
218  query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
219  }
220  CHECK_GE(group_buffer_size, size_t(0));
221 
222  std::unique_ptr<int64_t, CheckedAllocDeleter> group_by_buffer_template;
223  if (!query_mem_desc.lazyInitGroups(device_type)) {
224  group_by_buffer_template.reset(
225  static_cast<int64_t*>(checked_malloc(group_buffer_size)));
226 
227  if (output_columnar) {
229  query_mem_desc, group_by_buffer_template.get(), init_agg_vals_, executor);
230  } else {
231  auto rows_ptr = group_by_buffer_template.get();
232  auto actual_entry_count = query_mem_desc.getEntryCount();
233  auto warp_size =
234  query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
235  if (use_streaming_top_n(ra_exe_unit, query_mem_desc.didOutputColumnar())) {
236  const auto node_count_size = thread_count * sizeof(int64_t);
237  memset(rows_ptr, 0, node_count_size);
238  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
239  const auto rows_offset =
241  memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
242  rows_ptr += rows_offset / sizeof(int64_t);
243  actual_entry_count = n * thread_count;
244  warp_size = 1;
245  }
246  initGroups(query_mem_desc,
247  rows_ptr,
249  actual_entry_count,
250  warp_size,
251  executor);
252  }
253  }
254 
255  if (query_mem_desc.interleavedBins(device_type)) {
256  CHECK(query_mem_desc.hasKeylessHash());
257  }
258 
259  const auto step = device_type == ExecutorDeviceType::GPU &&
260  query_mem_desc.threadsShareMemory() &&
261  query_mem_desc.isGroupBy()
262  ? executor->blockSize()
263  : size_t(1);
264  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
265  query_mem_desc.hasKeylessHash()
266  ? query_mem_desc.getEntryCount()
267  : size_t(0);
268  const auto actual_group_buffer_size =
269  group_buffer_size + index_buffer_qw * sizeof(int64_t);
270  CHECK_GE(actual_group_buffer_size, group_buffer_size);
271  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
272 
273  for (size_t i = 0; i < group_buffers_count; i += step) {
274  auto group_by_buffer =
275  alloc_group_by_buffer(actual_group_buffer_size, render_allocator_map);
276  if (!query_mem_desc.lazyInitGroups(device_type)) {
277  CHECK(group_by_buffer_template);
278  memcpy(group_by_buffer + index_buffer_qw,
279  group_by_buffer_template.get(),
280  group_buffer_size);
281  }
282  if (!render_allocator_map) {
283  row_set_mem_owner_->addGroupByBuffer(group_by_buffer);
284  }
285  group_by_buffers_.push_back(group_by_buffer);
286  for (size_t j = 1; j < step; ++j) {
287  group_by_buffers_.push_back(nullptr);
288  }
289  const auto column_frag_offsets =
290  get_col_frag_offsets(ra_exe_unit.target_exprs, frag_offsets);
291  const auto column_frag_sizes =
292  get_consistent_frags_sizes(ra_exe_unit.target_exprs, consistent_frag_sizes);
293  result_sets_.emplace_back(
294  new ResultSet(target_exprs_to_infos(ra_exe_unit.target_exprs, query_mem_desc),
295  executor->getColLazyFetchInfo(ra_exe_unit.target_exprs),
296  col_buffers,
297  column_frag_offsets,
298  column_frag_sizes,
299  device_type,
300  device_id,
303  executor));
304  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
305  executor->plan_state_->init_agg_vals_);
306  for (size_t j = 1; j < step; ++j) {
307  result_sets_.emplace_back(nullptr);
308  }
309  }
310 }
std::vector< Analyzer::Expr * > target_exprs
void initGroups(const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const int32_t groups_buffer_entry_count, const size_t warp_size, const Executor *executor)
bool use_streaming_top_n(const RelAlgExecutionUnit &ra_exe_unit, const bool output_columnar)
std::vector< ssize_t > allocateCountDistinctBuffers(const QueryMemoryDescriptor &query_mem_desc, const bool deferred, const Executor *executor)
const int8_t const int64_t * num_rows
int64_t * alloc_group_by_buffer(const size_t numBytes, RenderAllocatorMap *render_allocator_map)
const int64_t const uint32_t const uint32_t const uint32_t const bool const int8_t warp_size
DeviceAllocator * device_allocator_
size_t get_rows_offset_of_heaps(const size_t n, const size_t thread_count)
#define CHECK_GE(x, y)
Definition: Logger.h:206
void check_total_bitmap_memory(const QueryMemoryDescriptor &query_mem_desc)
size_t g_max_memory_allocation_size
Definition: Execute.cpp:95
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
bool useCudaBuffers() const
Definition: RenderInfo.cpp:66
const size_t limit
std::vector< std::vector< int64_t > > get_col_frag_offsets(const std::vector< Analyzer::Expr *> &target_exprs, const std::vector< std::vector< uint64_t >> &table_frag_offsets)
std::vector< int64_t > init_agg_vals_
const SortInfo sort_info
void * checked_malloc(const size_t size)
Definition: checked_alloc.h:40
bool interleavedBins(const ExecutorDeviceType) const
size_t getBufferSizeBytes(const RelAlgExecutionUnit &ra_exe_unit, const unsigned thread_count, const ExecutorDeviceType device_type) const
const std::shared_ptr< Analyzer::Estimator > estimator
std::vector< int64_t * > group_by_buffers_
void initColumnarGroups(const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const Executor *executor)
std::vector< int64_t > get_consistent_frags_sizes(const std::vector< Analyzer::Expr *> &target_exprs, const std::vector< int64_t > &table_frag_sizes)
static QueryMemoryDescriptor fixupQueryMemoryDescriptor(const QueryMemoryDescriptor &)
Definition: ResultSet.cpp:452
size_t computeNumberOfBuffers(const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const Executor *executor) const
std::vector< TargetInfo > target_exprs_to_infos(const std::vector< Analyzer::Expr *> &targets, const QueryMemoryDescriptor &query_mem_desc)
#define CHECK(condition)
Definition: Logger.h:193
const size_t offset
std::vector< std::unique_ptr< ResultSet > > result_sets_
bool lazyInitGroups(const ExecutorDeviceType) const
void allocateCountDistinctGpuMem(const QueryMemoryDescriptor &query_mem_desc)
void sort_on_gpu(int64_t *val_buff, int32_t *key_buff, const uint64_t entry_count, const bool desc, const uint32_t chosen_bytes, ThrustAllocator &alloc)
+ Here is the call graph for this function:

◆ QueryMemoryInitializer() [2/2]

QueryMemoryInitializer::QueryMemoryInitializer ( const TableFunctionExecutionUnit exe_unit,
const QueryMemoryDescriptor query_mem_desc,
const int  device_id,
const ExecutorDeviceType  device_type,
const int64_t  num_rows,
const std::vector< std::vector< const int8_t *>> &  col_buffers,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner,
DeviceAllocator device_allocator,
const Executor executor 
)

Definition at line 312 of file QueryMemoryInitializer.cpp.

References anonymous_namespace{QueryMemoryInitializer.cpp}::alloc_group_by_buffer(), CHECK_EQ, CHECK_GE, checked_malloc(), count_distinct_bitmap_crt_ptr_, count_distinct_bitmap_host_mem_, count_distinct_bitmap_mem_, count_distinct_bitmap_mem_bytes_, device_allocator_, ResultSet::fixupQueryMemoryDescriptor(), anonymous_namespace{QueryMemoryInitializer.cpp}::get_col_frag_offsets(), anonymous_namespace{QueryMemoryInitializer.cpp}::get_consistent_frags_sizes(), GPU, group_by_buffers_, init_agg_vals_, initColumnarGroups(), num_buffers_, num_rows_, result_sets_, row_set_mem_owner_, and target_exprs_to_infos().

324  , row_set_mem_owner_(row_set_mem_owner)
325  , init_agg_vals_(init_agg_val_vec(exe_unit.target_exprs, {}, query_mem_desc))
326  , num_buffers_(/*computeNumberOfBuffers(query_mem_desc, device_type, executor)*/ 1)
331  , device_allocator_(device_allocator) {
332  // Table functions output columnar, basically treat this as a projection
333 
334  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
335  if (consistent_frag_sizes.empty()) {
336  // No fragments in the input, no underlying buffers will be needed.
337  return;
338  }
339 
340  size_t group_buffer_size{0};
341  // TODO(adb): this is going to give us an index buffer and then the target buffers. this
342  // might not be desirable -- revisit
343  group_buffer_size = query_mem_desc.getBufferSizeBytes(device_type, num_rows_);
344  CHECK_GE(group_buffer_size, size_t(0));
345 
346  std::unique_ptr<int64_t, CheckedAllocDeleter> group_by_buffer_template;
347  if (!query_mem_desc.lazyInitGroups(device_type)) {
348  group_by_buffer_template.reset(
349  static_cast<int64_t*>(checked_malloc(group_buffer_size)));
351  query_mem_desc, group_by_buffer_template.get(), init_agg_vals_, executor);
352  }
353 
354  const auto index_buffer_qw =
355  device_type == ExecutorDeviceType::GPU && query_mem_desc.hasKeylessHash()
356  ? query_mem_desc.getEntryCount()
357  : size_t(0);
358  const auto actual_group_buffer_size =
359  group_buffer_size + index_buffer_qw * sizeof(int64_t);
360  CHECK_GE(actual_group_buffer_size, group_buffer_size);
361 
362  CHECK_EQ(num_buffers_, size_t(1));
363  auto group_by_buffer = alloc_group_by_buffer(actual_group_buffer_size, nullptr);
364  if (!query_mem_desc.lazyInitGroups(device_type)) {
365  memcpy(group_by_buffer + index_buffer_qw,
366  group_by_buffer_template.get(),
367  group_buffer_size);
368  }
369  group_by_buffers_.push_back(group_by_buffer);
370  row_set_mem_owner_->addGroupByBuffer(group_by_buffer);
371 
372  const auto column_frag_offsets =
373  get_col_frag_offsets(exe_unit.target_exprs, frag_offsets);
374  const auto column_frag_sizes =
375  get_consistent_frags_sizes(exe_unit.target_exprs, consistent_frag_sizes);
376  result_sets_.emplace_back(
377  new ResultSet(target_exprs_to_infos(exe_unit.target_exprs, query_mem_desc),
378  {},
379  col_buffers,
380  column_frag_offsets,
381  column_frag_sizes,
382  device_type,
383  device_id,
386  executor));
387  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
389 }
#define CHECK_EQ(x, y)
Definition: Logger.h:201
const int8_t const int64_t * num_rows
int64_t * alloc_group_by_buffer(const size_t numBytes, RenderAllocatorMap *render_allocator_map)
DeviceAllocator * device_allocator_
std::vector< int64_t > init_agg_val_vec(const std::vector< TargetInfo > &targets, const QueryMemoryDescriptor &query_mem_desc)
#define CHECK_GE(x, y)
Definition: Logger.h:206
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
std::vector< std::vector< int64_t > > get_col_frag_offsets(const std::vector< Analyzer::Expr *> &target_exprs, const std::vector< std::vector< uint64_t >> &table_frag_offsets)
std::vector< int64_t > init_agg_vals_
void * checked_malloc(const size_t size)
Definition: checked_alloc.h:40
size_t getBufferSizeBytes(const RelAlgExecutionUnit &ra_exe_unit, const unsigned thread_count, const ExecutorDeviceType device_type) const
std::vector< int64_t * > group_by_buffers_
void initColumnarGroups(const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const Executor *executor)
std::vector< int64_t > get_consistent_frags_sizes(const std::vector< Analyzer::Expr *> &target_exprs, const std::vector< int64_t > &table_frag_sizes)
static QueryMemoryDescriptor fixupQueryMemoryDescriptor(const QueryMemoryDescriptor &)
Definition: ResultSet.cpp:452
std::vector< TargetInfo > target_exprs_to_infos(const std::vector< Analyzer::Expr *> &targets, const QueryMemoryDescriptor &query_mem_desc)
std::vector< Analyzer::Expr * > target_exprs
std::vector< std::unique_ptr< ResultSet > > result_sets_
bool lazyInitGroups(const ExecutorDeviceType) const
+ Here is the call graph for this function:

Member Function Documentation

◆ allocateCountDistinctBitmap()

int64_t QueryMemoryInitializer::allocateCountDistinctBitmap ( const size_t  bitmap_byte_sz)
private

Definition at line 629 of file QueryMemoryInitializer.cpp.

References CHECK, checked_calloc(), count_distinct_bitmap_crt_ptr_, count_distinct_bitmap_host_mem_, and row_set_mem_owner_.

Referenced by allocateCountDistinctBuffers(), getNumBuffers(), and initColumnPerRow().

629  {
633  count_distinct_bitmap_crt_ptr_ += bitmap_byte_sz;
634  row_set_mem_owner_->addCountDistinctBuffer(ptr, bitmap_byte_sz, false);
635  return reinterpret_cast<int64_t>(ptr);
636  }
637  auto count_distinct_buffer = static_cast<int8_t*>(checked_calloc(bitmap_byte_sz, 1));
638  row_set_mem_owner_->addCountDistinctBuffer(count_distinct_buffer, bitmap_byte_sz, true);
639  return reinterpret_cast<int64_t>(count_distinct_buffer);
640 }
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
void * checked_calloc(const size_t nmemb, const size_t size)
Definition: checked_alloc.h:48
#define CHECK(condition)
Definition: Logger.h:193
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ allocateCountDistinctBuffers()

std::vector< ssize_t > QueryMemoryInitializer::allocateCountDistinctBuffers ( const QueryMemoryDescriptor query_mem_desc,
const bool  deferred,
const Executor executor 
)
private

Definition at line 583 of file QueryMemoryInitializer.cpp.

References agg_col_count, allocateCountDistinctBitmap(), allocateCountDistinctSet(), Bitmap, CHECK, CHECK_EQ, CHECK_GE, CHECK_LT, g_bigint_count, get_target_info(), QueryMemoryDescriptor::getCountDistinctDescriptor(), QueryMemoryDescriptor::getLogicalSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), init_agg_vals_, Invalid, is_distinct_target(), kAPPROX_COUNT_DISTINCT, kCOUNT, and StdSet.

Referenced by getNumBuffers(), initGroups(), and QueryMemoryInitializer().

586  {
587  const size_t agg_col_count{query_mem_desc.getSlotCount()};
588  std::vector<ssize_t> agg_bitmap_size(deferred ? agg_col_count : 0);
589 
590  CHECK_GE(agg_col_count, executor->plan_state_->target_exprs_.size());
591  for (size_t target_idx = 0; target_idx < executor->plan_state_->target_exprs_.size();
592  ++target_idx) {
593  const auto target_expr = executor->plan_state_->target_exprs_[target_idx];
594  const auto agg_info = get_target_info(target_expr, g_bigint_count);
595  if (is_distinct_target(agg_info)) {
596  CHECK(agg_info.is_agg &&
597  (agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT));
598  CHECK(!agg_info.sql_type.is_varlen());
599 
600  const auto agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
601  CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);
602 
603  CHECK_EQ(static_cast<size_t>(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)),
604  sizeof(int64_t));
605  const auto& count_distinct_desc =
606  query_mem_desc.getCountDistinctDescriptor(target_idx);
607  CHECK(count_distinct_desc.impl_type_ != CountDistinctImplType::Invalid);
608  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
609  const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
610  if (deferred) {
611  agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
612  } else {
613  init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
614  }
615  } else {
616  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::StdSet);
617  if (deferred) {
618  agg_bitmap_size[agg_col_idx] = -1;
619  } else {
620  init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
621  }
622  }
623  }
624  }
625 
626  return agg_bitmap_size;
627 }
#define CHECK_EQ(x, y)
Definition: Logger.h:201
const int64_t const uint32_t const uint32_t const uint32_t agg_col_count
const int8_t getSlotIndexForSingleSlotCol(const size_t col_idx) const
TargetInfo get_target_info(const PointerType target_expr, const bool bigint_count)
Definition: TargetInfo.h:66
#define CHECK_GE(x, y)
Definition: Logger.h:206
std::vector< int64_t > init_agg_vals_
bool g_bigint_count
bool is_distinct_target(const TargetInfo &target_info)
Definition: TargetInfo.h:117
#define CHECK_LT(x, y)
Definition: Logger.h:203
Definition: sqldefs.h:76
#define CHECK(condition)
Definition: Logger.h:193
const int8_t getLogicalSlotWidthBytes(const size_t slot_idx) const
const CountDistinctDescriptor & getCountDistinctDescriptor(const size_t idx) const
int64_t allocateCountDistinctBitmap(const size_t bitmap_byte_sz)
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ allocateCountDistinctGpuMem()

void QueryMemoryInitializer::allocateCountDistinctGpuMem ( const QueryMemoryDescriptor query_mem_desc)
private

Definition at line 549 of file QueryMemoryInitializer.cpp.

References Allocator::alloc(), Bitmap, CHECK, checked_malloc(), count_distinct_bitmap_crt_ptr_, count_distinct_bitmap_host_mem_, count_distinct_bitmap_mem_, count_distinct_bitmap_mem_bytes_, QueryMemoryDescriptor::countDistinctDescriptorsLogicallyEmpty(), device_allocator_, QueryMemoryDescriptor::getCountDistinctDescriptor(), QueryMemoryDescriptor::getCountDistinctDescriptorsSize(), QueryMemoryDescriptor::getEntryCount(), Invalid, row_set_mem_owner_, and DeviceAllocator::zeroDeviceMem().

Referenced by getNumBuffers(), and QueryMemoryInitializer().

550  {
551  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
552  return;
553  }
555 
556  size_t total_bytes_per_entry{0};
557  const size_t num_count_distinct_descs =
558  query_mem_desc.getCountDistinctDescriptorsSize();
559  for (size_t i = 0; i < num_count_distinct_descs; i++) {
560  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
561  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
562  continue;
563  }
564  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
565  total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
566  }
567 
569  total_bytes_per_entry * query_mem_desc.getEntryCount();
570  count_distinct_bitmap_mem_ = reinterpret_cast<CUdeviceptr>(
572  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(count_distinct_bitmap_mem_),
574 
576  static_cast<int8_t*>(checked_malloc(count_distinct_bitmap_mem_bytes_));
577  row_set_mem_owner_->addCountDistinctBuffer(
579 }
DeviceAllocator * device_allocator_
unsigned long long CUdeviceptr
Definition: nocuda.h:27
size_t getCountDistinctDescriptorsSize() const
virtual int8_t * alloc(const size_t num_bytes)=0
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
void * checked_malloc(const size_t size)
Definition: checked_alloc.h:40
virtual void zeroDeviceMem(int8_t *device_ptr, const size_t num_bytes) const =0
bool countDistinctDescriptorsLogicallyEmpty() const
#define CHECK(condition)
Definition: Logger.h:193
const CountDistinctDescriptor & getCountDistinctDescriptor(const size_t idx) const
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ allocateCountDistinctSet()

int64_t QueryMemoryInitializer::allocateCountDistinctSet ( )
private

Definition at line 642 of file QueryMemoryInitializer.cpp.

References Allocator::alloc(), CHECK, CHECK_EQ, DeviceAllocator::copyToDevice(), create_dev_group_by_buffers(), device_allocator_, QueryMemoryDescriptor::didOutputColumnar(), streaming_top_n::get_heap_size(), streaming_top_n::get_rows_offset_of_heaps(), RenderAllocator::getAllocatedSize(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEffectiveKeyWidth(), getGroupByBuffersSize(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getRowSize(), QueryMemoryDescriptor::getSlotCount(), GPU, group_by_buffers_, groups_buffer_size, QueryMemoryDescriptor::hasKeylessHash(), init_columnar_group_by_buffer_on_device(), init_group_by_buffer_on_device(), QueryMemoryDescriptor::interleavedBins(), QueryMemoryDescriptor::lazyInitGroups(), SortInfo::limit, MultifragmentKernel, num_rows_, SortInfo::offset, row_set_mem_owner_, DeviceAllocator::setDeviceMem(), RelAlgExecutionUnit::sort_info, QueryMemoryDescriptor::threadsShareMemory(), RelAlgExecutionUnit::use_bump_allocator, use_streaming_top_n(), warp_size, and DeviceAllocator::zeroDeviceMem().

Referenced by allocateCountDistinctBuffers(), getNumBuffers(), and initColumnPerRow().

642  {
643  auto count_distinct_set = new std::set<int64_t>();
644  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
645  return reinterpret_cast<int64_t>(count_distinct_set);
646 }
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ applyStreamingTopNOffsetCpu()

void QueryMemoryInitializer::applyStreamingTopNOffsetCpu ( const QueryMemoryDescriptor query_mem_desc,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 930 of file QueryMemoryInitializer.cpp.

References CHECK_EQ, CPU, streaming_top_n::get_rows_copy_from_heaps(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), group_by_buffers_, SortInfo::limit, SortInfo::offset, and RelAlgExecutionUnit::sort_info.

Referenced by getNumBuffers().

932  {
933  CHECK_EQ(group_by_buffers_.size(), size_t(1));
934 
935  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
937  query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
938  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit,
939  1);
940  CHECK_EQ(rows_copy.size(),
941  query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
942  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
943 }
#define CHECK_EQ(x, y)
Definition: Logger.h:201
const size_t limit
const SortInfo sort_info
size_t getBufferSizeBytes(const RelAlgExecutionUnit &ra_exe_unit, const unsigned thread_count, const ExecutorDeviceType device_type) const
std::vector< int64_t * > group_by_buffers_
std::vector< int8_t > get_rows_copy_from_heaps(const int64_t *heaps, const size_t heaps_size, const size_t n, const size_t thread_count)
const size_t offset
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ applyStreamingTopNOffsetGpu()

void QueryMemoryInitializer::applyStreamingTopNOffsetGpu ( Data_Namespace::DataMgr data_mgr,
const QueryMemoryDescriptor query_mem_desc,
const GpuGroupByBuffers gpu_group_by_buffers,
const RelAlgExecutionUnit ra_exe_unit,
const unsigned  total_thread_count,
const int  device_id 
)
private

Definition at line 945 of file QueryMemoryInitializer.cpp.

References CHECK_EQ, QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), group_by_buffers_, num_buffers_, GpuGroupByBuffers::second, and UNREACHABLE.

Referenced by getNumBuffers().

951  {
952 #ifdef HAVE_CUDA
954 
955  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
956  data_mgr,
957  reinterpret_cast<int64_t*>(gpu_group_by_buffers.second),
958  ra_exe_unit,
959  query_mem_desc,
960  total_thread_count,
961  device_id);
962  CHECK_EQ(
963  rows_copy.size(),
964  static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
965  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
966 #else
967  UNREACHABLE();
968 #endif
969 }
#define CHECK_EQ(x, y)
Definition: Logger.h:201
#define UNREACHABLE()
Definition: Logger.h:237
CUdeviceptr second
Definition: GpuMemUtils.h:61
std::vector< int64_t * > group_by_buffers_
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ compactProjectionBuffersCpu()

void QueryMemoryInitializer::compactProjectionBuffersCpu ( const QueryMemoryDescriptor query_mem_desc,
const size_t  projection_count 
)
private

Definition at line 856 of file QueryMemoryInitializer.cpp.

References CHECK, anonymous_namespace{QueryMemoryInitializer.cpp}::compact_projection_buffer_for_cpu_columnar(), QueryMemoryDescriptor::getEntryCount(), group_by_buffers_, and result_sets_.

Referenced by getNumBuffers().

858  {
859  const auto num_allocated_rows =
860  std::min(projection_count, query_mem_desc.getEntryCount());
861 
862  // copy the results from the main buffer into projection_buffer
863  compact_projection_buffer_for_cpu_columnar(
864  query_mem_desc,
865  reinterpret_cast<int8_t*>(group_by_buffers_[0]),
866  num_allocated_rows);
867 
868  // update the entry count for the result set, and its underlying storage
869  CHECK(!result_sets_.empty());
870  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
871 }
void compact_projection_buffer_for_cpu_columnar(const QueryMemoryDescriptor &query_mem_desc, int8_t *projection_buffer, const size_t projection_count)
std::vector< int64_t * > group_by_buffers_
#define CHECK(condition)
Definition: Logger.h:193
std::vector< std::unique_ptr< ResultSet > > result_sets_
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ compactProjectionBuffersGpu()

void QueryMemoryInitializer::compactProjectionBuffersGpu ( const QueryMemoryDescriptor query_mem_desc,
Data_Namespace::DataMgr data_mgr,
const GpuGroupByBuffers gpu_group_by_buffers,
const size_t  projection_count,
const int  device_id 
)
private

Definition at line 873 of file QueryMemoryInitializer.cpp.

References CHECK, copy_projection_buffer_from_gpu_columnar(), QueryMemoryDescriptor::getEntryCount(), group_by_buffers_, and result_sets_.

Referenced by getNumBuffers().

878  {
879  // store total number of allocated rows:
880  const auto num_allocated_rows =
881  std::min(projection_count, query_mem_desc.getEntryCount());
882 
883  // copy the results from the main buffer into projection_buffer
884  copy_projection_buffer_from_gpu_columnar(
885  data_mgr,
886  gpu_group_by_buffers,
887  query_mem_desc,
888  reinterpret_cast<int8_t*>(group_by_buffers_[0]),
889  num_allocated_rows,
890  device_id);
891 
892  // update the entry count for the result set, and its underlying storage
893  CHECK(!result_sets_.empty());
894  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
895 }
std::vector< int64_t * > group_by_buffers_
#define CHECK(condition)
Definition: Logger.h:193
void copy_projection_buffer_from_gpu_columnar(Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_group_by_buffers, const QueryMemoryDescriptor &query_mem_desc, int8_t *projection_buffer, const size_t projection_count, const int device_id)
std::vector< std::unique_ptr< ResultSet > > result_sets_
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ computeNumberOfBuffers()

size_t QueryMemoryInitializer::computeNumberOfBuffers ( const QueryMemoryDescriptor query_mem_desc,
const ExecutorDeviceType  device_type,
const Executor executor 
) const
private

Definition at line 812 of file QueryMemoryInitializer.cpp.

References QueryMemoryDescriptor::blocksShareMemory(), and CPU.

Referenced by getNumBuffers().

815  {
816  return device_type == ExecutorDeviceType::CPU
817  ? 1
818  : executor->blockSize() *
819  (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
820 }
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ copyGroupByBuffersFromGpu()

void QueryMemoryInitializer::copyGroupByBuffersFromGpu ( Data_Namespace::DataMgr data_mgr,
const QueryMemoryDescriptor query_mem_desc,
const size_t  entry_count,
const GpuGroupByBuffers gpu_group_by_buffers,
const RelAlgExecutionUnit ra_exe_unit,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const int  device_id,
const bool  prepend_index_buffer 
) const

Definition at line 897 of file QueryMemoryInitializer.cpp.

References copy_group_by_buffers_from_gpu(), QueryMemoryDescriptor::didOutputColumnar(), streaming_top_n::get_heap_size(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getRowSize(), GPU, group_by_buffers_, SortInfo::limit, SortInfo::offset, GpuGroupByBuffers::second, RelAlgExecutionUnit::sort_info, and use_streaming_top_n().

Referenced by getNumBuffers().

906  {
907  const auto thread_count = block_size_x * grid_size_x;
908 
909  size_t total_buff_size{0};
910  if (ra_exe_unit &&
911  use_streaming_top_n(*ra_exe_unit, query_mem_desc.didOutputColumnar())) {
912  const size_t n = ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit;
913  total_buff_size =
914  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
915  } else {
916  total_buff_size =
917  query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
918  }
919  copy_group_by_buffers_from_gpu(data_mgr,
920  group_by_buffers_,
921  total_buff_size,
922  gpu_group_by_buffers.second,
923  query_mem_desc,
924  block_size_x,
925  grid_size_x,
926  device_id,
927  prepend_index_buffer);
928 }
bool use_streaming_top_n(const RelAlgExecutionUnit &ra_exe_unit, const bool output_columnar)
const size_t limit
void copy_group_by_buffers_from_gpu(Data_Namespace::DataMgr *data_mgr, const std::vector< int64_t *> &group_by_buffers, const size_t groups_buffer_size, const CUdeviceptr group_by_dev_buffers_mem, const QueryMemoryDescriptor &query_mem_desc, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer)
CUdeviceptr second
Definition: GpuMemUtils.h:61
const SortInfo sort_info
size_t getBufferSizeBytes(const RelAlgExecutionUnit &ra_exe_unit, const unsigned thread_count, const ExecutorDeviceType device_type) const
std::vector< int64_t * > group_by_buffers_
size_t get_heap_size(const size_t row_size, const size_t n, const size_t thread_count)
const size_t offset
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ getAggInitValForIndex()

int64_t QueryMemoryInitializer::getAggInitValForIndex ( const size_t  index) const
inline

Definition at line 89 of file QueryMemoryInitializer.h.

References CHECK_LT, and init_agg_vals_.

89  {
90  CHECK_LT(index, init_agg_vals_.size());
91  return init_agg_vals_[index];
92  }
std::vector< int64_t > init_agg_vals_
#define CHECK_LT(x, y)
Definition: Logger.h:203

◆ getCountDistinctBitmapBytes()

const auto QueryMemoryInitializer::getCountDistinctBitmapBytes ( ) const
inline

Definition at line 70 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_mem_bytes_.

70  {
71  return count_distinct_bitmap_mem_bytes_;
72  }

◆ getCountDistinctBitmapPtr()

const auto QueryMemoryInitializer::getCountDistinctBitmapPtr ( ) const
inline

Definition at line 66 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_mem_.

◆ getCountDistinctHostPtr()

const auto QueryMemoryInitializer::getCountDistinctHostPtr ( ) const
inline

◆ getGroupByBuffersPtr()

const auto QueryMemoryInitializer::getGroupByBuffersPtr ( )
inline

Definition at line 94 of file QueryMemoryInitializer.h.

References group_by_buffers_.

94  {
95  return reinterpret_cast<int64_t**>(group_by_buffers_.data());
96  }
std::vector< int64_t * > group_by_buffers_

◆ getGroupByBuffersSize()

const auto QueryMemoryInitializer::getGroupByBuffersSize ( ) const
inline

Definition at line 98 of file QueryMemoryInitializer.h.

References group_by_buffers_.

Referenced by allocateCountDistinctSet().

98 { return group_by_buffers_.size(); }
std::vector< int64_t * > group_by_buffers_
+ Here is the caller graph for this function:

◆ getNumBuffers()

const auto QueryMemoryInitializer::getNumBuffers ( ) const
inline

◆ getResultSet()

ResultSet* QueryMemoryInitializer::getResultSet ( const size_t  index) const
inline

Definition at line 74 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

74  {
75  CHECK_LT(index, result_sets_.size());
76  return result_sets_[index].get();
77  }
#define CHECK_LT(x, y)
Definition: Logger.h:203
std::vector< std::unique_ptr< ResultSet > > result_sets_

◆ getResultSetOwned()

std::unique_ptr<ResultSet> QueryMemoryInitializer::getResultSetOwned ( const size_t  index)
inline

Definition at line 79 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

79  {
80  CHECK_LT(index, result_sets_.size());
81  return std::move(result_sets_[index]);
82  }
#define CHECK_LT(x, y)
Definition: Logger.h:203
std::vector< std::unique_ptr< ResultSet > > result_sets_

◆ initColumnarGroups()

void QueryMemoryInitializer::initColumnarGroups ( const QueryMemoryDescriptor query_mem_desc,
int64_t *  groups_buffer,
const std::vector< int64_t > &  init_vals,
const Executor executor 
)
private

Definition at line 447 of file QueryMemoryInitializer.cpp.

References agg_col_count, align_to_int64(), CHECK, CHECK_LT, EMPTY_KEY_64, g_bigint_count, get_target_info(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), groups_buffer_entry_count, QueryMemoryDescriptor::hasKeylessHash(), and is_distinct_target().

Referenced by getNumBuffers(), and QueryMemoryInitializer().

451  {
452  CHECK(groups_buffer);
453  for (const auto target_expr : executor->plan_state_->target_exprs_) {
454  const auto agg_info = get_target_info(target_expr, g_bigint_count);
455  CHECK(!is_distinct_target(agg_info));
456  }
457  const int32_t agg_col_count = query_mem_desc.getSlotCount();
458  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
459 
460  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
461  if (!query_mem_desc.hasKeylessHash()) {
462  const size_t key_count{query_mem_desc.getGroupbyColCount()};
463  for (size_t i = 0; i < key_count; ++i) {
464  buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
465  EMPTY_KEY_64,
466  groups_buffer_entry_count);
467  }
468  }
469  // initializing all aggregate columns:
470  int32_t init_val_idx = 0;
471  for (int32_t i = 0; i < agg_col_count; ++i) {
472  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
473  CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
474  switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
475  case 1:
476  buffer_ptr = initColumnarBuffer<int8_t>(
477  buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
478  break;
479  case 2:
480  buffer_ptr = initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
480  init_vals[init_val_idx++],
481  groups_buffer_entry_count);
482  break;
483  break;
484  case 4:
485  buffer_ptr = initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
486  init_vals[init_val_idx++],
487  groups_buffer_entry_count);
488  break;
489  case 8:
490  buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
491  init_vals[init_val_idx++],
492  groups_buffer_entry_count);
493  break;
494  case 0:
495  break;
496  default:
497  CHECK(false);
498  }
499 
500  buffer_ptr = align_to_int64(buffer_ptr);
501  }
502  }
503 }
const int64_t const uint32_t const uint32_t const uint32_t agg_col_count
#define EMPTY_KEY_64
TargetInfo get_target_info(const PointerType target_expr, const bool bigint_count)
Definition: TargetInfo.h:66
const int64_t const uint32_t groups_buffer_entry_count
const int8_t getPaddedSlotWidthBytes(const size_t slot_idx) const
bool g_bigint_count
bool is_distinct_target(const TargetInfo &target_info)
Definition: TargetInfo.h:117
#define CHECK_LT(x, y)
Definition: Logger.h:203
#define CHECK(condition)
Definition: Logger.h:193
const int64_t * init_vals
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ initColumnPerRow()

void QueryMemoryInitializer::initColumnPerRow ( const QueryMemoryDescriptor query_mem_desc,
int8_t *  row_ptr,
const size_t  bin,
const std::vector< int64_t > &  init_vals,
const std::vector< ssize_t > &  bitmap_sizes 
)
private

Definition at line 505 of file QueryMemoryInitializer.cpp.

References allocateCountDistinctBitmap(), allocateCountDistinctSet(), CHECK, CHECK_EQ, CHECK_LT, QueryMemoryDescriptor::getNextColOffInBytes(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), and QueryMemoryDescriptor::isGroupBy().

Referenced by getNumBuffers(), and initGroups().

509  {
510  int8_t* col_ptr = row_ptr;
511  size_t init_vec_idx = 0;
512  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
513  col_ptr += query_mem_desc.getNextColOffInBytes(col_ptr, bin, col_idx++)) {
514  const ssize_t bm_sz{bitmap_sizes[col_idx]};
515  int64_t init_val{0};
516  if (!bm_sz || !query_mem_desc.isGroupBy()) {
517  if (query_mem_desc.getPaddedSlotWidthBytes(col_idx) > 0) {
518  CHECK_LT(init_vec_idx, init_vals.size());
519  init_val = init_vals[init_vec_idx++];
520  }
521  } else {
522  CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
523  sizeof(int64_t));
524  init_val =
525  bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
526  ++init_vec_idx;
527  }
528  switch (query_mem_desc.getPaddedSlotWidthBytes(col_idx)) {
529  case 1:
530  *col_ptr = static_cast<int8_t>(init_val);
531  break;
532  case 2:
533  *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
534  break;
535  case 4:
536  *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
537  break;
538  case 8:
539  *reinterpret_cast<int64_t*>(col_ptr) = init_val;
540  break;
541  case 0:
542  continue;
543  default:
544  CHECK(false);
545  }
546  }
547 }
#define CHECK_EQ(x, y)
Definition: Logger.h:201
const int8_t getPaddedSlotWidthBytes(const size_t slot_idx) const
size_t getNextColOffInBytes(const int8_t *col_ptr, const size_t bin, const size_t col_idx) const
#define CHECK_LT(x, y)
Definition: Logger.h:203
#define CHECK(condition)
Definition: Logger.h:193
const int64_t * init_vals
int64_t allocateCountDistinctBitmap(const size_t bitmap_byte_sz)
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ initGroups()

void QueryMemoryInitializer::initGroups ( const QueryMemoryDescriptor query_mem_desc,
int64_t *  groups_buffer,
const std::vector< int64_t > &  init_vals,
const int32_t  groups_buffer_entry_count,
const size_t  warp_size,
const Executor executor 
)
private

Definition at line 391 of file QueryMemoryInitializer.cpp.

References allocateCountDistinctBuffers(), CHECK, fill_empty_key(), ResultSet::fixupQueryMemoryDescriptor(), QueryMemoryDescriptor::getColOffInBytes(), QueryMemoryDescriptor::getEffectiveKeyWidth(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getRowSize(), groups_buffer_entry_count, QueryMemoryDescriptor::hasKeylessHash(), initColumnPerRow(), and warp_size.

Referenced by getNumBuffers(), and QueryMemoryInitializer().

396  {
397  const size_t key_count{query_mem_desc.getGroupbyColCount()};
398  const size_t row_size{query_mem_desc.getRowSize()};
399  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};
400 
401  auto agg_bitmap_size = allocateCountDistinctBuffers(query_mem_desc, true, executor);
402  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
403 
404  const auto query_mem_desc_fixedup =
405  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);
406 
407  if (query_mem_desc.hasKeylessHash()) {
408  CHECK(warp_size >= 1);
409  CHECK(key_count == 1);
410  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
411  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
412  ++bin, buffer_ptr += row_size) {
413  initColumnPerRow(query_mem_desc_fixedup,
414  &buffer_ptr[col_base_off],
415  bin,
416  init_vals,
417  agg_bitmap_size);
418  }
419  }
420  return;
421  }
422 
423  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
424  ++bin, buffer_ptr += row_size) {
425  fill_empty_key(buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
426  initColumnPerRow(query_mem_desc_fixedup,
427  &buffer_ptr[col_base_off],
428  bin,
429  init_vals,
430  agg_bitmap_size);
431  }
432 }
std::vector< ssize_t > allocateCountDistinctBuffers(const QueryMemoryDescriptor &query_mem_desc, const bool deferred, const Executor *executor)
const int64_t const uint32_t const uint32_t const uint32_t const bool const int8_t warp_size
void fill_empty_key(void *key_ptr, const size_t key_count, const size_t key_width)
const int64_t const uint32_t groups_buffer_entry_count
static QueryMemoryDescriptor fixupQueryMemoryDescriptor(const QueryMemoryDescriptor &)
Definition: ResultSet.cpp:452
#define CHECK(condition)
Definition: Logger.h:193
size_t getColOffInBytes(const size_t col_idx) const
const int64_t * init_vals
size_t getEffectiveKeyWidth() const
void initColumnPerRow(const QueryMemoryDescriptor &query_mem_desc, int8_t *row_ptr, const size_t bin, const std::vector< int64_t > &init_vals, const std::vector< ssize_t > &bitmap_sizes)
+ Here is the call graph for this function:
+ Here is the caller graph for this function:

◆ resetResultSet()

void QueryMemoryInitializer::resetResultSet ( const size_t  index)
inline

Definition at line 84 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

84  {
85  CHECK_LT(index, result_sets_.size());
86  result_sets_[index].reset();
87  }
#define CHECK_LT(x, y)
Definition: Logger.h:203
std::vector< std::unique_ptr< ResultSet > > result_sets_

Friends And Related Function Documentation

◆ Executor

friend class Executor
friend

Definition at line 214 of file QueryMemoryInitializer.h.

Referenced by getNumBuffers().

◆ QueryExecutionContext

friend class QueryExecutionContext
friend

Definition at line 215 of file QueryMemoryInitializer.h.

Member Data Documentation

◆ count_distinct_bitmap_crt_ptr_

int8_t* QueryMemoryInitializer::count_distinct_bitmap_crt_ptr_
private

◆ count_distinct_bitmap_host_mem_

int8_t* QueryMemoryInitializer::count_distinct_bitmap_host_mem_
private

◆ count_distinct_bitmap_mem_

CUdeviceptr QueryMemoryInitializer::count_distinct_bitmap_mem_
private

◆ count_distinct_bitmap_mem_bytes_

size_t QueryMemoryInitializer::count_distinct_bitmap_mem_bytes_
private

◆ device_allocator_

DeviceAllocator* QueryMemoryInitializer::device_allocator_ {nullptr}
private

◆ group_by_buffers_

◆ init_agg_vals_

std::vector<int64_t> QueryMemoryInitializer::init_agg_vals_
private

◆ num_buffers_

const size_t QueryMemoryInitializer::num_buffers_
private

◆ num_rows_

const int64_t QueryMemoryInitializer::num_rows_
private

Definition at line 197 of file QueryMemoryInitializer.h.

Referenced by allocateCountDistinctSet(), and QueryMemoryInitializer().

◆ result_sets_

std::vector<std::unique_ptr<ResultSet> > QueryMemoryInitializer::result_sets_
private

◆ row_set_mem_owner_

std::shared_ptr<RowSetMemoryOwner> QueryMemoryInitializer::row_set_mem_owner_
private

The documentation for this class was generated from the following files: