OmniSciDB  3a86f6ec37
QueryMemoryInitializer Class Reference

#include <QueryMemoryInitializer.h>


Public Member Functions

 QueryMemoryInitializer (const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int device_id, const ExecutorDeviceType device_type, const ExecutorDispatchMode dispatch_mode, const bool output_columnar, const bool sort_on_gpu, const int64_t num_rows, const std::vector< std::vector< const int8_t *>> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, RenderAllocatorMap *render_allocator_map, RenderInfo *render_info, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, DeviceAllocator *gpu_allocator, const Executor *executor)
 
 QueryMemoryInitializer (const TableFunctionExecutionUnit &exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int device_id, const ExecutorDeviceType device_type, const int64_t num_rows, const std::vector< std::vector< const int8_t *>> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, DeviceAllocator *device_allocator, const Executor *executor)
 
const auto getCountDistinctBitmapPtr () const
 
const auto getCountDistinctHostPtr () const
 
const auto getCountDistinctBitmapBytes () const
 
ResultSet * getResultSet (const size_t index) const
 
std::unique_ptr< ResultSet > getResultSetOwned (const size_t index)
 
void resetResultSet (const size_t index)
 
int64_t getAggInitValForIndex (const size_t index) const
 
const auto getGroupByBuffersPtr ()
 
const auto getGroupByBuffersSize () const
 
const auto getNumBuffers () const
 
void copyGroupByBuffersFromGpu (Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const size_t entry_count, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit *ra_exe_unit, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer) const
 

Private Member Functions

void initGroupByBuffer (int64_t *buffer, const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const bool output_columnar, const Executor *executor)
 
void initGroups (const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const int32_t groups_buffer_entry_count, const size_t warp_size, const Executor *executor)
 
void initColumnarGroups (const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const Executor *executor)
 
void initColumnPerRow (const QueryMemoryDescriptor &query_mem_desc, int8_t *row_ptr, const size_t bin, const std::vector< int64_t > &init_vals, const std::vector< int64_t > &bitmap_sizes, const std::vector< bool > &tdigest_deferred)
 
void allocateCountDistinctGpuMem (const QueryMemoryDescriptor &query_mem_desc)
 
std::vector< int64_t > allocateCountDistinctBuffers (const QueryMemoryDescriptor &query_mem_desc, const bool deferred, const Executor *executor)
 
int64_t allocateCountDistinctBitmap (const size_t bitmap_byte_sz)
 
int64_t allocateCountDistinctSet ()
 
std::vector< bool > allocateTDigests (const QueryMemoryDescriptor &query_mem_desc, const bool deferred, const Executor *executor)
 
size_t computeNumberOfBuffers (const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const Executor *executor) const
 
void compactProjectionBuffersCpu (const QueryMemoryDescriptor &query_mem_desc, const size_t projection_count)
 
void compactProjectionBuffersGpu (const QueryMemoryDescriptor &query_mem_desc, Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_group_by_buffers, const size_t projection_count, const int device_id)
 
void applyStreamingTopNOffsetCpu (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
 
void applyStreamingTopNOffsetGpu (Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit &ra_exe_unit, const unsigned total_thread_count, const int device_id)
 

Private Attributes

const int64_t num_rows_
 
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
 
std::vector< std::unique_ptr< ResultSet > > result_sets_
 
std::vector< int64_t > init_agg_vals_
 
const size_t num_buffers_
 
std::vector< int64_t * > group_by_buffers_
 
CUdeviceptr count_distinct_bitmap_mem_
 
size_t count_distinct_bitmap_mem_bytes_
 
int8_t * count_distinct_bitmap_crt_ptr_
 
int8_t * count_distinct_bitmap_host_mem_
 
DeviceAllocator * device_allocator_ {nullptr}
 
std::vector< Data_Namespace::AbstractBuffer * > temporary_buffers_
 

Friends

class Executor
 
class QueryExecutionContext
 

Detailed Description

Definition at line 35 of file QueryMemoryInitializer.h.

Constructor & Destructor Documentation

◆ QueryMemoryInitializer() [1/2]

QueryMemoryInitializer::QueryMemoryInitializer ( const RelAlgExecutionUnit &  ra_exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const int  device_id,
const ExecutorDeviceType  device_type,
const ExecutorDispatchMode  dispatch_mode,
const bool  output_columnar,
const bool  sort_on_gpu,
const int64_t  num_rows,
const std::vector< std::vector< const int8_t *>> &  col_buffers,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
RenderAllocatorMap *  render_allocator_map,
RenderInfo *  render_info,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
DeviceAllocator *  gpu_allocator,
const Executor *  executor 
)

Definition at line 153 of file QueryMemoryInitializer.cpp.

References anonymous_namespace{QueryMemoryInitializer.cpp}::alloc_group_by_buffer(), allocateCountDistinctBuffers(), allocateCountDistinctGpuMem(), allocateTDigests(), CHECK, CHECK_GE, anonymous_namespace{QueryMemoryInitializer.cpp}::check_total_bitmap_memory(), RelAlgExecutionUnit::estimator, ResultSet::fixupQueryMemoryDescriptor(), g_max_memory_allocation_size, anonymous_namespace{QueryMemoryInitializer.cpp}::get_col_frag_offsets(), anonymous_namespace{QueryMemoryInitializer.cpp}::get_consistent_frags_sizes(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), GPU, group_by_buffers_, QueryMemoryDescriptor::hasKeylessHash(), initGroupByBuffer(), QueryMemoryDescriptor::interleavedBins(), QueryMemoryDescriptor::isGroupBy(), KernelPerFragment, QueryMemoryDescriptor::lazyInitGroups(), num_buffers_, result_sets_, row_set_mem_owner_, RelAlgExecutionUnit::target_exprs, target_exprs_to_infos(), QueryMemoryDescriptor::threadsShareMemory(), RelAlgExecutionUnit::use_bump_allocator, and RenderInfo::useCudaBuffers().

    : num_rows_(num_rows)
    , row_set_mem_owner_(row_set_mem_owner)
    , init_agg_vals_(executor->plan_state_->init_agg_vals_)
    , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
    , count_distinct_bitmap_mem_(0)
    , count_distinct_bitmap_mem_bytes_(0)
    , count_distinct_bitmap_crt_ptr_(nullptr)
    , count_distinct_bitmap_host_mem_(nullptr)
    , device_allocator_(device_allocator) {
  CHECK(!sort_on_gpu || output_columnar);

  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
  if (consistent_frag_sizes.empty()) {
    // No fragments in the input, no underlying buffers will be needed.
    return;
  }
  if (!ra_exe_unit.use_bump_allocator) {
    check_total_bitmap_memory(query_mem_desc);
  }
  if (device_type == ExecutorDeviceType::GPU) {
    allocateCountDistinctGpuMem(query_mem_desc);
  }

  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
    allocateCountDistinctBuffers(query_mem_desc, false, executor);
    allocateTDigests(query_mem_desc, false, executor);
    if (render_info && render_info->useCudaBuffers()) {
      return;
    }
  }

  if (ra_exe_unit.estimator) {
    return;
  }

  const auto thread_count = device_type == ExecutorDeviceType::GPU
                                ? executor->blockSize() * executor->gridSize()
                                : 1;

  size_t group_buffer_size{0};
  if (ra_exe_unit.use_bump_allocator) {
    // For kernel per fragment execution, just allocate a buffer equivalent to the
    // size of the fragment
    if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
      group_buffer_size = num_rows * query_mem_desc.getRowSize();
    } else {
      // otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size
      group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
    }
  } else {
    group_buffer_size =
        query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
  }
  CHECK_GE(group_buffer_size, size_t(0));

  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
  int64_t* group_by_buffer_template{nullptr};
  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
    group_by_buffer_template =
        reinterpret_cast<int64_t*>(row_set_mem_owner_->allocate(group_buffer_size));
    initGroupByBuffer(group_by_buffer_template,
                      ra_exe_unit,
                      query_mem_desc,
                      device_type,
                      output_columnar,
                      executor);
  }

  if (query_mem_desc.interleavedBins(device_type)) {
    CHECK(query_mem_desc.hasKeylessHash());
  }

  const auto step = device_type == ExecutorDeviceType::GPU &&
                            query_mem_desc.threadsShareMemory() &&
                            query_mem_desc.isGroupBy()
                        ? executor->blockSize()
                        : size_t(1);
  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
                                       query_mem_desc.hasKeylessHash()
                                   ? query_mem_desc.getEntryCount()
                                   : size_t(0);
  const auto actual_group_buffer_size =
      group_buffer_size + index_buffer_qw * sizeof(int64_t);
  CHECK_GE(actual_group_buffer_size, group_buffer_size);

  for (size_t i = 0; i < group_buffers_count; i += step) {
    auto group_by_buffer = alloc_group_by_buffer(
        actual_group_buffer_size, render_allocator_map, row_set_mem_owner_.get());
    if (!query_mem_desc.lazyInitGroups(device_type)) {
      if (group_by_buffer_template) {
        memcpy(group_by_buffer + index_buffer_qw,
               group_by_buffer_template,
               group_buffer_size);
      } else {
        initGroupByBuffer(group_by_buffer + index_buffer_qw,
                          ra_exe_unit,
                          query_mem_desc,
                          device_type,
                          output_columnar,
                          executor);
      }
    }
    group_by_buffers_.push_back(group_by_buffer);
    for (size_t j = 1; j < step; ++j) {
      group_by_buffers_.push_back(nullptr);
    }
    const auto column_frag_offsets =
        get_col_frag_offsets(ra_exe_unit.target_exprs, frag_offsets);
    const auto column_frag_sizes =
        get_consistent_frags_sizes(ra_exe_unit.target_exprs, consistent_frag_sizes);
    result_sets_.emplace_back(
        new ResultSet(target_exprs_to_infos(ra_exe_unit.target_exprs, query_mem_desc),
                      executor->getColLazyFetchInfo(ra_exe_unit.target_exprs),
                      col_buffers,
                      column_frag_offsets,
                      column_frag_sizes,
                      device_type,
                      device_id,
                      ResultSet::fixupQueryMemoryDescriptor(query_mem_desc),
                      row_set_mem_owner_,
                      executor->getCatalog(),
                      executor->blockSize(),
                      executor->gridSize()));
    result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
                                         executor->plan_state_->init_agg_vals_);
    for (size_t j = 1; j < step; ++j) {
      result_sets_.emplace_back(nullptr);
    }
  }
}
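
The buffer-striding logic above is easiest to see with concrete numbers. The following stand-alone sketch (not OmniSciDB code; block and grid sizes are assumed example values) mimics how one backing buffer is allocated per GPU block when threads share memory, with the remaining per-thread slots padded by nullptr so thread indexing still works:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // Assumed configuration: 4 blocks of 64 threads, threadsShareMemory() == true.
  const size_t block_size = 64;
  const size_t grid_size = 4;
  const size_t num_buffers = block_size * grid_size;  // cf. computeNumberOfBuffers()
  const size_t step = block_size;                     // one buffer per block

  std::vector<bool> is_backing_buffer;
  for (size_t i = 0; i < num_buffers; i += step) {
    is_backing_buffer.push_back(true);  // alloc_group_by_buffer(...) in the real code
    for (size_t j = 1; j < step; ++j) {
      is_backing_buffer.push_back(false);  // nullptr padding slot
    }
  }
  std::cout << is_backing_buffer.size() << " slots, "
            << is_backing_buffer.size() / step << " backing buffers\n";
  // Prints: 256 slots, 4 backing buffers
}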

◆ QueryMemoryInitializer() [2/2]

QueryMemoryInitializer::QueryMemoryInitializer ( const TableFunctionExecutionUnit &  exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const int  device_id,
const ExecutorDeviceType  device_type,
const int64_t  num_rows,
const std::vector< std::vector< const int8_t *>> &  col_buffers,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
DeviceAllocator *  device_allocator,
const Executor *  executor 
)

Definition at line 300 of file QueryMemoryInitializer.cpp.

References anonymous_namespace{QueryMemoryInitializer.cpp}::alloc_group_by_buffer(), CHECK_EQ, CHECK_GE, count_distinct_bitmap_crt_ptr_, count_distinct_bitmap_host_mem_, count_distinct_bitmap_mem_, count_distinct_bitmap_mem_bytes_, device_allocator_, ResultSet::fixupQueryMemoryDescriptor(), anonymous_namespace{QueryMemoryInitializer.cpp}::get_col_frag_offsets(), anonymous_namespace{QueryMemoryInitializer.cpp}::get_consistent_frags_sizes(), GPU, group_by_buffers_, init_agg_vals_, initColumnarGroups(), num_buffers_, num_rows_, result_sets_, row_set_mem_owner_, and target_exprs_to_infos().

    : num_rows_(num_rows)
    , row_set_mem_owner_(row_set_mem_owner)
    , init_agg_vals_(init_agg_val_vec(exe_unit.target_exprs, {}, query_mem_desc))
    , num_buffers_(/*computeNumberOfBuffers(query_mem_desc, device_type, executor)*/ 1)
    , count_distinct_bitmap_mem_(0)
    , count_distinct_bitmap_mem_bytes_(0)
    , count_distinct_bitmap_crt_ptr_(nullptr)
    , count_distinct_bitmap_host_mem_(nullptr)
    , device_allocator_(device_allocator) {
  // Table functions output columnar, basically treat this as a projection
  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
  if (consistent_frag_sizes.empty()) {
    // No fragments in the input, no underlying buffers will be needed.
    return;
  }

  size_t group_buffer_size{0};
  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
  group_buffer_size = num_rows_ * num_columns * sizeof(int64_t);
  CHECK_GE(group_buffer_size, size_t(0));

  const auto index_buffer_qw =
      device_type == ExecutorDeviceType::GPU && query_mem_desc.hasKeylessHash()
          ? query_mem_desc.getEntryCount()
          : size_t(0);
  const auto actual_group_buffer_size =
      group_buffer_size + index_buffer_qw * sizeof(int64_t);
  CHECK_GE(actual_group_buffer_size, group_buffer_size);

  CHECK_EQ(num_buffers_, size_t(1));
  auto group_by_buffer =
      alloc_group_by_buffer(actual_group_buffer_size, nullptr, row_set_mem_owner.get());
  if (!query_mem_desc.lazyInitGroups(device_type)) {
    initColumnarGroups(
        query_mem_desc, group_by_buffer + index_buffer_qw, init_agg_vals_, executor);
  }
  group_by_buffers_.push_back(group_by_buffer);

  const auto column_frag_offsets =
      get_col_frag_offsets(exe_unit.target_exprs, frag_offsets);
  const auto column_frag_sizes =
      get_consistent_frags_sizes(exe_unit.target_exprs, consistent_frag_sizes);
  result_sets_.emplace_back(
      new ResultSet(target_exprs_to_infos(exe_unit.target_exprs, query_mem_desc),
                    {},
                    col_buffers,
                    column_frag_offsets,
                    column_frag_sizes,
                    device_type,
                    device_id,
                    ResultSet::fixupQueryMemoryDescriptor(query_mem_desc),
                    row_set_mem_owner_,
                    executor->getCatalog(),
                    executor->blockSize(),
                    executor->gridSize()));
  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
                                       init_agg_vals_);
}
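
The sizing arithmetic in this constructor is simple enough to check by hand: the output buffer is num_rows_ * getBufferColSlotCount() * sizeof(int64_t) bytes, plus an optional index buffer. A minimal sketch with assumed values (not taken from the source):

#include <cstddef>
#include <cstdint>
#include <iostream>

int main() {
  // Assumed example: a table function emitting 1000 rows into 3 output slots.
  const int64_t num_rows = 1000;
  const size_t num_columns = 3;      // query_mem_desc.getBufferColSlotCount()
  const size_t index_buffer_qw = 0;  // CPU path, or no keyless hash
  const size_t group_buffer_size = num_rows * num_columns * sizeof(int64_t);
  const size_t actual_group_buffer_size =
      group_buffer_size + index_buffer_qw * sizeof(int64_t);
  std::cout << actual_group_buffer_size << " bytes\n";  // 24000 bytes
}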

Member Function Documentation

◆ allocateCountDistinctBitmap()

int64_t QueryMemoryInitializer::allocateCountDistinctBitmap ( const size_t  bitmap_byte_sz)
private

Definition at line 658 of file QueryMemoryInitializer.cpp.

References CHECK, count_distinct_bitmap_crt_ptr_, count_distinct_bitmap_host_mem_, and row_set_mem_owner_.

Referenced by allocateCountDistinctBuffers(), getNumBuffers(), and initColumnPerRow().

{
  if (count_distinct_bitmap_host_mem_) {
    CHECK(count_distinct_bitmap_crt_ptr_);
    auto ptr = count_distinct_bitmap_crt_ptr_;
    count_distinct_bitmap_crt_ptr_ += bitmap_byte_sz;
    row_set_mem_owner_->addCountDistinctBuffer(
        ptr, bitmap_byte_sz, /*physical_buffer=*/false);
    return reinterpret_cast<int64_t>(ptr);
  }
  return reinterpret_cast<int64_t>(
      row_set_mem_owner_->allocateCountDistinctBuffer(bitmap_byte_sz));
}
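
When a pre-allocated host slab exists (count_distinct_bitmap_host_mem_), the method hands out consecutive sub-ranges of it by advancing a bump pointer; otherwise each call gets its own buffer. A minimal sketch of that bump-pointer pattern (hypothetical names, not the OmniSciDB API):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Assumed slab with room for 4 bitmaps of 128 bytes each.
  const size_t bitmap_byte_sz = 128;
  std::vector<int8_t> slab(4 * bitmap_byte_sz);
  int8_t* crt_ptr = slab.data();  // plays the role of count_distinct_bitmap_crt_ptr_

  auto allocate_bitmap = [&]() {
    int8_t* ptr = crt_ptr;
    crt_ptr += bitmap_byte_sz;  // bump: the next call gets the next sub-range
    return ptr;
  };

  int8_t* first = allocate_bitmap();
  int8_t* second = allocate_bitmap();
  std::cout << (second - first) << " bytes apart\n";  // 128 bytes apart
}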

◆ allocateCountDistinctBuffers()

std::vector< int64_t > QueryMemoryInitializer::allocateCountDistinctBuffers ( const QueryMemoryDescriptor &  query_mem_desc,
const bool  deferred,
const Executor *  executor 
)
private

Definition at line 612 of file QueryMemoryInitializer.cpp.

References allocateCountDistinctBitmap(), allocateCountDistinctSet(), Bitmap, CHECK, CHECK_EQ, CHECK_GE, CHECK_LT, g_bigint_count, get_target_info(), QueryMemoryDescriptor::getCountDistinctDescriptor(), QueryMemoryDescriptor::getLogicalSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), init_agg_vals_, Invalid, is_distinct_target(), kAPPROX_COUNT_DISTINCT, kCOUNT, and StdSet.

Referenced by getNumBuffers(), initGroups(), and QueryMemoryInitializer().

{
  const size_t agg_col_count{query_mem_desc.getSlotCount()};
  std::vector<int64_t> agg_bitmap_size(deferred ? agg_col_count : 0);

  CHECK_GE(agg_col_count, executor->plan_state_->target_exprs_.size());
  for (size_t target_idx = 0; target_idx < executor->plan_state_->target_exprs_.size();
       ++target_idx) {
    const auto target_expr = executor->plan_state_->target_exprs_[target_idx];
    const auto agg_info = get_target_info(target_expr, g_bigint_count);
    if (is_distinct_target(agg_info)) {
      CHECK(agg_info.is_agg &&
            (agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT));
      CHECK(!agg_info.sql_type.is_varlen());

      const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
      CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);

      CHECK_EQ(static_cast<size_t>(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)),
               sizeof(int64_t));
      const auto& count_distinct_desc =
          query_mem_desc.getCountDistinctDescriptor(target_idx);
      CHECK(count_distinct_desc.impl_type_ != CountDistinctImplType::Invalid);
      if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
        const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
        if (deferred) {
          agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
        } else {
          init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
        }
      } else {
        CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::StdSet);
        if (deferred) {
          agg_bitmap_size[agg_col_idx] = -1;
        } else {
          init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
        }
      }
    }
  }

  return agg_bitmap_size;
}

◆ allocateCountDistinctGpuMem()

void QueryMemoryInitializer::allocateCountDistinctGpuMem ( const QueryMemoryDescriptor &  query_mem_desc)
private

Definition at line 580 of file QueryMemoryInitializer.cpp.

References Allocator::alloc(), Bitmap, CHECK, count_distinct_bitmap_crt_ptr_, count_distinct_bitmap_host_mem_, count_distinct_bitmap_mem_, count_distinct_bitmap_mem_bytes_, QueryMemoryDescriptor::countDistinctDescriptorsLogicallyEmpty(), device_allocator_, QueryMemoryDescriptor::getCountDistinctDescriptor(), QueryMemoryDescriptor::getCountDistinctDescriptorsSize(), QueryMemoryDescriptor::getEntryCount(), Invalid, row_set_mem_owner_, and DeviceAllocator::zeroDeviceMem().

Referenced by getNumBuffers(), and QueryMemoryInitializer().

{
  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
    return;
  }
  CHECK(device_allocator_);

  size_t total_bytes_per_entry{0};
  const size_t num_count_distinct_descs =
      query_mem_desc.getCountDistinctDescriptorsSize();
  for (size_t i = 0; i < num_count_distinct_descs; i++) {
    const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
    if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
      continue;
    }
    CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
    total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
  }

  count_distinct_bitmap_mem_bytes_ =
      total_bytes_per_entry * query_mem_desc.getEntryCount();
  count_distinct_bitmap_mem_ = reinterpret_cast<CUdeviceptr>(
      device_allocator_->alloc(count_distinct_bitmap_mem_bytes_));
  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(count_distinct_bitmap_mem_),
                                   count_distinct_bitmap_mem_bytes_);

  count_distinct_bitmap_crt_ptr_ = count_distinct_bitmap_host_mem_ =
      row_set_mem_owner_->allocateCountDistinctBuffer(count_distinct_bitmap_mem_bytes_);
}
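
The device allocation is sized as the sum of all padded per-entry bitmap widths multiplied by the entry count. A worked example with assumed descriptor values (not from the source):

#include <cstddef>
#include <iostream>

int main() {
  // Assumed: two bitmap-based count-distinct targets, padded to 512 and
  // 1024 bytes per entry, over a 2048-entry group-by buffer.
  const size_t total_bytes_per_entry = 512 + 1024;
  const size_t entry_count = 2048;
  const size_t count_distinct_bitmap_mem_bytes = total_bytes_per_entry * entry_count;
  std::cout << count_distinct_bitmap_mem_bytes << " bytes on device\n";  // 3145728
}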

◆ allocateCountDistinctSet()

int64_t QueryMemoryInitializer::allocateCountDistinctSet ( )
private

Definition at line 671 of file QueryMemoryInitializer.cpp.

References row_set_mem_owner_.

Referenced by allocateCountDistinctBuffers(), getNumBuffers(), and initColumnPerRow().

{
  auto count_distinct_set = new std::set<int64_t>();
  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
  return reinterpret_cast<int64_t>(count_distinct_set);
}

◆ allocateTDigests()

std::vector< bool > QueryMemoryInitializer::allocateTDigests ( const QueryMemoryDescriptor &  query_mem_desc,
const bool  deferred,
const Executor *  executor 
)
private

Definition at line 677 of file QueryMemoryInitializer.cpp.

References CHECK_EQ, CHECK_GE, CHECK_LT, QueryMemoryDescriptor::getLogicalSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), init_agg_vals_, kAPPROX_MEDIAN, and row_set_mem_owner_.

Referenced by getNumBuffers(), initGroups(), and QueryMemoryInitializer().

{
  size_t const slot_count = query_mem_desc.getSlotCount();
  size_t const ntargets = executor->plan_state_->target_exprs_.size();
  CHECK_GE(slot_count, ntargets);
  std::vector<bool> tdigest_deferred(deferred ? slot_count : 0);

  for (size_t target_idx = 0; target_idx < ntargets; ++target_idx) {
    auto const target_expr = executor->plan_state_->target_exprs_[target_idx];
    if (auto const agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr)) {
      if (agg_expr->get_aggtype() == kAPPROX_MEDIAN) {
        size_t const agg_col_idx =
            query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
        CHECK_LT(agg_col_idx, slot_count);
        CHECK_EQ(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx),
                 static_cast<int8_t>(sizeof(int64_t)));
        if (deferred) {
          tdigest_deferred[agg_col_idx] = true;
        } else {
          init_agg_vals_[agg_col_idx] =
              reinterpret_cast<int64_t>(row_set_mem_owner_->newTDigest());
        }
      }
    }
  }
  return tdigest_deferred;
}

◆ applyStreamingTopNOffsetCpu()

void QueryMemoryInitializer::applyStreamingTopNOffsetCpu ( const QueryMemoryDescriptor &  query_mem_desc,
const RelAlgExecutionUnit &  ra_exe_unit 
)
private

Definition at line 1043 of file QueryMemoryInitializer.cpp.

References CHECK_EQ, CPU, streaming_top_n::get_rows_copy_from_heaps(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), group_by_buffers_, SortInfo::limit, SortInfo::offset, and RelAlgExecutionUnit::sort_info.

Referenced by getNumBuffers().

{
  CHECK_EQ(group_by_buffers_.size(), size_t(1));

  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
      group_by_buffers_[0],
      query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
      ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit,
      1);
  CHECK_EQ(rows_copy.size(),
           query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
}
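
The copy size is fully determined by the descriptor: each heap holds n = offset + limit rows, and the flattened copy must equal getEntryCount() * getRowSize() bytes. A short arithmetic sketch with assumed values:

#include <cstddef>
#include <iostream>

int main() {
  // Assumed query: ORDER BY ... LIMIT 20 OFFSET 10, single CPU thread.
  const size_t offset = 10, limit = 20;
  const size_t n = offset + limit;  // rows kept per heap
  const size_t thread_count = 1;    // CPU path uses one heap
  const size_t row_size = 32;       // assumed query_mem_desc.getRowSize()
  const size_t entry_count = n * thread_count;
  std::cout << entry_count * row_size << " bytes copied\n";  // 960 bytes copied
}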

◆ applyStreamingTopNOffsetGpu()

void QueryMemoryInitializer::applyStreamingTopNOffsetGpu ( Data_Namespace::DataMgr *  data_mgr,
const QueryMemoryDescriptor &  query_mem_desc,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const RelAlgExecutionUnit &  ra_exe_unit,
const unsigned  total_thread_count,
const int  device_id 
)
private

Definition at line 1058 of file QueryMemoryInitializer.cpp.

References CHECK_EQ, QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), group_by_buffers_, num_buffers_, GpuGroupByBuffers::second, and UNREACHABLE.

Referenced by getNumBuffers().

{
#ifdef HAVE_CUDA
  CHECK_EQ(group_by_buffers_.size(), num_buffers_);

  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
      data_mgr,
      reinterpret_cast<int64_t*>(gpu_group_by_buffers.second),
      ra_exe_unit,
      query_mem_desc,
      total_thread_count,
      device_id);
  CHECK_EQ(
      rows_copy.size(),
      static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
#else
  UNREACHABLE();
#endif
}

◆ compactProjectionBuffersCpu()

void QueryMemoryInitializer::compactProjectionBuffersCpu ( const QueryMemoryDescriptor &  query_mem_desc,
const size_t  projection_count 
)
private

Definition at line 970 of file QueryMemoryInitializer.cpp.

References CHECK, anonymous_namespace{QueryMemoryInitializer.cpp}::compact_projection_buffer_for_cpu_columnar(), QueryMemoryDescriptor::getEntryCount(), group_by_buffers_, and result_sets_.

Referenced by getNumBuffers().

{
  const auto num_allocated_rows =
      std::min(projection_count, query_mem_desc.getEntryCount());

  // copy the results from the main buffer into projection_buffer
  compact_projection_buffer_for_cpu_columnar(
      query_mem_desc,
      reinterpret_cast<int8_t*>(group_by_buffers_[0]),
      num_allocated_rows);

  // update the entry count for the result set, and its underlying storage
  CHECK(!result_sets_.empty());
  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
}

◆ compactProjectionBuffersGpu()

void QueryMemoryInitializer::compactProjectionBuffersGpu ( const QueryMemoryDescriptor &  query_mem_desc,
Data_Namespace::DataMgr *  data_mgr,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const size_t  projection_count,
const int  device_id 
)
private

Definition at line 987 of file QueryMemoryInitializer.cpp.

References CHECK, copy_projection_buffer_from_gpu_columnar(), QueryMemoryDescriptor::getEntryCount(), group_by_buffers_, and result_sets_.

Referenced by getNumBuffers().

{
  // store total number of allocated rows:
  const auto num_allocated_rows =
      std::min(projection_count, query_mem_desc.getEntryCount());

  // copy the results from the main buffer into projection_buffer
  copy_projection_buffer_from_gpu_columnar(
      data_mgr,
      gpu_group_by_buffers,
      query_mem_desc,
      reinterpret_cast<int8_t*>(group_by_buffers_[0]),
      num_allocated_rows,
      device_id);

  // update the entry count for the result set, and its underlying storage
  CHECK(!result_sets_.empty());
  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
}

◆ computeNumberOfBuffers()

size_t QueryMemoryInitializer::computeNumberOfBuffers ( const QueryMemoryDescriptor &  query_mem_desc,
const ExecutorDeviceType  device_type,
const Executor *  executor 
) const
private

Definition at line 926 of file QueryMemoryInitializer.cpp.

References QueryMemoryDescriptor::blocksShareMemory(), and CPU.

Referenced by getNumBuffers().

{
  return device_type == ExecutorDeviceType::CPU
             ? 1
             : executor->blockSize() *
                   (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
}
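
The decision table is small: CPU always gets one buffer; on GPU it is one buffer per thread in a block, times the grid size unless blocks share memory. A stand-alone restatement with assumed executor dimensions (hypothetical helper, not the class method):

#include <cstddef>
#include <iostream>

enum class DeviceType { CPU, GPU };

size_t compute_number_of_buffers(const DeviceType device_type,
                                 const bool blocks_share_memory,
                                 const size_t block_size,
                                 const size_t grid_size) {
  return device_type == DeviceType::CPU
             ? 1
             : block_size * (blocks_share_memory ? 1 : grid_size);
}

int main() {
  // Assumed executor: blockSize() == 64, gridSize() == 4.
  std::cout << compute_number_of_buffers(DeviceType::CPU, false, 64, 4) << "\n";  // 1
  std::cout << compute_number_of_buffers(DeviceType::GPU, true, 64, 4) << "\n";   // 64
  std::cout << compute_number_of_buffers(DeviceType::GPU, false, 64, 4) << "\n";  // 256
}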

◆ copyGroupByBuffersFromGpu()

void QueryMemoryInitializer::copyGroupByBuffersFromGpu ( Data_Namespace::DataMgr *  data_mgr,
const QueryMemoryDescriptor &  query_mem_desc,
const size_t  entry_count,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const RelAlgExecutionUnit *  ra_exe_unit,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const int  device_id,
const bool  prepend_index_buffer 
) const

Definition at line 1011 of file QueryMemoryInitializer.cpp.

References copy_group_by_buffers_from_gpu(), streaming_top_n::get_heap_size(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getRowSize(), GPU, group_by_buffers_, SortInfo::limit, SortInfo::offset, GpuGroupByBuffers::second, RelAlgExecutionUnit::sort_info, and QueryMemoryDescriptor::useStreamingTopN().

Referenced by getNumBuffers().

{
  const auto thread_count = block_size_x * grid_size_x;

  size_t total_buff_size{0};
  if (ra_exe_unit && query_mem_desc.useStreamingTopN()) {
    const size_t n = ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit;
    total_buff_size =
        streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
  } else {
    total_buff_size =
        query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
  }
  copy_group_by_buffers_from_gpu(data_mgr,
                                 group_by_buffers_,
                                 total_buff_size,
                                 gpu_group_by_buffers.second,
                                 query_mem_desc,
                                 block_size_x,
                                 grid_size_x,
                                 device_id,
                                 prepend_index_buffer);
}

◆ getAggInitValForIndex()

int64_t QueryMemoryInitializer::getAggInitValForIndex ( const size_t  index) const
inline

Definition at line 89 of file QueryMemoryInitializer.h.

References CHECK_LT, and init_agg_vals_.

{
  CHECK_LT(index, init_agg_vals_.size());
  return init_agg_vals_[index];
}

◆ getCountDistinctBitmapBytes()

const auto QueryMemoryInitializer::getCountDistinctBitmapBytes ( ) const
inline

Definition at line 70 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_mem_bytes_.

{
  return count_distinct_bitmap_mem_bytes_;
}

◆ getCountDistinctBitmapPtr()

const auto QueryMemoryInitializer::getCountDistinctBitmapPtr ( ) const
inline

Definition at line 66 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_mem_.

{
  return count_distinct_bitmap_mem_;
}

◆ getCountDistinctHostPtr()

const auto QueryMemoryInitializer::getCountDistinctHostPtr ( ) const
inline

References count_distinct_bitmap_host_mem_.

{
  return count_distinct_bitmap_host_mem_;
}

◆ getGroupByBuffersPtr()

const auto QueryMemoryInitializer::getGroupByBuffersPtr ( )
inline

Definition at line 94 of file QueryMemoryInitializer.h.

References group_by_buffers_.

{
  return reinterpret_cast<int64_t**>(group_by_buffers_.data());
}

◆ getGroupByBuffersSize()

const auto QueryMemoryInitializer::getGroupByBuffersSize ( ) const
inline

Definition at line 98 of file QueryMemoryInitializer.h.

References group_by_buffers_.

Referenced by allocateTDigests().

{ return group_by_buffers_.size(); }

◆ getNumBuffers()

const auto QueryMemoryInitializer::getNumBuffers ( ) const
inline

References num_buffers_.

{
  return num_buffers_;
}

◆ getResultSet()

ResultSet* QueryMemoryInitializer::getResultSet ( const size_t  index) const
inline

Definition at line 74 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

{
  CHECK_LT(index, result_sets_.size());
  return result_sets_[index].get();
}

◆ getResultSetOwned()

std::unique_ptr<ResultSet> QueryMemoryInitializer::getResultSetOwned ( const size_t  index)
inline

Definition at line 79 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

{
  CHECK_LT(index, result_sets_.size());
  return std::move(result_sets_[index]);
}
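
Because result_sets_ stores std::unique_ptr elements, this accessor transfers ownership to the caller and leaves a null slot behind, while resetResultSet() destroys the object in place. A minimal sketch of the same ownership pattern (generic, outside the class):

#include <iostream>
#include <memory>
#include <utility>
#include <vector>

int main() {
  std::vector<std::unique_ptr<int>> result_sets;
  result_sets.emplace_back(new int(42));

  // Like getResultSetOwned(0): moves ownership out of the vector.
  std::unique_ptr<int> owned = std::move(result_sets[0]);
  std::cout << (result_sets[0] ? "still held" : "slot is now null") << "\n";

  // Like resetResultSet(0): frees whatever the slot still holds (a no-op here).
  result_sets[0].reset();
  std::cout << *owned << "\n";  // 42: the moved-out object remains alive
}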

◆ initColumnarGroups()

void QueryMemoryInitializer::initColumnarGroups ( const QueryMemoryDescriptor &  query_mem_desc,
int64_t *  groups_buffer,
const std::vector< int64_t > &  init_vals,
const Executor *  executor 
)
private

Definition at line 466 of file QueryMemoryInitializer.cpp.

References align_to_int64(), CHECK, CHECK_LT, EMPTY_KEY_64, g_bigint_count, get_target_info(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getQueryDescriptionType(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::hasKeylessHash(), is_distinct_target(), and Projection.

Referenced by getNumBuffers(), initGroupByBuffer(), and QueryMemoryInitializer().

{
  CHECK(groups_buffer);
  for (const auto target_expr : executor->plan_state_->target_exprs_) {
    const auto agg_info = get_target_info(target_expr, g_bigint_count);
    CHECK(!is_distinct_target(agg_info));
  }
  const int32_t agg_col_count = query_mem_desc.getSlotCount();
  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);

  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
  if (!query_mem_desc.hasKeylessHash()) {
    const size_t key_count{query_mem_desc.getGroupbyColCount()};
    for (size_t i = 0; i < key_count; ++i) {
      buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                               EMPTY_KEY_64,
                                               groups_buffer_entry_count);
    }
  }

  if (query_mem_desc.getQueryDescriptionType() != QueryDescriptionType::Projection) {
    // initializing all aggregate columns:
    int32_t init_val_idx = 0;
    for (int32_t i = 0; i < agg_col_count; ++i) {
      if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
        CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
        switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
          case 1:
            buffer_ptr = initColumnarBuffer<int8_t>(
                buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
            break;
          case 2:
            buffer_ptr =
                initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
                                            init_vals[init_val_idx++],
                                            groups_buffer_entry_count);
            break;
          case 4:
            buffer_ptr =
                initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
                                            init_vals[init_val_idx++],
                                            groups_buffer_entry_count);
            break;
          case 8:
            buffer_ptr =
                initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                            init_vals[init_val_idx++],
                                            groups_buffer_entry_count);
            break;
          case 0:
            break;
          default:
            CHECK(false);
        }

        buffer_ptr = align_to_int64(buffer_ptr);
      }
    }
  }
}
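
In columnar layout the buffer is a sequence of fixed-width columns: first the key columns (filled with the empty-key sentinel), then one column per aggregate slot, each filled entry-by-entry and advanced in turn. A self-contained sketch of that fill pattern (a simplified stand-in for initColumnarBuffer; layout values are assumed):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Fills one column of entry_count values, returning the advanced write pointer.
template <typename T>
int8_t* init_columnar_buffer(int8_t* ptr, const T init_val, const size_t entry_count) {
  for (size_t i = 0; i < entry_count; ++i) {
    reinterpret_cast<T*>(ptr)[i] = init_val;
  }
  return ptr + entry_count * sizeof(T);
}

int main() {
  const size_t entry_count = 4;
  std::vector<int8_t> buffer(entry_count * (sizeof(int64_t) + sizeof(int32_t)));
  int8_t* ptr = buffer.data();
  // Assumed layout: one int64 key column, then one 4-byte aggregate column.
  ptr = init_columnar_buffer<int64_t>(ptr, INT64_MIN /* empty-key stand-in */,
                                      entry_count);
  ptr = init_columnar_buffer<int32_t>(ptr, 0, entry_count);
  std::cout << (ptr - buffer.data()) << " bytes initialized\n";  // 48 bytes
}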

◆ initColumnPerRow()

void QueryMemoryInitializer::initColumnPerRow ( const QueryMemoryDescriptor &  query_mem_desc,
int8_t *  row_ptr,
const size_t  bin,
const std::vector< int64_t > &  init_vals,
const std::vector< int64_t > &  bitmap_sizes,
const std::vector< bool > &  tdigest_deferred 
)
private

Definition at line 530 of file QueryMemoryInitializer.cpp.

References allocateCountDistinctBitmap(), allocateCountDistinctSet(), CHECK, CHECK_EQ, CHECK_LT, QueryMemoryDescriptor::getNextColOffInBytes(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::isGroupBy(), and row_set_mem_owner_.

Referenced by getNumBuffers(), and initGroups().

{
  int8_t* col_ptr = row_ptr;
  size_t init_vec_idx = 0;
  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
       col_ptr += query_mem_desc.getNextColOffInBytes(col_ptr, bin, col_idx++)) {
    const int64_t bm_sz{bitmap_sizes[col_idx]};
    int64_t init_val{0};
    if (bm_sz && query_mem_desc.isGroupBy()) {
      // COUNT DISTINCT / APPROX_COUNT_DISTINCT
      CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
               sizeof(int64_t));
      init_val =
          bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
      ++init_vec_idx;
    } else if (query_mem_desc.isGroupBy() && tdigest_deferred[col_idx]) {
      // APPROX_MEDIAN
      init_val = reinterpret_cast<int64_t>(row_set_mem_owner_->newTDigest());
      ++init_vec_idx;
    } else {
      if (query_mem_desc.getPaddedSlotWidthBytes(col_idx) > 0) {
        CHECK_LT(init_vec_idx, init_vals.size());
        init_val = init_vals[init_vec_idx++];
      }
    }
    switch (query_mem_desc.getPaddedSlotWidthBytes(col_idx)) {
      case 1:
        *col_ptr = static_cast<int8_t>(init_val);
        break;
      case 2:
        *reinterpret_cast<int16_t*>(col_ptr) = static_cast<int16_t>(init_val);
        break;
      case 4:
        *reinterpret_cast<int32_t*>(col_ptr) = static_cast<int32_t>(init_val);
        break;
      case 8:
        *reinterpret_cast<int64_t*>(col_ptr) = init_val;
        break;
      case 0:
        continue;
      default:
        CHECK(false);
    }
  }
}

◆ initGroupByBuffer()

void QueryMemoryInitializer::initGroupByBuffer ( int64_t *  buffer,
const RelAlgExecutionUnit &  ra_exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const ExecutorDeviceType  device_type,
const bool  output_columnar,
const Executor *  executor 
)
private

Definition at line 370 of file QueryMemoryInitializer.cpp.

References streaming_top_n::get_rows_offset_of_heaps(), QueryMemoryDescriptor::getEntryCount(), GPU, init_agg_vals_, initColumnarGroups(), initGroups(), QueryMemoryDescriptor::interleavedBins(), SortInfo::limit, SortInfo::offset, RelAlgExecutionUnit::sort_info, and QueryMemoryDescriptor::useStreamingTopN().

Referenced by getNumBuffers(), and QueryMemoryInitializer().

{
  if (output_columnar) {
    initColumnarGroups(query_mem_desc, buffer, init_agg_vals_, executor);
  } else {
    auto rows_ptr = buffer;
    auto actual_entry_count = query_mem_desc.getEntryCount();
    const auto thread_count = device_type == ExecutorDeviceType::GPU
                                  ? executor->blockSize() * executor->gridSize()
                                  : 1;
    auto warp_size =
        query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
    if (query_mem_desc.useStreamingTopN()) {
      const auto node_count_size = thread_count * sizeof(int64_t);
      memset(rows_ptr, 0, node_count_size);
      const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
      const auto rows_offset = streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
      memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
      rows_ptr += rows_offset / sizeof(int64_t);
      actual_entry_count = n * thread_count;
      warp_size = 1;
    }
    initGroups(query_mem_desc,
               rows_ptr,
               init_agg_vals_,
               actual_entry_count,
               warp_size,
               executor);
  }
}

◆ initGroups()

void QueryMemoryInitializer::initGroups ( const QueryMemoryDescriptor &  query_mem_desc,
int64_t *  groups_buffer,
const std::vector< int64_t > &  init_vals,
const int32_t  groups_buffer_entry_count,
const size_t  warp_size,
const Executor *  executor 
)
private

Definition at line 406 of file QueryMemoryInitializer.cpp.

References allocateCountDistinctBuffers(), allocateTDigests(), CHECK, result_set::fill_empty_key(), ResultSet::fixupQueryMemoryDescriptor(), QueryMemoryDescriptor::getColOffInBytes(), QueryMemoryDescriptor::getEffectiveKeyWidth(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getRowSize(), QueryMemoryDescriptor::hasKeylessHash(), and initColumnPerRow().

Referenced by getNumBuffers(), and initGroupByBuffer().

{
  const size_t key_count{query_mem_desc.getGroupbyColCount()};
  const size_t row_size{query_mem_desc.getRowSize()};
  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};

  auto agg_bitmap_size = allocateCountDistinctBuffers(query_mem_desc, true, executor);
  auto tdigest_deferred = allocateTDigests(query_mem_desc, true, executor);
  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);

  const auto query_mem_desc_fixedup =
      ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);

  if (query_mem_desc.hasKeylessHash()) {
    CHECK(warp_size >= 1);
    CHECK(key_count == 1 || warp_size == 1);
    for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
      for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
           ++bin, buffer_ptr += row_size) {
        initColumnPerRow(query_mem_desc_fixedup,
                         &buffer_ptr[col_base_off],
                         bin,
                         init_vals,
                         agg_bitmap_size,
                         tdigest_deferred);
      }
    }
    return;
  }

  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
       ++bin, buffer_ptr += row_size) {
    result_set::fill_empty_key(
        buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
    initColumnPerRow(query_mem_desc_fixedup,
                     &buffer_ptr[col_base_off],
                     bin,
                     init_vals,
                     agg_bitmap_size,
                     tdigest_deferred);
  }
}

◆ resetResultSet()

void QueryMemoryInitializer::resetResultSet ( const size_t  index)
inline

Definition at line 84 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

{
  CHECK_LT(index, result_sets_.size());
  result_sets_[index].reset();
}

Friends And Related Function Documentation

◆ Executor

friend class Executor
friend

Definition at line 233 of file QueryMemoryInitializer.h.

Referenced by getNumBuffers().

◆ QueryExecutionContext

friend class QueryExecutionContext
friend

Definition at line 234 of file QueryMemoryInitializer.h.

Member Data Documentation

◆ count_distinct_bitmap_crt_ptr_

int8_t* QueryMemoryInitializer::count_distinct_bitmap_crt_ptr_
private

◆ count_distinct_bitmap_host_mem_

int8_t* QueryMemoryInitializer::count_distinct_bitmap_host_mem_
private

◆ count_distinct_bitmap_mem_

CUdeviceptr QueryMemoryInitializer::count_distinct_bitmap_mem_
private

◆ count_distinct_bitmap_mem_bytes_

size_t QueryMemoryInitializer::count_distinct_bitmap_mem_bytes_
private

◆ device_allocator_

DeviceAllocator* QueryMemoryInitializer::device_allocator_ {nullptr}
private

◆ group_by_buffers_

std::vector< int64_t * > QueryMemoryInitializer::group_by_buffers_
private

◆ init_agg_vals_

std::vector<int64_t> QueryMemoryInitializer::init_agg_vals_
private

◆ num_buffers_

const size_t QueryMemoryInitializer::num_buffers_
private

◆ num_rows_

const int64_t QueryMemoryInitializer::num_rows_
private

Definition at line 216 of file QueryMemoryInitializer.h.

Referenced by allocateTDigests(), and QueryMemoryInitializer().

◆ result_sets_

std::vector<std::unique_ptr<ResultSet> > QueryMemoryInitializer::result_sets_
private

◆ row_set_mem_owner_

std::shared_ptr<RowSetMemoryOwner> QueryMemoryInitializer::row_set_mem_owner_
private

◆ temporary_buffers_

std::vector<Data_Namespace::AbstractBuffer*> QueryMemoryInitializer::temporary_buffers_
private

Definition at line 231 of file QueryMemoryInitializer.h.


The documentation for this class was generated from the following files:

QueryMemoryInitializer.h
QueryMemoryInitializer.cpp