OmniSciDB  72c90bc290
QueryMemoryInitializer Class Reference

#include <QueryMemoryInitializer.h>


Classes

struct  TargetAggOpsMetadata
 

Public Types

using ModeIndexSet = robin_hood::unordered_set< size_t >
 
using QuantileParam = std::optional< double >
 

Public Member Functions

 QueryMemoryInitializer (const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int device_id, const ExecutorDeviceType device_type, const ExecutorDispatchMode dispatch_mode, const bool output_columnar, const bool sort_on_gpu, const shared::TableKey &outer_table_key, const int64_t num_rows, const std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, RenderAllocatorMap *render_allocator_map, RenderInfo *render_info, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, DeviceAllocator *gpu_allocator, const size_t thread_idx, const Executor *executor)
 
 QueryMemoryInitializer (const TableFunctionExecutionUnit &exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int device_id, const ExecutorDeviceType device_type, const int64_t num_rows, const std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, DeviceAllocator *device_allocator, const Executor *executor)
 
const auto getCountDistinctBitmapDevicePtr () const
 
const auto getCountDistinctBitmapHostPtr () const
 
const auto getCountDistinctBitmapBytes () const
 
const auto getVarlenOutputHostPtr () const
 
const auto getVarlenOutputPtr () const
 
ResultSet * getResultSet (const size_t index) const
 
std::unique_ptr< ResultSet > getResultSetOwned (const size_t index)
 
void resetResultSet (const size_t index)
 
int64_t getAggInitValForIndex (const size_t index) const
 
const auto getGroupByBuffersPtr ()
 
const auto getGroupByBuffersSize () const
 
const auto getNumBuffers () const
 
GpuGroupByBuffers setupTableFunctionGpuBuffers (const QueryMemoryDescriptor &query_mem_desc, const int device_id, const unsigned block_size_x, const unsigned grid_size_x, const bool zero_initialize_buffers)
 
void copyFromTableFunctionGpuBuffers (Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const size_t entry_count, const GpuGroupByBuffers &gpu_group_by_buffers, const int device_id, const unsigned block_size_x, const unsigned grid_size_x)
 
void copyGroupByBuffersFromGpu (DeviceAllocator &device_allocator, const QueryMemoryDescriptor &query_mem_desc, const size_t entry_count, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit *ra_exe_unit, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer) const
 

Private Member Functions

void initGroupByBuffer (int64_t *buffer, const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, TargetAggOpsMetadata &agg_expr_metadata, const ExecutorDeviceType device_type, const bool output_columnar, const Executor *executor)
 
void initRowGroups (const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, TargetAggOpsMetadata &agg_expr_metadata, const int32_t groups_buffer_entry_count, const size_t warp_size, const Executor *executor, const RelAlgExecutionUnit &ra_exe_unit)
 
void initColumnarGroups (const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const Executor *executor, const RelAlgExecutionUnit &ra_exe_unit)
 
void initColumnsPerRow (const QueryMemoryDescriptor &query_mem_desc, int8_t *row_ptr, const std::vector< int64_t > &init_vals, const TargetAggOpsMetadata &agg_op_metadata)
 
void allocateCountDistinctGpuMem (const QueryMemoryDescriptor &query_mem_desc)
 
std::vector< int64_t > calculateCountDistinctBufferSize (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit) const
 
void allocateCountDistinctBuffers (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
 
int64_t allocateCountDistinctBitmap (const size_t bitmap_byte_sz)
 
int64_t allocateCountDistinctSet ()
 
ModeIndexSet initializeModeIndexSet (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
 
void allocateModeBuffer (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
 
std::vector< QuantileParam > initializeQuantileParams (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
 
void allocateTDigestsBuffer (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
 
GpuGroupByBuffers prepareTopNHeapsDevBuffer (const QueryMemoryDescriptor &query_mem_desc, const int8_t *init_agg_vals_dev_ptr, const size_t n, const int device_id, const unsigned block_size_x, const unsigned grid_size_x)
 
GpuGroupByBuffers createAndInitializeGroupByBufferGpu (const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int8_t *init_agg_vals_dev_ptr, const int device_id, const ExecutorDispatchMode dispatch_mode, const unsigned block_size_x, const unsigned grid_size_x, const int8_t warp_size, const bool can_sort_on_gpu, const bool output_columnar, RenderAllocator *render_allocator)
 
size_t computeNumberOfBuffers (const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const Executor *executor) const
 
void compactProjectionBuffersCpu (const QueryMemoryDescriptor &query_mem_desc, const size_t projection_count)
 
void compactProjectionBuffersGpu (const QueryMemoryDescriptor &query_mem_desc, Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_group_by_buffers, const size_t projection_count, const int device_id)
 
void applyStreamingTopNOffsetCpu (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
 
void applyStreamingTopNOffsetGpu (Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit &ra_exe_unit, const unsigned total_thread_count, const int device_id)
 
std::shared_ptr< VarlenOutputInfo > getVarlenOutputInfo ()
 

Private Attributes

const int64_t num_rows_
 
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
 
std::vector< std::unique_ptr< ResultSet > > result_sets_
 
std::vector< int64_t > init_agg_vals_
 
size_t num_buffers_
 
std::vector< int64_t * > group_by_buffers_
 
std::shared_ptr< VarlenOutputInfo > varlen_output_info_
 
CUdeviceptr varlen_output_buffer_
 
int8_t * varlen_output_buffer_host_ptr_
 
CUdeviceptr count_distinct_bitmap_device_mem_ptr_
 
size_t count_distinct_bitmap_mem_size_
 
int8_t * count_distinct_bitmap_host_crt_ptr_
 
int8_t * count_distinct_bitmap_host_mem_ptr_
 
DeviceAllocator * device_allocator_ {nullptr}
 
std::vector< Data_Namespace::AbstractBuffer * > temporary_buffers_
 
const size_t thread_idx_
 

Friends

class Executor
 
class QueryExecutionContext
 

Detailed Description

Definition at line 35 of file QueryMemoryInitializer.h.

Member Typedef Documentation

using QueryMemoryInitializer::ModeIndexSet = robin_hood::unordered_set<size_t>

Definition at line 37 of file QueryMemoryInitializer.h.

using QueryMemoryInitializer::QuantileParam = std::optional<double>

Definition at line 38 of file QueryMemoryInitializer.h.

Constructor & Destructor Documentation

QueryMemoryInitializer::QueryMemoryInitializer ( const RelAlgExecutionUnit &  ra_exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const int  device_id,
const ExecutorDeviceType  device_type,
const ExecutorDispatchMode  dispatch_mode,
const bool  output_columnar,
const bool  sort_on_gpu,
const shared::TableKey &  outer_table_key,
const int64_t  num_rows,
const std::vector< std::vector< const int8_t * >> &  col_buffers,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
RenderAllocatorMap *  render_allocator_map,
RenderInfo *  render_info,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
DeviceAllocator *  gpu_allocator,
const size_t  thread_idx,
const Executor *  executor 
)

Definition at line 216 of file QueryMemoryInitializer.cpp.

References anonymous_namespace{QueryMemoryInitializer.cpp}::alloc_group_by_buffer(), allocateCountDistinctBuffers(), allocateCountDistinctGpuMem(), allocateModeBuffer(), allocateTDigestsBuffer(), calculateCountDistinctBufferSize(), CHECK, anonymous_namespace{QueryMemoryInitializer.cpp}::check_count_distinct_expr_metadata(), CHECK_EQ, CHECK_GE, anonymous_namespace{QueryMemoryInitializer.cpp}::check_total_bitmap_memory(), anonymous_namespace{QueryMemoryInitializer.cpp}::collect_target_expr_metadata(), QueryMemoryInitializer::TargetAggOpsMetadata::count_distinct_buf_size, CPU, RelAlgExecutionUnit::estimator, ResultSet::fixupQueryMemoryDescriptor(), g_max_memory_allocation_size, anonymous_namespace{QueryMemoryInitializer.cpp}::get_col_frag_offsets(), anonymous_namespace{QueryMemoryInitializer.cpp}::get_consistent_frags_sizes(), anonymous_namespace{QueryMemoryInitializer.cpp}::get_input_idx(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), getVarlenOutputInfo(), GPU, group_by_buffers_, QueryMemoryInitializer::TargetAggOpsMetadata::has_count_distinct, QueryMemoryInitializer::TargetAggOpsMetadata::has_mode, QueryMemoryInitializer::TargetAggOpsMetadata::has_tdigest, QueryMemoryDescriptor::hasKeylessHash(), QueryMemoryDescriptor::hasVarlenOutput(), initGroupByBuffer(), initializeModeIndexSet(), initializeQuantileParams(), QueryMemoryDescriptor::interleavedBins(), QueryMemoryDescriptor::isGroupBy(), KernelPerFragment, QueryMemoryDescriptor::lazyInitGroups(), QueryMemoryInitializer::TargetAggOpsMetadata::mode_index_set, num_buffers_, QueryMemoryInitializer::TargetAggOpsMetadata::qualtile_params, result_sets_, row_set_mem_owner_, RelAlgExecutionUnit::target_exprs, target_exprs_to_infos(), RelAlgExecutionUnit::target_exprs_union, thread_idx_, QueryMemoryDescriptor::threadsCanReuseGroupByBuffers(), QueryMemoryDescriptor::threadsShareMemory(), RelAlgExecutionUnit::union_all, RelAlgExecutionUnit::use_bump_allocator, RenderInfo::useCudaBuffers(), and QueryMemoryDescriptor::varlenOutputBufferElemSize().

    : num_rows_(num_rows)
    , row_set_mem_owner_(row_set_mem_owner)
    , init_agg_vals_(executor->plan_state_->init_agg_vals_)
    , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
    , varlen_output_buffer_(0)
    , varlen_output_buffer_host_ptr_(nullptr)
    , count_distinct_bitmap_device_mem_ptr_(0)
    , count_distinct_bitmap_mem_size_(0)
    , count_distinct_bitmap_host_crt_ptr_(nullptr)
    , count_distinct_bitmap_host_mem_ptr_(nullptr)
    , device_allocator_(device_allocator)
    , thread_idx_(thread_idx) {
  CHECK(!sort_on_gpu || output_columnar);
  executor->logSystemCPUMemoryStatus("Before Query Memory Initialization", thread_idx);

  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
  if (consistent_frag_sizes.empty()) {
    // No fragments in the input, no underlying buffers will be needed.
    return;
  }

  TargetAggOpsMetadata agg_op_metadata =
      collect_target_expr_metadata(query_mem_desc, ra_exe_unit);
  if (agg_op_metadata.has_count_distinct) {
    check_count_distinct_expr_metadata(query_mem_desc, ra_exe_unit);
    if (!ra_exe_unit.use_bump_allocator) {
      check_total_bitmap_memory(query_mem_desc);
    }
    if (device_type == ExecutorDeviceType::GPU) {
      allocateCountDistinctGpuMem(query_mem_desc);
    }
  }

  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
    if (agg_op_metadata.has_count_distinct) {
      allocateCountDistinctBuffers(query_mem_desc, ra_exe_unit);
    }
    if (agg_op_metadata.has_mode) {
      allocateModeBuffer(query_mem_desc, ra_exe_unit);
    }
    if (agg_op_metadata.has_tdigest) {
      allocateTDigestsBuffer(query_mem_desc, ra_exe_unit);
    }
    if (render_info && render_info->useCudaBuffers()) {
      return;
    }
  }

  if (query_mem_desc.isGroupBy()) {
    if (agg_op_metadata.has_count_distinct) {
      agg_op_metadata.count_distinct_buf_size =
          calculateCountDistinctBufferSize(query_mem_desc, ra_exe_unit);
    }
    if (agg_op_metadata.has_mode) {
      agg_op_metadata.mode_index_set =
          initializeModeIndexSet(query_mem_desc, ra_exe_unit);
    }
    if (agg_op_metadata.has_tdigest) {
      agg_op_metadata.qualtile_params =
          initializeQuantileParams(query_mem_desc, ra_exe_unit);
    }
  }

  if (ra_exe_unit.estimator) {
    return;
  }

  const auto thread_count = device_type == ExecutorDeviceType::GPU
                                ? executor->blockSize() * executor->gridSize()
                                : 1;

  size_t group_buffer_size{0};
  if (ra_exe_unit.use_bump_allocator) {
    // For kernel-per-fragment execution, just allocate a buffer equivalent to
    // the size of the fragment.
    if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
      group_buffer_size = num_rows * query_mem_desc.getRowSize();
    } else {
      // Otherwise, allocate a GPU buffer equivalent to the maximum GPU
      // allocation size.
      group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
    }
  } else {
    group_buffer_size =
        query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
  }
  CHECK_GE(group_buffer_size, size_t(0));

  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
  int64_t* group_by_buffer_template{nullptr};
  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
    group_by_buffer_template = reinterpret_cast<int64_t*>(
        row_set_mem_owner_->allocate(group_buffer_size, thread_idx_));
    initGroupByBuffer(group_by_buffer_template,
                      ra_exe_unit,
                      query_mem_desc,
                      agg_op_metadata,
                      device_type,
                      output_columnar,
                      executor);
  }

  if (query_mem_desc.interleavedBins(device_type)) {
    CHECK(query_mem_desc.hasKeylessHash());
  }

  const auto step = device_type == ExecutorDeviceType::GPU &&
                            query_mem_desc.threadsShareMemory() &&
                            query_mem_desc.isGroupBy()
                        ? executor->blockSize()
                        : size_t(1);
  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
                                       query_mem_desc.hasKeylessHash()
                                   ? query_mem_desc.getEntryCount()
                                   : size_t(0);
  const auto actual_group_buffer_size =
      group_buffer_size + index_buffer_qw * sizeof(int64_t);
  CHECK_GE(actual_group_buffer_size, group_buffer_size);

  if (query_mem_desc.hasVarlenOutput()) {
    const auto varlen_buffer_elem_size_opt = query_mem_desc.varlenOutputBufferElemSize();
    CHECK(varlen_buffer_elem_size_opt);  // TODO(adb): relax
    auto const varlen_buffer_sz =
        query_mem_desc.getEntryCount() * varlen_buffer_elem_size_opt.value();
    auto varlen_output_buffer =
        reinterpret_cast<int64_t*>(row_set_mem_owner_->allocate(varlen_buffer_sz));
    num_buffers_ += 1;
    group_by_buffers_.push_back(varlen_output_buffer);
  }

  if (query_mem_desc.threadsCanReuseGroupByBuffers()) {
    // Sanity checks: intra-thread buffer reuse should only occur on CPU for
    // group-by queries, which also means that only one group-by buffer should
    // be allocated (multiple-buffer allocation only occurs for GPU).
    CHECK(device_type == ExecutorDeviceType::CPU);
    CHECK(query_mem_desc.isGroupBy());
    CHECK_EQ(group_buffers_count, size_t(1));
  }

  // Group-by buffer reuse assumes one group-by buffer per query step.
  // Multiple group-by buffers should only be used on GPU, whereas buffer
  // reuse is only done on CPU.
  CHECK(group_buffers_count <= 1 || !query_mem_desc.threadsCanReuseGroupByBuffers());
  for (size_t i = 0; i < group_buffers_count; i += step) {
    auto group_by_info =
        alloc_group_by_buffer(actual_group_buffer_size,
                              render_allocator_map,
                              thread_idx_,
                              row_set_mem_owner_.get(),
                              query_mem_desc.threadsCanReuseGroupByBuffers());

    auto group_by_buffer = group_by_info.first;
    const bool was_cached = group_by_info.second;
    if (!was_cached) {
      if (!query_mem_desc.lazyInitGroups(device_type)) {
        if (group_by_buffer_template) {
          memcpy(group_by_buffer + index_buffer_qw,
                 group_by_buffer_template,
                 group_buffer_size);
        } else {
          initGroupByBuffer(group_by_buffer + index_buffer_qw,
                            ra_exe_unit,
                            query_mem_desc,
                            agg_op_metadata,
                            device_type,
                            output_columnar,
                            executor);
        }
      }
    }

    size_t old_size = group_by_buffers_.size();
    group_by_buffers_.resize(old_size + std::max(size_t(1), step), nullptr);
    group_by_buffers_[old_size] = group_by_buffer;

    const bool use_target_exprs_union =
        ra_exe_unit.union_all && get_input_idx(ra_exe_unit, outer_table_key);
    const auto& target_exprs = use_target_exprs_union ? ra_exe_unit.target_exprs_union
                                                      : ra_exe_unit.target_exprs;
    const auto column_frag_offsets = get_col_frag_offsets(target_exprs, frag_offsets);
    const auto column_frag_sizes =
        get_consistent_frags_sizes(target_exprs, consistent_frag_sizes);

    old_size = result_sets_.size();
    result_sets_.resize(old_size + std::max(size_t(1), step));
    result_sets_[old_size] =
        std::make_unique<ResultSet>(target_exprs_to_infos(target_exprs, query_mem_desc),
                                    executor->getColLazyFetchInfo(target_exprs),
                                    col_buffers,
                                    column_frag_offsets,
                                    column_frag_sizes,
                                    device_type,
                                    device_id,
                                    thread_idx,
                                    ResultSet::fixupQueryMemoryDescriptor(query_mem_desc),
                                    row_set_mem_owner_,
                                    executor->blockSize(),
                                    executor->gridSize());
    result_sets_[old_size]->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
                                            executor->plan_state_->init_agg_vals_,
                                            getVarlenOutputInfo());
  }
}
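The buffer sizing above takes one of three paths: fragment-sized for kernel-per-fragment bump allocation, maximum-allocation-sized otherwise, or the size computed by the query memory descriptor. A minimal standalone sketch of that decision, with plain scalars assumed in place of the real RelAlgExecutionUnit and QueryMemoryDescriptor inputs:

  #include <cstddef>

  // Hedged sketch of the group-buffer sizing logic above; all parameters are
  // stand-ins for values the constructor reads from ra_exe_unit,
  // dispatch_mode, and query_mem_desc.
  size_t compute_group_buffer_size(const bool use_bump_allocator,
                                   const bool kernel_per_fragment,
                                   const size_t num_rows,
                                   const size_t row_size,
                                   const size_t max_alloc_size,
                                   const size_t descriptor_buffer_bytes) {
    if (use_bump_allocator) {
      if (kernel_per_fragment) {
        // Buffer sized to the fragment itself.
        return num_rows * row_size;
      }
      // Otherwise scale the buffer to the maximum GPU allocation size,
      // mirroring the division by row size in the source above.
      return max_alloc_size / row_size;
    }
    // Default: the size precomputed by the query memory descriptor.
    return descriptor_buffer_bytes;
  }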


QueryMemoryInitializer::QueryMemoryInitializer ( const TableFunctionExecutionUnit &  exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const int  device_id,
const ExecutorDeviceType  device_type,
const int64_t  num_rows,
const std::vector< std::vector< const int8_t * >> &  col_buffers,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
DeviceAllocator *  device_allocator,
const Executor *  executor 
)

Definition at line 439 of file QueryMemoryInitializer.cpp.

    : num_rows_(num_rows)
    , row_set_mem_owner_(row_set_mem_owner)
    // ... (remainder of the initializer list and constructor body elided in
    // the source listing)

Member Function Documentation

int64_t QueryMemoryInitializer::allocateCountDistinctBitmap ( const size_t  bitmap_byte_sz)
private

Definition at line 883 of file QueryMemoryInitializer.cpp.

References CHECK, count_distinct_bitmap_host_crt_ptr_, count_distinct_bitmap_host_mem_ptr_, row_set_mem_owner_, and thread_idx_.

Referenced by allocateCountDistinctBuffers(), and initColumnsPerRow().

{
  if (count_distinct_bitmap_host_crt_ptr_) {
    CHECK(count_distinct_bitmap_host_mem_ptr_);
    auto ptr = count_distinct_bitmap_host_crt_ptr_;
    count_distinct_bitmap_host_crt_ptr_ += bitmap_byte_sz;
    row_set_mem_owner_->addCountDistinctBuffer(
        ptr, bitmap_byte_sz, /*physial_buffer=*/false);
    return reinterpret_cast<int64_t>(ptr);
  }
  return reinterpret_cast<int64_t>(
      row_set_mem_owner_->allocateCountDistinctBuffer(bitmap_byte_sz, thread_idx_));
}

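The first branch above is a simple bump-pointer sub-allocation out of the host-side bitmap pool created by allocateCountDistinctGpuMem(); only when no pool exists does it fall back to a per-bitmap allocation. A minimal sketch of the bump-pointer pattern, with the member cursor modeled as a reference parameter (hypothetical standalone form):

  #include <cstddef>
  #include <cstdint>

  // Hand out the current position in a preallocated pool and advance the
  // cursor past this bitmap; the pool itself is owned elsewhere.
  int8_t* bump_allocate(int8_t*& crt_ptr, const size_t bitmap_byte_sz) {
    int8_t* const ptr = crt_ptr;
    crt_ptr += bitmap_byte_sz;
    return ptr;
  }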

void QueryMemoryInitializer::allocateCountDistinctBuffers ( const QueryMemoryDescriptor &  query_mem_desc,
const RelAlgExecutionUnit &  ra_exe_unit 
)
private

Definition at line 861 of file QueryMemoryInitializer.cpp.

References allocateCountDistinctBitmap(), allocateCountDistinctSet(), Bitmap, CountDistinctDescriptor::bitmapPaddedSizeBytes(), CHECK, g_bigint_count, get_target_info(), QueryMemoryDescriptor::getCountDistinctDescriptor(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), init_agg_vals_, is_distinct_target(), RelAlgExecutionUnit::target_exprs, and UnorderedSet.

Referenced by QueryMemoryInitializer().

{
  for (size_t target_idx = 0; target_idx < ra_exe_unit.target_exprs.size();
       ++target_idx) {
    const auto target_expr = ra_exe_unit.target_exprs[target_idx];
    const auto agg_info = get_target_info(target_expr, g_bigint_count);
    if (is_distinct_target(agg_info)) {
      const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
      const auto& count_distinct_desc =
          query_mem_desc.getCountDistinctDescriptor(target_idx);
      if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
        const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
        init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
      } else {
        CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::UnorderedSet);
        init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
      }
    }
  }
}


void QueryMemoryInitializer::allocateCountDistinctGpuMem ( const QueryMemoryDescriptor &  query_mem_desc)
private

Definition at line 806 of file QueryMemoryInitializer.cpp.

References Allocator::alloc(), Bitmap, CHECK, count_distinct_bitmap_device_mem_ptr_, count_distinct_bitmap_host_crt_ptr_, count_distinct_bitmap_host_mem_ptr_, count_distinct_bitmap_mem_size_, QueryMemoryDescriptor::countDistinctDescriptorsLogicallyEmpty(), device_allocator_, QueryMemoryDescriptor::getCountDistinctDescriptor(), QueryMemoryDescriptor::getCountDistinctDescriptorsSize(), QueryMemoryDescriptor::getEntryCount(), Invalid, row_set_mem_owner_, thread_idx_, and DeviceAllocator::zeroDeviceMem().

Referenced by QueryMemoryInitializer().

{
  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
    return;
  }
  CHECK(device_allocator_);

  size_t total_bytes_per_entry{0};
  const size_t num_count_distinct_descs =
      query_mem_desc.getCountDistinctDescriptorsSize();
  for (size_t i = 0; i < num_count_distinct_descs; i++) {
    const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
    if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
      continue;
    }
    CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
    total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
  }

  count_distinct_bitmap_mem_size_ =
      total_bytes_per_entry * query_mem_desc.getEntryCount();
  count_distinct_bitmap_device_mem_ptr_ = reinterpret_cast<CUdeviceptr>(
      device_allocator_->alloc(count_distinct_bitmap_mem_size_));
  device_allocator_->zeroDeviceMem(
      reinterpret_cast<int8_t*>(count_distinct_bitmap_device_mem_ptr_),
      count_distinct_bitmap_mem_size_);
  count_distinct_bitmap_host_crt_ptr_ = count_distinct_bitmap_host_mem_ptr_ =
      row_set_mem_owner_->allocateCountDistinctBuffer(count_distinct_bitmap_mem_size_,
                                                      thread_idx_);
}

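The pool is sized as the sum of the padded bitmap widths of all bitmap-based COUNT(DISTINCT) targets, multiplied by the hash-table entry count. A worked example with hypothetical sizes:

  #include <cstddef>

  // Illustrative sizing (numbers are hypothetical): two bitmap targets of
  // 512 and 1024 padded bytes over a 10,000-entry group-by table.
  constexpr size_t total_bytes_per_entry = 512 + 1024;  // 1536
  constexpr size_t entry_count = 10000;
  constexpr size_t bitmap_mem_size =
      total_bytes_per_entry * entry_count;               // 15,360,000 bytes
  static_assert(bitmap_mem_size == 15360000, "worked example");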

int64_t QueryMemoryInitializer::allocateCountDistinctSet ( )
private

Definition at line 896 of file QueryMemoryInitializer.cpp.

References row_set_mem_owner_.

Referenced by allocateCountDistinctBuffers(), and initColumnsPerRow().

{
  auto count_distinct_set = new CountDistinctSet();
  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
  return reinterpret_cast<int64_t>(count_distinct_set);
}


void QueryMemoryInitializer::allocateModeBuffer ( const QueryMemoryDescriptor &  query_mem_desc,
const RelAlgExecutionUnit &  ra_exe_unit 
)
private

Definition at line 938 of file QueryMemoryInitializer.cpp.

References CHECK_LE, CHECK_LT, anonymous_namespace{QueryMemoryInitializer.cpp}::eachAggregateTargetIdxOfType(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), init_agg_vals_, kMODE, row_set_mem_owner_, and RelAlgExecutionUnit::target_exprs.

Referenced by QueryMemoryInitializer().

{
  size_t const slot_count = query_mem_desc.getSlotCount();
  CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count);
  eachAggregateTargetIdxOfType(
      ra_exe_unit.target_exprs,
      kMODE,
      [&](Analyzer::AggExpr const*, size_t const target_idx) {
        size_t const agg_col_idx =
            query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
        CHECK_LT(agg_col_idx, slot_count);
        AggMode* agg_mode = row_set_mem_owner_->allocateMode();
        init_agg_vals_[agg_col_idx] = reinterpret_cast<int64_t>(agg_mode);
      });
}

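eachAggregateTargetIdxOfType() drives both allocateModeBuffer() above and allocateTDigestsBuffer() below: it scans the target expressions and invokes the given lambda with each aggregate of the requested kind plus its target index. A generic sketch of that dispatch shape (the kind() accessor is a stand-in; the real helper inspects Analyzer::AggExpr nodes):

  #include <cstddef>
  #include <vector>

  // Generic per-target callback dispatch, assumed shape only.
  template <typename Expr, typename Kind, typename F>
  void each_target_of_type(const std::vector<Expr*>& targets,
                           const Kind wanted,
                           F&& fn) {
    for (size_t target_idx = 0; target_idx < targets.size(); ++target_idx) {
      if (targets[target_idx] && targets[target_idx]->kind() == wanted) {
        fn(targets[target_idx], target_idx);  // expression + its target index
      }
    }
  }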

void QueryMemoryInitializer::allocateTDigestsBuffer ( const QueryMemoryDescriptor &  query_mem_desc,
const RelAlgExecutionUnit &  ra_exe_unit 
)
private

Definition at line 979 of file QueryMemoryInitializer.cpp.

References CHECK, CHECK_EQ, CHECK_LE, CHECK_LT, anonymous_namespace{QueryMemoryInitializer.cpp}::eachAggregateTargetIdxOfType(), Analyzer::AggExpr::get_arg1(), QueryMemoryDescriptor::getLogicalSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), init_agg_vals_, kAPPROX_QUANTILE, row_set_mem_owner_, and RelAlgExecutionUnit::target_exprs.

Referenced by QueryMemoryInitializer().

{
  size_t const slot_count = query_mem_desc.getSlotCount();
  CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count);
  eachAggregateTargetIdxOfType(
      ra_exe_unit.target_exprs,
      kAPPROX_QUANTILE,
      [&](Analyzer::AggExpr const* const agg_expr, size_t const target_idx) {
        size_t const agg_col_idx =
            query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
        CHECK_LT(agg_col_idx, slot_count);
        CHECK_EQ(static_cast<int8_t>(sizeof(int64_t)),
                 query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx));
        auto const q_expr =
            dynamic_cast<Analyzer::Constant const*>(agg_expr->get_arg1().get());
        CHECK(q_expr);
        auto const q = q_expr->get_constval().doubleval;
        // Allocate for APPROX_QUANTILE only when the slot is used.
        init_agg_vals_[agg_col_idx] =
            reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest(q));
      });
}


void QueryMemoryInitializer::applyStreamingTopNOffsetCpu ( const QueryMemoryDescriptor &  query_mem_desc,
const RelAlgExecutionUnit &  ra_exe_unit 
)
private

Definition at line 1370 of file QueryMemoryInitializer.cpp.

References CHECK_EQ, CPU, streaming_top_n::get_rows_copy_from_heaps(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), SortInfo::limit, SortInfo::offset, and RelAlgExecutionUnit::sort_info.

{
  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
  CHECK_EQ(group_by_buffers_.size(), buffer_start_idx + 1);

  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
      group_by_buffers_[buffer_start_idx],
      query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
      ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit.value_or(0),
      1);
  CHECK_EQ(rows_copy.size(),
           query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
}


void QueryMemoryInitializer::applyStreamingTopNOffsetGpu ( Data_Namespace::DataMgr *  data_mgr,
const QueryMemoryDescriptor &  query_mem_desc,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const RelAlgExecutionUnit &  ra_exe_unit,
const unsigned  total_thread_count,
const int  device_id 
)
private

Definition at line 1386 of file QueryMemoryInitializer.cpp.

References CHECK_EQ, GpuGroupByBuffers::data, QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), num_buffers_, and UNREACHABLE.

{
#ifdef HAVE_CUDA
  // ... (sanity check involving num_buffers_ elided in the source listing)
  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;

  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
      data_mgr,
      reinterpret_cast<int64_t*>(gpu_group_by_buffers.data),
      ra_exe_unit,
      query_mem_desc,
      total_thread_count,
      device_id);
  CHECK_EQ(
      rows_copy.size(),
      static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
#else
  UNREACHABLE();
#endif
}


std::vector< int64_t > QueryMemoryInitializer::calculateCountDistinctBufferSize ( const QueryMemoryDescriptor &  query_mem_desc,
const RelAlgExecutionUnit &  ra_exe_unit 
) const
private

Definition at line 836 of file QueryMemoryInitializer.cpp.

References Bitmap, CountDistinctDescriptor::bitmapPaddedSizeBytes(), CHECK, g_bigint_count, get_target_info(), QueryMemoryDescriptor::getCountDistinctDescriptor(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), is_distinct_target(), RelAlgExecutionUnit::target_exprs, and UnorderedSet.

Referenced by QueryMemoryInitializer().

{
  const size_t agg_col_count{query_mem_desc.getSlotCount()};
  std::vector<int64_t> agg_bitmap_size(agg_col_count);
  for (size_t target_idx = 0; target_idx < ra_exe_unit.target_exprs.size();
       ++target_idx) {
    const auto target_expr = ra_exe_unit.target_exprs[target_idx];
    const auto agg_info = get_target_info(target_expr, g_bigint_count);
    if (is_distinct_target(agg_info)) {
      const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
      const auto& count_distinct_desc =
          query_mem_desc.getCountDistinctDescriptor(target_idx);
      if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
        const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
        agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
      } else {
        CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::UnorderedSet);
        agg_bitmap_size[agg_col_idx] = -1;
      }
    }
  }
  return agg_bitmap_size;
}


void QueryMemoryInitializer::compactProjectionBuffersCpu ( const QueryMemoryDescriptor &  query_mem_desc,
const size_t  projection_count 
)
private

Definition at line 1293 of file QueryMemoryInitializer.cpp.

References CHECK, anonymous_namespace{QueryMemoryInitializer.cpp}::compact_projection_buffer_for_cpu_columnar(), QueryMemoryDescriptor::getEntryCount(), group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), and result_sets_.

{
  const auto num_allocated_rows =
      std::min(projection_count, query_mem_desc.getEntryCount());
  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;

  // Copy the results from the main buffer into projection_buffer.
  compact_projection_buffer_for_cpu_columnar(
      query_mem_desc,
      reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
      num_allocated_rows);

  // Update the entry count for the result set, and its underlying storage.
  CHECK(!result_sets_.empty());
  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
}


void QueryMemoryInitializer::compactProjectionBuffersGpu ( const QueryMemoryDescriptor &  query_mem_desc,
Data_Namespace::DataMgr *  data_mgr,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const size_t  projection_count,
const int  device_id 
)
private

Definition at line 1311 of file QueryMemoryInitializer.cpp.

References CHECK, copy_projection_buffer_from_gpu_columnar(), QueryMemoryDescriptor::getEntryCount(), group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), and result_sets_.

{
  // Store the total number of allocated rows.
  const auto num_allocated_rows =
      std::min(projection_count, query_mem_desc.getEntryCount());

  // Copy the results from the main buffer into projection_buffer.
  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
  copy_projection_buffer_from_gpu_columnar(
      data_mgr,
      gpu_group_by_buffers,
      query_mem_desc,
      reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
      num_allocated_rows,
      device_id);

  // Update the entry count for the result set, and its underlying storage.
  CHECK(!result_sets_.empty());
  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
}
1334 }
std::vector< int64_t * > group_by_buffers_
#define CHECK(condition)
Definition: Logger.h:291
void copy_projection_buffer_from_gpu_columnar(Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_group_by_buffers, const QueryMemoryDescriptor &query_mem_desc, int8_t *projection_buffer, const size_t projection_count, const int device_id)
std::vector< std::unique_ptr< ResultSet > > result_sets_


size_t QueryMemoryInitializer::computeNumberOfBuffers ( const QueryMemoryDescriptor &  query_mem_desc,
const ExecutorDeviceType  device_type,
const Executor *  executor 
) const
private

Definition at line 1249 of file QueryMemoryInitializer.cpp.

References QueryMemoryDescriptor::blocksShareMemory(), and CPU.

{
  return device_type == ExecutorDeviceType::CPU
             ? 1
             : executor->blockSize() *
                   (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
}

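In other words, CPU execution always uses a single buffer, while GPU execution uses one buffer per block when blocks share memory and one per thread otherwise. A worked example with hypothetical launch dimensions:

  #include <cstddef>

  // Worked example (hypothetical launch dimensions, not from the source).
  constexpr size_t block_size = 256;
  constexpr size_t grid_size = 16;
  constexpr bool blocks_share_memory = false;
  constexpr size_t num_buffers =
      block_size * (blocks_share_memory ? 1 : grid_size);  // 4096
  static_assert(num_buffers == 4096, "one buffer per GPU thread");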

void QueryMemoryInitializer::copyFromTableFunctionGpuBuffers ( Data_Namespace::DataMgr *  data_mgr,
const QueryMemoryDescriptor &  query_mem_desc,
const size_t  entry_count,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const int  device_id,
const unsigned  block_size_x,
const unsigned  grid_size_x 
)

Definition at line 1212 of file QueryMemoryInitializer.cpp.

References align_to_int64(), CHECK_LE, GpuGroupByBuffers::data, GpuGroupByBuffers::entry_count, QueryMemoryDescriptor::getBufferColSlotCount(), QueryMemoryDescriptor::getColSlotContext(), getQueryEngineCudaStreamForDevice(), ColSlotContext::getSlotInfo(), group_by_buffers_, and SlotSize::logical_size.

{
  const size_t num_columns = query_mem_desc.getBufferColSlotCount();

  int8_t* dev_buffer = gpu_group_by_buffers.data;
  int8_t* host_buffer = reinterpret_cast<int8_t*>(group_by_buffers_[0]);

  const size_t original_entry_count = gpu_group_by_buffers.entry_count;
  CHECK_LE(entry_count, original_entry_count);
  size_t output_device_col_offset{0};
  size_t output_host_col_offset{0};

  const auto col_slot_context = query_mem_desc.getColSlotContext();

  auto allocator = std::make_unique<CudaAllocator>(
      data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));

  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
    const size_t col_width = col_slot_context.getSlotInfo(col_idx).logical_size;
    const size_t output_device_col_size = original_entry_count * col_width;
    const size_t output_host_col_size = entry_count * col_width;
    allocator->copyFromDevice(host_buffer + output_host_col_offset,
                              dev_buffer + output_device_col_offset,
                              output_host_col_size);
    output_device_col_offset =
        align_to_int64(output_device_col_offset + output_device_col_size);
    output_host_col_offset =
        align_to_int64(output_host_col_offset + output_host_col_size);
  }
}

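Both the device and host copies lay columns out back to back, with each column start rounded up to an 8-byte boundary. A sketch of that rounding, assuming the same semantics as the align_to_int64() helper referenced above:

  #include <cstddef>

  // Round an offset up to the next multiple of sizeof(int64_t) == 8.
  constexpr size_t align_to_int64(const size_t addr) {
    return (addr + 7) & ~size_t(7);
  }

  static_assert(align_to_int64(0) == 0, "already aligned");
  static_assert(align_to_int64(10001) == 10008, "e.g. 10,001 bytes of int8 data");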

void QueryMemoryInitializer::copyGroupByBuffersFromGpu ( DeviceAllocator &  device_allocator,
const QueryMemoryDescriptor &  query_mem_desc,
const size_t  entry_count,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const RelAlgExecutionUnit *  ra_exe_unit,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const int  device_id,
const bool  prepend_index_buffer 
) const

Definition at line 1336 of file QueryMemoryInitializer.cpp.

References copy_group_by_buffers_from_gpu(), GpuGroupByBuffers::data, streaming_top_n::get_heap_size(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getRowSize(), GPU, group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), SortInfo::limit, anonymous_namespace{Utm.h}::n, SortInfo::offset, RelAlgExecutionUnit::sort_info, and QueryMemoryDescriptor::useStreamingTopN().

{
  const auto thread_count = block_size_x * grid_size_x;

  size_t total_buff_size{0};
  if (ra_exe_unit && query_mem_desc.useStreamingTopN()) {
    const size_t n =
        ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit.value_or(0);
    total_buff_size =
        streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
  } else {
    total_buff_size =
        query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
  }
  copy_group_by_buffers_from_gpu(device_allocator,
                                 group_by_buffers_,
                                 total_buff_size,
                                 gpu_group_by_buffers.data,
                                 query_mem_desc,
                                 block_size_x,
                                 grid_size_x,
                                 device_id,
                                 prepend_index_buffer,
                                 query_mem_desc.hasVarlenOutput());
}

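For streaming top-n results, the copy-back size is derived from the heap bound n = offset + limit, where the LIMIT clause is optional and contributes zero when absent. A minimal sketch of that bound:

  #include <cstddef>
  #include <optional>

  // n rows must survive per heap: e.g. LIMIT 10 OFFSET 5 keeps n = 15.
  size_t top_n_bound(const size_t offset, const std::optional<size_t> limit) {
    return offset + limit.value_or(0);
  }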

GpuGroupByBuffers QueryMemoryInitializer::createAndInitializeGroupByBufferGpu ( const RelAlgExecutionUnit &  ra_exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const int8_t *  init_agg_vals_dev_ptr,
const int  device_id,
const ExecutorDispatchMode  dispatch_mode,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const int8_t  warp_size,
const bool  can_sort_on_gpu,
const bool  output_columnar,
RenderAllocator *  render_allocator 
)
private

Definition at line 1057 of file QueryMemoryInitializer.cpp.

References Allocator::alloc(), CHECK, CHECK_EQ, DeviceAllocator::copyToDevice(), create_dev_group_by_buffers(), device_allocator_, RenderAllocator::getAllocatedSize(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEffectiveKeyWidth(), QueryMemoryDescriptor::getEntryCount(), getGroupByBuffersSize(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getRowSize(), QueryMemoryDescriptor::getSlotCount(), GPU, group_by_buffers_, QueryMemoryDescriptor::hasKeylessHash(), QueryMemoryDescriptor::hasVarlenOutput(), init_columnar_group_by_buffer_on_device(), init_group_by_buffer_on_device(), QueryMemoryDescriptor::interleavedBins(), QueryMemoryDescriptor::lazyInitGroups(), SortInfo::limit, anonymous_namespace{Utm.h}::n, num_rows_, SortInfo::offset, prepareTopNHeapsDevBuffer(), row_set_mem_owner_, RelAlgExecutionUnit::sort_info, thread_idx_, QueryMemoryDescriptor::threadsShareMemory(), UNREACHABLE, RelAlgExecutionUnit::use_bump_allocator, QueryMemoryDescriptor::useStreamingTopN(), varlen_output_buffer_, varlen_output_buffer_host_ptr_, varlen_output_info_, and QueryMemoryDescriptor::varlenOutputBufferElemSize().

{
#ifdef HAVE_CUDA
  if (query_mem_desc.useStreamingTopN()) {
    if (render_allocator) {
      // ... (error path elided in the source listing)
    }
    const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit.value_or(0);
    CHECK(!output_columnar);

    return prepareTopNHeapsDevBuffer(
        query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
  }

  auto dev_group_by_buffers =
      create_dev_group_by_buffers(device_allocator_,
                                  group_by_buffers_,
                                  query_mem_desc,
                                  block_size_x,
                                  grid_size_x,
                                  device_id,
                                  dispatch_mode,
                                  num_rows_,
                                  can_sort_on_gpu,
                                  false,
                                  ra_exe_unit.use_bump_allocator,
                                  query_mem_desc.hasVarlenOutput(),
                                  render_allocator);
  if (query_mem_desc.hasVarlenOutput()) {
    CHECK(dev_group_by_buffers.varlen_output_buffer);
    varlen_output_buffer_ =
        reinterpret_cast<CUdeviceptr>(dev_group_by_buffers.varlen_output_buffer);
    CHECK(query_mem_desc.varlenOutputBufferElemSize());
    const size_t varlen_output_buf_bytes =
        query_mem_desc.getEntryCount() *
        query_mem_desc.varlenOutputBufferElemSize().value();
    varlen_output_buffer_host_ptr_ =
        row_set_mem_owner_->allocate(varlen_output_buf_bytes, thread_idx_);
    CHECK(getVarlenOutputInfo());
    varlen_output_info_->gpu_start_address = static_cast<int64_t>(varlen_output_buffer_);
    // ... (one assignment elided in the source listing)
  }
  if (render_allocator) {
    CHECK_EQ(size_t(0), render_allocator->getAllocatedSize() % 8);
  }
  if (query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU)) {
    CHECK(!render_allocator);

    const size_t step{query_mem_desc.threadsShareMemory() ? block_size_x : 1};
    size_t groups_buffer_size{query_mem_desc.getBufferSizeBytes(
        ExecutorDeviceType::GPU, dev_group_by_buffers.entry_count)};
    auto group_by_dev_buffer = dev_group_by_buffers.data;
    const size_t col_count = query_mem_desc.getSlotCount();
    int8_t* col_widths_dev_ptr{nullptr};
    if (output_columnar) {
      std::vector<int8_t> compact_col_widths(col_count);
      for (size_t idx = 0; idx < col_count; ++idx) {
        compact_col_widths[idx] = query_mem_desc.getPaddedSlotWidthBytes(idx);
      }
      col_widths_dev_ptr = device_allocator_->alloc(col_count * sizeof(int8_t));
      device_allocator_->copyToDevice(
          col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
    }
    const int8_t warp_count =
        query_mem_desc.interleavedBins(ExecutorDeviceType::GPU) ? warp_size : 1;
    const auto num_group_by_buffers =
        getGroupByBuffersSize() - (query_mem_desc.hasVarlenOutput() ? 1 : 0);
    for (size_t i = 0; i < num_group_by_buffers; i += step) {
      if (output_columnar) {
        init_columnar_group_by_buffer_on_device(
            reinterpret_cast<int64_t*>(group_by_dev_buffer),
            reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
            dev_group_by_buffers.entry_count,
            query_mem_desc.getGroupbyColCount(),
            col_count,
            col_widths_dev_ptr,
            /*need_padding=*/true,
            query_mem_desc.hasKeylessHash(),
            sizeof(int64_t),
            block_size_x,
            grid_size_x);
      } else {
        init_group_by_buffer_on_device(
            reinterpret_cast<int64_t*>(group_by_dev_buffer),
            reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
            dev_group_by_buffers.entry_count,
            query_mem_desc.getGroupbyColCount(),
            query_mem_desc.getEffectiveKeyWidth(),
            query_mem_desc.getRowSize() / sizeof(int64_t),
            query_mem_desc.hasKeylessHash(),
            warp_count,
            block_size_x,
            grid_size_x);
      }
      group_by_dev_buffer += groups_buffer_size;
    }
  }
  return dev_group_by_buffers;
#else
  UNREACHABLE();
  return {};
#endif
}

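Note the initialization loop strides by step: when threads in a block share one output buffer, only every block_size_x-th logical buffer is distinct and needs initializing. A worked example of the stride arithmetic with hypothetical dimensions:

  #include <cstddef>

  // With 4096 logical buffers and block_size_x = 256 shared per block, only
  // 16 distinct buffers are initialized (hypothetical dimensions).
  constexpr size_t num_group_by_buffers = 4096;
  constexpr bool threads_share_memory = true;
  constexpr size_t block_size_x = 256;
  constexpr size_t step = threads_share_memory ? block_size_x : 1;
  constexpr size_t buffers_initialized =
      (num_group_by_buffers + step - 1) / step;  // 16
  static_assert(buffers_initialized == 16, "one init per block");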

int64_t QueryMemoryInitializer::getAggInitValForIndex ( const size_t  index) const
inline

Definition at line 111 of file QueryMemoryInitializer.h.

References CHECK_LT, and init_agg_vals_.

{
  CHECK_LT(index, init_agg_vals_.size());
  return init_agg_vals_[index];
}
const auto QueryMemoryInitializer::getCountDistinctBitmapBytes ( ) const
inline

Definition at line 87 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_mem_size_.

{
  return count_distinct_bitmap_mem_size_;
}
const auto QueryMemoryInitializer::getCountDistinctBitmapDevicePtr ( ) const
inline

Definition at line 79 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_device_mem_ptr_.

{
  return count_distinct_bitmap_device_mem_ptr_;
}
const auto QueryMemoryInitializer::getCountDistinctBitmapHostPtr ( ) const
inline

Definition at line 83 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_host_mem_ptr_.

{
  return count_distinct_bitmap_host_mem_ptr_;
}
const auto QueryMemoryInitializer::getGroupByBuffersPtr ( )
inline

Definition at line 116 of file QueryMemoryInitializer.h.

References group_by_buffers_.

{
  return reinterpret_cast<int64_t**>(group_by_buffers_.data());
}
const auto QueryMemoryInitializer::getGroupByBuffersSize ( ) const
inline

Definition at line 120 of file QueryMemoryInitializer.h.

References group_by_buffers_.

Referenced by createAndInitializeGroupByBufferGpu().

{ return group_by_buffers_.size(); }


const auto QueryMemoryInitializer::getNumBuffers ( ) const
inline

Definition at line 122 of file QueryMemoryInitializer.h.

References CHECK_EQ, group_by_buffers_, and num_buffers_.

{
  CHECK_EQ(group_by_buffers_.size(), num_buffers_);
  return num_buffers_;
}
ResultSet* QueryMemoryInitializer::getResultSet ( const size_t  index) const
inline

Definition at line 96 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

{
  CHECK_LT(index, result_sets_.size());
  return result_sets_[index].get();
}
std::unique_ptr<ResultSet> QueryMemoryInitializer::getResultSetOwned ( const size_t  index)
inline

Definition at line 101 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

{
  CHECK_LT(index, result_sets_.size());
  return std::move(result_sets_[index]);
}
const auto QueryMemoryInitializer::getVarlenOutputHostPtr ( ) const
inline

Definition at line 92 of file QueryMemoryInitializer.h.

References varlen_output_buffer_host_ptr_.

std::shared_ptr< VarlenOutputInfo > QueryMemoryInitializer::getVarlenOutputInfo ( )
private

Definition at line 1413 of file QueryMemoryInitializer.cpp.

References varlen_output_buffer_, varlen_output_buffer_host_ptr_, and varlen_output_info_.

Referenced by QueryMemoryInitializer().

1413  {
1414  if (varlen_output_info_) {
1415  return varlen_output_info_;
1416  }
1417 
1418  // shared_ptr so that both the ResultSet and QMI can hold on to the varlen info object
1419  // and update it as needed
1420  varlen_output_info_ = std::make_shared<VarlenOutputInfo>(VarlenOutputInfo{
1421  static_cast<int64_t>(varlen_output_buffer_), varlen_output_buffer_host_ptr_});
1422  return varlen_output_info_;
1423 }


const auto QueryMemoryInitializer::getVarlenOutputPtr ( ) const
inline

Definition at line 94 of file QueryMemoryInitializer.h.

References varlen_output_buffer_.

94 { return varlen_output_buffer_; }
void QueryMemoryInitializer::initColumnarGroups ( const QueryMemoryDescriptor query_mem_desc,
int64_t *  groups_buffer,
const std::vector< int64_t > &  init_vals,
const Executor executor,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 677 of file QueryMemoryInitializer.cpp.

References align_to_int64(), CHECK, CHECK_LT, EMPTY_KEY_64, g_bigint_count, get_target_info(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getQueryDescriptionType(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::hasKeylessHash(), is_distinct_target(), Projection, and RelAlgExecutionUnit::target_exprs.

Referenced by initGroupByBuffer().

682  {
683  CHECK(groups_buffer);
684 
685  for (const auto target_expr : ra_exe_unit.target_exprs) {
686  const auto agg_info = get_target_info(target_expr, g_bigint_count);
687  CHECK(!is_distinct_target(agg_info));
688  }
689  const int32_t agg_col_count = query_mem_desc.getSlotCount();
690  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
691 
692  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
693  if (!query_mem_desc.hasKeylessHash()) {
694  const size_t key_count{query_mem_desc.getGroupbyColCount()};
695  for (size_t i = 0; i < key_count; ++i) {
696  buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
697  EMPTY_KEY_64,
698  groups_buffer_entry_count);
699  }
700  }
701 
702   if (query_mem_desc.getQueryDescriptionType() != QueryDescriptionType::Projection) {
703   // initializing all aggregate columns:
704  int32_t init_val_idx = 0;
705  for (int32_t i = 0; i < agg_col_count; ++i) {
706  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
707  CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
708  switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
709  case 1:
710  buffer_ptr = initColumnarBuffer<int8_t>(
711  buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
712  break;
713  case 2:
714  buffer_ptr =
715  initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
716  init_vals[init_val_idx++],
717  groups_buffer_entry_count);
718  break;
719  case 4:
720  buffer_ptr =
721  initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
722  init_vals[init_val_idx++],
723  groups_buffer_entry_count);
724  break;
725  case 8:
726  buffer_ptr =
727  initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
728  init_vals[init_val_idx++],
729  groups_buffer_entry_count);
730  break;
731  case 0:
732  break;
733  default:
734  CHECK(false);
735  }
736 
737  buffer_ptr = align_to_int64(buffer_ptr);
738  }
739  }
740  }
741 }
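The columnar layout this routine fills can be modeled independently: key columns come first (one EMPTY_KEY_64 column per group-by key, skipped for keyless hash), then each aggregate column occupies width * entry_count bytes, with the running offset realigned to 8 bytes after every non-empty slot. A self-contained sketch of that arithmetic (a simplified stand-in for the real QueryMemoryDescriptor bookkeeping):

#include <cstdint>
#include <vector>

// Round a byte offset up to the next 8-byte boundary (mimics align_to_int64).
inline size_t align8(size_t off) { return (off + 7) & ~size_t(7); }

// Starting offset of each aggregate column in a columnar group-by buffer.
std::vector<size_t> columnar_offsets(const std::vector<int8_t>& padded_widths,
                                     size_t entry_count,
                                     size_t key_count) {
  std::vector<size_t> offsets;
  size_t off = key_count * sizeof(int64_t) * entry_count;  // key columns first
  for (int8_t w : padded_widths) {
    offsets.push_back(off);
    if (w > 0) {
      off = align8(off + static_cast<size_t>(w) * entry_count);
    }
  }
  return offsets;
}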

void QueryMemoryInitializer::initColumnsPerRow ( const QueryMemoryDescriptor query_mem_desc,
int8_t *  row_ptr,
const std::vector< int64_t > &  init_vals,
const TargetAggOpsMetadata agg_op_metadata 
)
private

Definition at line 743 of file QueryMemoryInitializer.cpp.

References allocateCountDistinctBitmap(), allocateCountDistinctSet(), CHECK, CHECK_EQ, CHECK_LT, CHECK_NE, QueryMemoryInitializer::TargetAggOpsMetadata::count_distinct_buf_size, QueryMemoryDescriptor::getNextColOffInBytesRowOnly(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryInitializer::TargetAggOpsMetadata::has_count_distinct, QueryMemoryInitializer::TargetAggOpsMetadata::has_mode, QueryMemoryInitializer::TargetAggOpsMetadata::has_tdigest, QueryMemoryDescriptor::isGroupBy(), QueryMemoryInitializer::TargetAggOpsMetadata::mode_index_set, QueryMemoryInitializer::TargetAggOpsMetadata::qualtile_params, and row_set_mem_owner_.

Referenced by initRowGroups().

747  {
748  int8_t* col_ptr = row_ptr;
749  size_t init_vec_idx = 0;
750  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
751  col_ptr += query_mem_desc.getNextColOffInBytesRowOnly(col_ptr, col_idx++)) {
752  int64_t init_val{0};
753  if (query_mem_desc.isGroupBy()) {
754  if (agg_op_metadata.has_count_distinct) {
755  // COUNT DISTINCT / APPROX_COUNT_DISTINCT
756  // create a data structure for count_distinct operator per entries
757  const int64_t bm_sz{agg_op_metadata.count_distinct_buf_size[col_idx]};
758  if (bm_sz) {
759  CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
760  sizeof(int64_t));
761  init_val =
762  bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
763  CHECK_NE(init_val, 0);
764  ++init_vec_idx;
765  }
766  } else if (agg_op_metadata.has_tdigest &&
767  agg_op_metadata.qualtile_params[col_idx]) {
768  auto const q = *agg_op_metadata.qualtile_params[col_idx];
769  // allocate for APPROX_QUANTILE only when slot is used
770  init_val = reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest(q));
771  CHECK_NE(init_val, 0);
772  ++init_vec_idx;
773  } else if (agg_op_metadata.has_mode &&
774  agg_op_metadata.mode_index_set.count(col_idx)) {
775  init_val = reinterpret_cast<int64_t>(row_set_mem_owner_->allocateMode());
776  CHECK_NE(init_val, 0);
777  ++init_vec_idx;
778  }
779  }
780  auto const col_slot_width = query_mem_desc.getPaddedSlotWidthBytes(col_idx);
781  if (init_val == 0 && col_slot_width > 0) {
782  CHECK_LT(init_vec_idx, init_vals.size());
783  init_val = init_vals[init_vec_idx++];
784  }
785  switch (col_slot_width) {
786  case 1:
787  *col_ptr = static_cast<int8_t>(init_val);
788  break;
789  case 2:
790  *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
791  break;
792  case 4:
793  *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
794  break;
795  case 8:
796  *reinterpret_cast<int64_t*>(col_ptr) = init_val;
797  break;
798  case 0:
799  continue;
800  default:
801  CHECK(false);
802  }
803  }
804 }
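The tail of this routine dispatches on the padded slot width when writing the initial value. A self-contained sketch of that dispatch (the pointer-valued initializers for count-distinct, t-digest, and mode are produced earlier and simply stored as int64_t in an 8-byte slot):

#include <cstdint>
#include <cstdlib>

// Write an init value into a slot of 0/1/2/4/8 bytes, as in the switch above.
void store_init_val(int8_t* col_ptr, int8_t slot_width, int64_t init_val) {
  switch (slot_width) {
    case 1:
      *col_ptr = static_cast<int8_t>(init_val);
      break;
    case 2:
      *reinterpret_cast<int16_t*>(col_ptr) = static_cast<int16_t>(init_val);
      break;
    case 4:
      *reinterpret_cast<int32_t*>(col_ptr) = static_cast<int32_t>(init_val);
      break;
    case 8:
      *reinterpret_cast<int64_t*>(col_ptr) = init_val;
      break;
    case 0:
      break;  // empty slot: nothing to write
    default:
      std::abort();  // stands in for CHECK(false)
  }
}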

void QueryMemoryInitializer::initGroupByBuffer ( int64_t *  buffer,
const RelAlgExecutionUnit ra_exe_unit,
const QueryMemoryDescriptor query_mem_desc,
TargetAggOpsMetadata agg_expr_metadata,
const ExecutorDeviceType  device_type,
const bool  output_columnar,
const Executor executor 
)
private

Definition at line 541 of file QueryMemoryInitializer.cpp.

References streaming_top_n::get_rows_offset_of_heaps(), QueryMemoryDescriptor::getEntryCount(), GPU, init_agg_vals_, initColumnarGroups(), initRowGroups(), QueryMemoryDescriptor::interleavedBins(), SortInfo::limit, anonymous_namespace{Utm.h}::n, SortInfo::offset, RelAlgExecutionUnit::sort_info, and QueryMemoryDescriptor::useStreamingTopN().

Referenced by QueryMemoryInitializer().

548  {
549  if (output_columnar) {
550  initColumnarGroups(query_mem_desc, buffer, init_agg_vals_, executor, ra_exe_unit);
551  } else {
552  auto rows_ptr = buffer;
553  auto actual_entry_count = query_mem_desc.getEntryCount();
554  const auto thread_count = device_type == ExecutorDeviceType::GPU
555  ? executor->blockSize() * executor->gridSize()
556  : 1;
557  auto warp_size =
558  query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
559  if (query_mem_desc.useStreamingTopN()) {
560  const auto node_count_size = thread_count * sizeof(int64_t);
561  memset(rows_ptr, 0, node_count_size);
562  const auto n =
563  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit.value_or(0);
564  const auto rows_offset = streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
565  memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
566  rows_ptr += rows_offset / sizeof(int64_t);
567  actual_entry_count = n * thread_count;
568  warp_size = 1;
569  }
570  initRowGroups(query_mem_desc,
571  rows_ptr,
572  init_agg_vals_,
573  agg_op_metadata,
574  actual_entry_count,
575  warp_size,
576  executor,
577  ra_exe_unit);
578  }
579 }
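In the streaming top-n branch, the buffer begins with a per-thread header before the row storage: one int64_t node count per thread (zeroed), followed by n heap-slot indices per thread (set to -1). A model of the offset computation, assuming no alignment padding beyond what the layout above implies (the authoritative value comes from streaming_top_n::get_rows_offset_of_heaps()):

#include <cstddef>
#include <cstdint>

// Byte offset where row storage begins in a streaming top-n buffer.
size_t rows_offset_of_heaps_model(size_t n, size_t thread_count) {
  const size_t node_counts = thread_count * sizeof(int64_t);      // zeroed
  const size_t heap_slots  = thread_count * n * sizeof(int64_t);  // memset to -1
  return node_counts + heap_slots;
}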

QueryMemoryInitializer::ModeIndexSet QueryMemoryInitializer::initializeModeIndexSet ( const QueryMemoryDescriptor query_mem_desc,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 920 of file QueryMemoryInitializer.cpp.

References CHECK_LE, CHECK_LT, anonymous_namespace{QueryMemoryInitializer.cpp}::eachAggregateTargetIdxOfType(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), kMODE, and RelAlgExecutionUnit::target_exprs.

Referenced by QueryMemoryInitializer().

922  {
923  size_t const slot_count = query_mem_desc.getSlotCount();
924  CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count);
925  ModeIndexSet mode_index_set;
926  eachAggregateTargetIdxOfType(
927  ra_exe_unit.target_exprs,
928  kMODE,
929  [&](Analyzer::AggExpr const*, size_t const target_idx) {
930  size_t const agg_col_idx =
931  query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
932  CHECK_LT(agg_col_idx, slot_count);
933  mode_index_set.emplace(agg_col_idx);
934  });
935  return mode_index_set;
936 }
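eachAggregateTargetIdxOfType() is a scan over the target expressions that fires a callback for every aggregate of the requested kind. A self-contained model of that pattern (simplified types; the real helper also passes the Analyzer::AggExpr* to the lambda):

#include <cstddef>
#include <functional>
#include <vector>

enum class AggKind { kCount, kMode, kApproxQuantile };

// Invoke fn(target_idx) for each target whose aggregate kind matches.
void for_each_target_of_kind(const std::vector<AggKind>& targets,
                             AggKind wanted,
                             const std::function<void(size_t)>& fn) {
  for (size_t idx = 0; idx < targets.size(); ++idx) {
    if (targets[idx] == wanted) {
      fn(idx);
    }
  }
}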

std::vector< QueryMemoryInitializer::QuantileParam > QueryMemoryInitializer::initializeQuantileParams ( const QueryMemoryDescriptor query_mem_desc,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 956 of file QueryMemoryInitializer.cpp.

References CHECK, CHECK_EQ, CHECK_LE, CHECK_LT, anonymous_namespace{QueryMemoryInitializer.cpp}::eachAggregateTargetIdxOfType(), Analyzer::AggExpr::get_arg1(), QueryMemoryDescriptor::getLogicalSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), kAPPROX_QUANTILE, and RelAlgExecutionUnit::target_exprs.

Referenced by QueryMemoryInitializer().

958  {
959  size_t const slot_count = query_mem_desc.getSlotCount();
960  CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count);
961  std::vector<QuantileParam> quantile_params(slot_count);
962  eachAggregateTargetIdxOfType(
963  ra_exe_unit.target_exprs,
964  kAPPROX_QUANTILE,
965  [&](Analyzer::AggExpr const* const agg_expr, size_t const target_idx) {
966  size_t const agg_col_idx =
967  query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
968  CHECK_LT(agg_col_idx, slot_count);
969  CHECK_EQ(static_cast<int8_t>(sizeof(int64_t)),
970  query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx));
971  auto const q_expr =
972  dynamic_cast<Analyzer::Constant const*>(agg_expr->get_arg1().get());
973  CHECK(q_expr);
974  quantile_params[agg_col_idx] = q_expr->get_constval().doubleval;
975  });
976  return quantile_params;
977 }

void QueryMemoryInitializer::initRowGroups ( const QueryMemoryDescriptor query_mem_desc,
int64_t *  groups_buffer,
const std::vector< int64_t > &  init_vals,
TargetAggOpsMetadata agg_expr_metadata,
const int32_t  groups_buffer_entry_count,
const size_t  warp_size,
const Executor executor,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 581 of file QueryMemoryInitializer.cpp.

References CHECK, cpu_threads(), result_set::fill_empty_key(), ResultSet::fixupQueryMemoryDescriptor(), g_optimize_row_initialization, QueryMemoryDescriptor::getAvailableCpuThreads(), QueryMemoryDescriptor::getColOffInBytes(), QueryMemoryDescriptor::getEffectiveKeyWidth(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getRowSize(), QueryMemoryInitializer::TargetAggOpsMetadata::has_count_distinct, QueryMemoryInitializer::TargetAggOpsMetadata::has_mode, QueryMemoryInitializer::TargetAggOpsMetadata::has_tdigest, QueryMemoryDescriptor::hasKeylessHash(), initColumnsPerRow(), and threading_serial::parallel_for().

Referenced by initGroupByBuffer().

588  {
589  const size_t key_count{query_mem_desc.getGroupbyColCount()};
590  const size_t row_size{query_mem_desc.getRowSize()};
591  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};
592 
593  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
594  const auto query_mem_desc_fixedup =
595  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);
596  // not COUNT DISTINCT / APPROX_COUNT_DISTINCT / APPROX_QUANTILE
597  // we use the default implementation in those agg ops
598  auto const key_sz = query_mem_desc.getEffectiveKeyWidth();
599  if (!(agg_op_metadata.has_count_distinct || agg_op_metadata.has_mode ||
600  agg_op_metadata.has_tdigest) &&
601  g_optimize_row_initialization) {
602  std::vector<int8_t> sample_row(row_size - col_base_off);
603  auto const num_available_cpu_threads =
604  std::min(query_mem_desc.getAvailableCpuThreads(),
605  static_cast<size_t>(std::max(cpu_threads(), 1)));
606  tbb::task_arena initialization_arena(num_available_cpu_threads);
607 
608  initColumnsPerRow(
609  query_mem_desc_fixedup, sample_row.data(), init_vals, agg_op_metadata);
610 
611  if (query_mem_desc.hasKeylessHash()) {
612  CHECK(warp_size >= 1);
613  CHECK(key_count == 1 || warp_size == 1);
614  initialization_arena.execute([&] {
615  threading::parallel_for(
616  tbb::blocked_range<size_t>(0, groups_buffer_entry_count * warp_size),
617  [&](const tbb::blocked_range<size_t>& r) {
618  auto cur_row_buf = buffer_ptr + (row_size * r.begin());
619  for (size_t i = r.begin(); i != r.end(); ++i, cur_row_buf += row_size) {
620  memcpy(cur_row_buf + col_base_off, sample_row.data(), sample_row.size());
621  }
622  });
623  });
624  return;
625  }
626  initialization_arena.execute([&] {
627  threading::parallel_for(
628  tbb::blocked_range<size_t>(0, groups_buffer_entry_count),
629  [&](const tbb::blocked_range<size_t>& r) {
630  auto cur_row_buf = buffer_ptr + (row_size * r.begin());
631  for (size_t i = r.begin(); i != r.end(); ++i, cur_row_buf += row_size) {
632  memcpy(cur_row_buf + col_base_off, sample_row.data(), sample_row.size());
633  result_set::fill_empty_key(cur_row_buf, key_count, key_sz);
634  }
635  });
636  });
637  } else {
638  // todo(yoonmin): allow parallelization of `initColumnsPerRow`
639  if (query_mem_desc.hasKeylessHash()) {
640  CHECK(warp_size >= 1);
641  CHECK(key_count == 1 || warp_size == 1);
642  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
643  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
644  ++bin, buffer_ptr += row_size) {
645  initColumnsPerRow(query_mem_desc_fixedup,
646  &buffer_ptr[col_base_off],
647  init_vals,
648  agg_op_metadata);
649  }
650  }
651  return;
652  }
653 
654  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
655  ++bin, buffer_ptr += row_size) {
656  result_set::fill_empty_key(
657  buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
658  initColumnsPerRow(
659  query_mem_desc_fixedup, &buffer_ptr[col_base_off], init_vals, agg_op_metadata);
660  }
661  }
662 }
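The g_optimize_row_initialization fast path initializes one sample row and then replicates it, rather than re-running per-slot initialization for every entry. A self-contained model of the replication step (the real code shards this loop across a tbb::task_arena via parallel_for, and additionally fills the key prefix when the hash is not keyless):

#include <cstdint>
#include <cstring>
#include <vector>

// Copy one pre-initialized sample row into every entry of a row-wise buffer.
void replicate_sample_row(int8_t* buffer,
                          size_t entry_count,
                          size_t row_size,
                          size_t col_base_off,
                          const std::vector<int8_t>& sample_row) {
  for (size_t i = 0; i < entry_count; ++i) {
    std::memcpy(buffer + i * row_size + col_base_off,
                sample_row.data(),
                sample_row.size());
  }
}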

GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer ( const QueryMemoryDescriptor query_mem_desc,
const int8_t *  init_agg_vals_dev_ptr,
const size_t  n,
const int  device_id,
const unsigned  block_size_x,
const unsigned  grid_size_x 
)
private

Definition at line 1003 of file QueryMemoryInitializer.cpp.

References Allocator::alloc(), CHECK, DeviceAllocator::copyToDevice(), device_allocator_, streaming_top_n::get_heap_size(), streaming_top_n::get_rows_offset_of_heaps(), QueryMemoryDescriptor::getEffectiveKeyWidth(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getRowSize(), GPU, QueryMemoryDescriptor::hasKeylessHash(), init_group_by_buffer_on_device(), QueryMemoryDescriptor::lazyInitGroups(), anonymous_namespace{Utm.h}::n, DeviceAllocator::setDeviceMem(), UNREACHABLE, and DeviceAllocator::zeroDeviceMem().

Referenced by createAndInitializeGroupByBufferGpu().

1009  {
1010 #ifdef HAVE_CUDA
1011  CHECK(device_allocator_);
1012  const auto thread_count = block_size_x * grid_size_x;
1013  const auto total_buff_size =
1014  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
1015  int8_t* dev_buffer = device_allocator_->alloc(total_buff_size);
1016 
1017  std::vector<int8_t*> dev_buffers(thread_count);
1018 
1019  for (size_t i = 0; i < thread_count; ++i) {
1020  dev_buffers[i] = dev_buffer;
1021  }
1022 
1023  auto dev_ptr = device_allocator_->alloc(thread_count * sizeof(int8_t*));
1024  device_allocator_->copyToDevice(
1025  dev_ptr, dev_buffers.data(), thread_count * sizeof(int8_t*));
1026 
1027  CHECK(query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU));
1028 
1029  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(dev_buffer),
1030  thread_count * sizeof(int64_t));
1031 
1032  device_allocator_->setDeviceMem(
1033  reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
1034  (unsigned char)-1,
1035  thread_count * n * sizeof(int64_t));
1036 
1037  init_group_by_buffer_on_device(
1038  reinterpret_cast<int64_t*>(
1039  dev_buffer + streaming_top_n::get_rows_offset_of_heaps(n, thread_count)),
1040  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
1041  n * thread_count,
1042  query_mem_desc.getGroupbyColCount(),
1043  query_mem_desc.getEffectiveKeyWidth(),
1044  query_mem_desc.getRowSize() / sizeof(int64_t),
1045  query_mem_desc.hasKeylessHash(),
1046  1,
1047  block_size_x,
1048  grid_size_x);
1049 
1050  return {dev_ptr, dev_buffer};
1051 #else
1052  UNREACHABLE();
1053  return {};
1054 #endif
1055 }
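Combining the header layout with the row storage gives the total device allocation, which streaming_top_n::get_heap_size() computes from the row size, n, and the thread count. A model under the same no-extra-padding assumption as before:

#include <cstddef>
#include <cstdint>

// Total bytes for the streaming top-n device buffer: per-thread header
// (node counts + heap slots) followed by n rows per thread.
size_t heap_size_model(size_t row_size, size_t n, size_t thread_count) {
  const size_t header = thread_count * sizeof(int64_t)         // node counts
                      + thread_count * n * sizeof(int64_t);    // heap slots
  return header + thread_count * n * row_size;                 // row storage
}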

void QueryMemoryInitializer::resetResultSet ( const size_t  index)
inline

Definition at line 106 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

106  {
107  CHECK_LT(index, result_sets_.size());
108  result_sets_[index].reset();
109  }
GpuGroupByBuffers QueryMemoryInitializer::setupTableFunctionGpuBuffers ( const QueryMemoryDescriptor query_mem_desc,
const int  device_id,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const bool  zero_initialize_buffers 
)

Definition at line 1171 of file QueryMemoryInitializer.cpp.

References align_to_int64(), Allocator::alloc(), CHECK, CHECK_GT, DeviceAllocator::copyToDevice(), device_allocator_, QueryMemoryDescriptor::getBufferColSlotCount(), QueryMemoryDescriptor::getColSlotContext(), ColSlotContext::getSlotInfo(), SlotSize::logical_size, num_rows_, and DeviceAllocator::zeroDeviceMem().

1176  {
1177  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
1178  CHECK_GT(num_columns, size_t(0));
1179  size_t total_group_by_buffer_size{0};
1180  const auto col_slot_context = query_mem_desc.getColSlotContext();
1181 
1182  std::vector<size_t> col_byte_offsets;
1183  col_byte_offsets.reserve(num_columns);
1184 
1185  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
1186  const size_t col_width = col_slot_context.getSlotInfo(col_idx).logical_size;
1187  size_t group_buffer_size = num_rows_ * col_width;
1188  col_byte_offsets.emplace_back(total_group_by_buffer_size);
1189  total_group_by_buffer_size =
1190  align_to_int64(total_group_by_buffer_size + group_buffer_size);
1191  }
1192 
1193  int8_t* dev_buffers_allocation{nullptr};
1194  dev_buffers_allocation = device_allocator_->alloc(total_group_by_buffer_size);
1195  CHECK(dev_buffers_allocation);
1196  if (zero_initialize_buffers) {
1197  device_allocator_->zeroDeviceMem(dev_buffers_allocation, total_group_by_buffer_size);
1198  }
1199 
1200  auto dev_buffers_mem = dev_buffers_allocation;
1201  std::vector<int8_t*> dev_buffers(num_columns);
1202  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
1203  dev_buffers[col_idx] = dev_buffers_allocation + col_byte_offsets[col_idx];
1204  }
1205  auto dev_ptrs = device_allocator_->alloc(num_columns * sizeof(CUdeviceptr));
1206  device_allocator_->copyToDevice(
1207  dev_ptrs, dev_buffers.data(), num_columns * sizeof(CUdeviceptr));
1208 
1209  return {dev_ptrs, dev_buffers_mem, (size_t)num_rows_};
1210 }
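A hedged end-to-end sketch of the table-function buffer cycle, pairing this call with copyFromTableFunctionGpuBuffers() (device id, block, and grid sizes are illustrative; this assumes the third GpuGroupByBuffers field returned above is the entry count):

// Allocate zeroed per-column GPU buffers, run the kernel, copy back.
GpuGroupByBuffers run_table_function_buffers(QueryMemoryInitializer& qmi,
                                             const QueryMemoryDescriptor& qmd,
                                             Data_Namespace::DataMgr* data_mgr) {
  const int device_id = 0;
  const unsigned block_size_x = 1024;
  const unsigned grid_size_x = 16;
  auto gpu_buffers = qmi.setupTableFunctionGpuBuffers(
      qmd, device_id, block_size_x, grid_size_x,
      /*zero_initialize_buffers=*/true);
  // ... launch the table function kernel here ...
  qmi.copyFromTableFunctionGpuBuffers(data_mgr,
                                      qmd,
                                      gpu_buffers.entry_count,
                                      gpu_buffers,
                                      device_id,
                                      block_size_x,
                                      grid_size_x);
  return gpu_buffers;
}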

Friends And Related Function Documentation

friend class Executor
friend

Definition at line 273 of file QueryMemoryInitializer.h.

friend class QueryExecutionContext
friend

Definition at line 274 of file QueryMemoryInitializer.h.

Member Data Documentation

CUdeviceptr QueryMemoryInitializer::count_distinct_bitmap_device_mem_ptr_
private
int8_t* QueryMemoryInitializer::count_distinct_bitmap_host_crt_ptr_
private
int8_t* QueryMemoryInitializer::count_distinct_bitmap_host_mem_ptr_
private
size_t QueryMemoryInitializer::count_distinct_bitmap_mem_size_
private
DeviceAllocator* QueryMemoryInitializer::device_allocator_ {nullptr}
private
std::vector<int64_t> QueryMemoryInitializer::init_agg_vals_
private
size_t QueryMemoryInitializer::num_buffers_
private
const int64_t QueryMemoryInitializer::num_rows_
private
std::vector<std::unique_ptr<ResultSet> > QueryMemoryInitializer::result_sets_
private
std::vector<Data_Namespace::AbstractBuffer*> QueryMemoryInitializer::temporary_buffers_
private

Definition at line 269 of file QueryMemoryInitializer.h.

const size_t QueryMemoryInitializer::thread_idx_
private
CUdeviceptr QueryMemoryInitializer::varlen_output_buffer_
private
int8_t* QueryMemoryInitializer::varlen_output_buffer_host_ptr_
private
std::shared_ptr<VarlenOutputInfo> QueryMemoryInitializer::varlen_output_info_
private

The documentation for this class was generated from the following files:

QueryMemoryInitializer.h
QueryMemoryInitializer.cpp