OmniSciDB  a5dc49c757
QueryMemoryInitializer Class Reference

#include <QueryMemoryInitializer.h>


Classes

struct  TargetAggOpsMetadata
 

Public Types

using ModeIndexSet = robin_hood::unordered_set< size_t >
 
using QuantileParam = std::optional< double >
 

Public Member Functions

 QueryMemoryInitializer (const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int device_id, const ExecutorDeviceType device_type, const ExecutorDispatchMode dispatch_mode, const bool output_columnar, const bool sort_on_gpu, const shared::TableKey &outer_table_key, const int64_t num_rows, const std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, RenderAllocatorMap *render_allocator_map, RenderInfo *render_info, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, DeviceAllocator *gpu_allocator, const size_t thread_idx, const Executor *executor)
 
 QueryMemoryInitializer (const TableFunctionExecutionUnit &exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int device_id, const ExecutorDeviceType device_type, const int64_t num_rows, const std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, DeviceAllocator *device_allocator, const Executor *executor)
 
const auto getCountDistinctBitmapDevicePtr () const
 
const auto getCountDistinctBitmapHostPtr () const
 
const auto getCountDistinctBitmapBytes () const
 
const auto getVarlenOutputHostPtr () const
 
const auto getVarlenOutputPtr () const
 
ResultSet * getResultSet (const size_t index) const
 
std::unique_ptr< ResultSet > getResultSetOwned (const size_t index)
 
void resetResultSet (const size_t index)
 
int64_t getAggInitValForIndex (const size_t index) const
 
const auto getGroupByBuffersPtr ()
 
const auto getGroupByBuffersSize () const
 
const auto getNumBuffers () const
 
GpuGroupByBuffers setupTableFunctionGpuBuffers (const QueryMemoryDescriptor &query_mem_desc, const int device_id, const unsigned block_size_x, const unsigned grid_size_x, const bool zero_initialize_buffers)
 
void copyFromTableFunctionGpuBuffers (Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const size_t entry_count, const GpuGroupByBuffers &gpu_group_by_buffers, const int device_id, const unsigned block_size_x, const unsigned grid_size_x)
 
void copyGroupByBuffersFromGpu (DeviceAllocator &device_allocator, const QueryMemoryDescriptor &query_mem_desc, const size_t entry_count, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit *ra_exe_unit, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer) const
 

Private Member Functions

void initGroupByBuffer (int64_t *buffer, const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, TargetAggOpsMetadata &agg_expr_metadata, const ExecutorDeviceType device_type, const bool output_columnar, const Executor *executor)
 
void initRowGroups (const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, TargetAggOpsMetadata &agg_expr_metadata, const int32_t groups_buffer_entry_count, const size_t warp_size, const Executor *executor, const RelAlgExecutionUnit &ra_exe_unit)
 
void initColumnarGroups (const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const Executor *executor, const RelAlgExecutionUnit &ra_exe_unit)
 
void initColumnsPerRow (const QueryMemoryDescriptor &query_mem_desc, int8_t *row_ptr, const std::vector< int64_t > &init_vals, const TargetAggOpsMetadata &agg_op_metadata)
 
void allocateCountDistinctGpuMem (const QueryMemoryDescriptor &query_mem_desc)
 
std::vector< int64_t > calculateCountDistinctBufferSize (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit) const
 
void allocateCountDistinctBuffers (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
 
int64_t allocateCountDistinctBitmap (const size_t bitmap_byte_sz)
 
int64_t allocateCountDistinctSet ()
 
ModeIndexSet initializeModeIndexSet (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
 
void allocateModeBuffer (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
 
std::vector< QuantileParam > initializeQuantileParams (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
 
void allocateTDigestsBuffer (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
 
GpuGroupByBuffers prepareTopNHeapsDevBuffer (const QueryMemoryDescriptor &query_mem_desc, const int8_t *init_agg_vals_dev_ptr, const size_t n, const int device_id, const unsigned block_size_x, const unsigned grid_size_x)
 
GpuGroupByBuffers createAndInitializeGroupByBufferGpu (const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int8_t *init_agg_vals_dev_ptr, const int device_id, const ExecutorDispatchMode dispatch_mode, const unsigned block_size_x, const unsigned grid_size_x, const int8_t warp_size, const bool can_sort_on_gpu, const bool output_columnar, RenderAllocator *render_allocator)
 
size_t computeNumberOfBuffers (const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const Executor *executor) const
 
void compactProjectionBuffersCpu (const QueryMemoryDescriptor &query_mem_desc, const size_t projection_count)
 
void compactProjectionBuffersGpu (const QueryMemoryDescriptor &query_mem_desc, Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_group_by_buffers, const size_t projection_count, const int device_id)
 
void applyStreamingTopNOffsetCpu (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
 
void applyStreamingTopNOffsetGpu (Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit &ra_exe_unit, const unsigned total_thread_count, const int device_id)
 
std::shared_ptr< VarlenOutputInfo > getVarlenOutputInfo ()
 

Private Attributes

const int64_t num_rows_
 
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
 
std::vector< std::unique_ptr< ResultSet > > result_sets_
 
std::vector< int64_t > init_agg_vals_
 
size_t num_buffers_
 
std::vector< int64_t * > group_by_buffers_
 
std::shared_ptr< VarlenOutputInfo > varlen_output_info_
 
CUdeviceptr varlen_output_buffer_
 
int8_t * varlen_output_buffer_host_ptr_
 
CUdeviceptr count_distinct_bitmap_device_mem_ptr_
 
size_t count_distinct_bitmap_mem_size_
 
int8_t * count_distinct_bitmap_host_crt_ptr_
 
int8_t * count_distinct_bitmap_host_mem_ptr_
 
DeviceAllocator * device_allocator_ {nullptr}
 
std::vector< Data_Namespace::AbstractBuffer * > temporary_buffers_
 
const size_t thread_idx_
 

Friends

class Executor
 
class QueryExecutionContext
 

Detailed Description

Definition at line 35 of file QueryMemoryInitializer.h.
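
QueryMemoryInitializer sets up the output memory a query kernel writes into on one device: the group-by buffers themselves, any per-target aggregate state (count-distinct bitmaps and sets, mode buffers, t-digests), and the ResultSet objects that wrap each buffer. A minimal sketch of how a caller might read results back through the accessors documented on this page; the surrounding setup is omitted and `collect_results` is illustrative, not a real call site:

#include <cstddef>
#include <cstdint>
#include <memory>

void collect_results(QueryMemoryInitializer& qmi) {
  // One buffer on CPU, possibly many on GPU (see computeNumberOfBuffers()).
  const size_t num_buffers = qmi.getNumBuffers();
  int64_t** raw_buffers = qmi.getGroupByBuffersPtr();  // what the kernel wrote into
  static_cast<void>(raw_buffers);
  if (num_buffers > 0) {
    ResultSet* borrowed = qmi.getResultSet(0);         // non-owning view
    static_cast<void>(borrowed);
    std::unique_ptr<ResultSet> owned = qmi.getResultSetOwned(0);  // transfer ownership
  }
}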

Member Typedef Documentation

using QueryMemoryInitializer::ModeIndexSet = robin_hood::unordered_set<size_t>

Definition at line 37 of file QueryMemoryInitializer.h.

using QueryMemoryInitializer::QuantileParam = std::optional<double>

Definition at line 38 of file QueryMemoryInitializer.h.

Constructor & Destructor Documentation

QueryMemoryInitializer::QueryMemoryInitializer ( const RelAlgExecutionUnit &  ra_exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const int  device_id,
const ExecutorDeviceType  device_type,
const ExecutorDispatchMode  dispatch_mode,
const bool  output_columnar,
const bool  sort_on_gpu,
const shared::TableKey &  outer_table_key,
const int64_t  num_rows,
const std::vector< std::vector< const int8_t * >> &  col_buffers,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
RenderAllocatorMap *  render_allocator_map,
RenderInfo *  render_info,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
DeviceAllocator *  gpu_allocator,
const size_t  thread_idx,
const Executor *  executor 
)

Definition at line 224 of file QueryMemoryInitializer.cpp.

References gpu_enabled::accumulate(), anonymous_namespace{QueryMemoryInitializer.cpp}::alloc_group_by_buffer(), allocateCountDistinctBuffers(), allocateCountDistinctGpuMem(), allocateModeBuffer(), allocateTDigestsBuffer(), calculateCountDistinctBufferSize(), CHECK, anonymous_namespace{QueryMemoryInitializer.cpp}::check_count_distinct_expr_metadata(), CHECK_EQ, CHECK_GE, anonymous_namespace{QueryMemoryInitializer.cpp}::check_total_bitmap_memory(), anonymous_namespace{QueryMemoryInitializer.cpp}::collect_target_expr_metadata(), QueryMemoryInitializer::TargetAggOpsMetadata::count_distinct_buf_size, CPU, RelAlgExecutionUnit::estimator, ResultSet::fixupQueryMemoryDescriptor(), g_max_memory_allocation_size, anonymous_namespace{QueryMemoryInitializer.cpp}::get_col_frag_offsets(), anonymous_namespace{QueryMemoryInitializer.cpp}::get_consistent_frags_sizes(), anonymous_namespace{QueryMemoryInitializer.cpp}::get_input_idx(), QueryMemoryDescriptor::getApproxQuantileDescriptors(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), getVarlenOutputInfo(), GPU, group_by_buffers_, QueryMemoryInitializer::TargetAggOpsMetadata::has_count_distinct, QueryMemoryInitializer::TargetAggOpsMetadata::has_mode, QueryMemoryInitializer::TargetAggOpsMetadata::has_tdigest, QueryMemoryDescriptor::hasKeylessHash(), QueryMemoryDescriptor::hasVarlenOutput(), initGroupByBuffer(), initializeModeIndexSet(), initializeQuantileParams(), QueryMemoryDescriptor::interleavedBins(), QueryMemoryDescriptor::isGroupBy(), KernelPerFragment, QueryMemoryDescriptor::lazyInitGroups(), QueryMemoryInitializer::TargetAggOpsMetadata::mode_index_set, num_buffers_, QueryMemoryInitializer::TargetAggOpsMetadata::quantile_params, result_sets_, row_set_mem_owner_, RelAlgExecutionUnit::target_exprs, target_exprs_to_infos(), RelAlgExecutionUnit::target_exprs_union, thread_idx_, QueryMemoryDescriptor::threadsCanReuseGroupByBuffers(), QueryMemoryDescriptor::threadsShareMemory(), RelAlgExecutionUnit::union_all, RelAlgExecutionUnit::use_bump_allocator, RenderInfo::useCudaBuffers(), QueryMemoryDescriptor::varlenOutputBufferElemSize(), and VLOG.

242  : num_rows_(num_rows)
243  , row_set_mem_owner_(row_set_mem_owner)
244  , init_agg_vals_(executor->plan_state_->init_agg_vals_)
245  , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
246  , varlen_output_buffer_(0)
247  , varlen_output_buffer_host_ptr_(nullptr)
248  , count_distinct_bitmap_device_mem_ptr_(0)
249  , count_distinct_bitmap_mem_size_(0)
250  , count_distinct_bitmap_host_crt_ptr_(nullptr)
251  , count_distinct_bitmap_host_mem_ptr_(nullptr)
252  , device_allocator_(device_allocator)
253  , thread_idx_(thread_idx) {
254  CHECK(!sort_on_gpu || output_columnar);
255  executor->logSystemCPUMemoryStatus("Before Query Memory Initialization", thread_idx);
256 
257  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
258  if (consistent_frag_sizes.empty()) {
259  // No fragments in the input, no underlying buffers will be needed.
260  return;
261  }
262 
263  TargetAggOpsMetadata agg_op_metadata =
264  collect_target_expr_metadata(query_mem_desc, ra_exe_unit);
265  if (agg_op_metadata.has_count_distinct) {
266  check_count_distinct_expr_metadata(query_mem_desc, ra_exe_unit);
267  if (!ra_exe_unit.use_bump_allocator) {
268  check_total_bitmap_memory(query_mem_desc);
269  }
270  if (device_type == ExecutorDeviceType::GPU) {
271  allocateCountDistinctGpuMem(query_mem_desc);
272  }
273  agg_op_metadata.count_distinct_buf_size =
274  calculateCountDistinctBufferSize(query_mem_desc, ra_exe_unit);
275  size_t total_buffer_size{0};
276  for (auto buffer_size : agg_op_metadata.count_distinct_buf_size) {
277  if (buffer_size > 0) {
278  total_buffer_size += buffer_size;
279  }
280  }
281  total_buffer_size *= query_mem_desc.getEntryCount();
282  row_set_mem_owner_->initCountDistinctBufferAllocator(total_buffer_size, thread_idx_);
283  }
284 
285  if (agg_op_metadata.has_tdigest) {
286  auto const& descs = query_mem_desc.getApproxQuantileDescriptors();
287  // Pre-allocate all TDigest memory for this thread.
288  AddNbytes const add_nbytes{query_mem_desc.getEntryCount()};
289  size_t const capacity =
290  std::accumulate(descs.begin(), descs.end(), size_t(0), add_nbytes);
291  VLOG(2) << "row_set_mem_owner_->reserveTDigestMemory(" << thread_idx_ << ','
292  << capacity << ") query_mem_desc.getEntryCount()("
293  << query_mem_desc.getEntryCount() << ')';
294  row_set_mem_owner_->reserveTDigestMemory(thread_idx_, capacity);
295  }
296 
297  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
298  if (agg_op_metadata.has_count_distinct) {
299  allocateCountDistinctBuffers(query_mem_desc, ra_exe_unit);
300  }
301  if (agg_op_metadata.has_mode) {
302  allocateModeBuffer(query_mem_desc, ra_exe_unit);
303  }
304  if (agg_op_metadata.has_tdigest) {
305  allocateTDigestsBuffer(query_mem_desc, ra_exe_unit);
306  }
307  if (render_info && render_info->useCudaBuffers()) {
308  return;
309  }
310  }
311 
312  if (query_mem_desc.isGroupBy()) {
313  if (agg_op_metadata.has_mode) {
314  agg_op_metadata.mode_index_set =
315  initializeModeIndexSet(query_mem_desc, ra_exe_unit);
316  }
317  if (agg_op_metadata.has_tdigest) {
318  agg_op_metadata.quantile_params =
319  initializeQuantileParams(query_mem_desc, ra_exe_unit);
320  }
321  }
322 
323  if (ra_exe_unit.estimator) {
324  return;
325  }
326 
327  const auto thread_count = device_type == ExecutorDeviceType::GPU
328  ? executor->blockSize() * executor->gridSize()
329  : 1;
330 
331  size_t group_buffer_size{0};
332  if (ra_exe_unit.use_bump_allocator) {
333  // For kernel per fragment execution, just allocate a buffer equivalent to the size of
334  // the fragment
335  if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
336  group_buffer_size = num_rows * query_mem_desc.getRowSize();
337  } else {
338  // otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size
339  group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
340  }
341  } else {
342  group_buffer_size =
343  query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
344  }
345  CHECK_GE(group_buffer_size, size_t(0));
346 
347  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
348  int64_t* group_by_buffer_template{nullptr};
349 
350  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
351  group_by_buffer_template = reinterpret_cast<int64_t*>(
352  row_set_mem_owner_->allocate(group_buffer_size, thread_idx_));
353  initGroupByBuffer(group_by_buffer_template,
354  ra_exe_unit,
355  query_mem_desc,
356  agg_op_metadata,
357  device_type,
358  output_columnar,
359  executor);
360  }
361 
362  if (query_mem_desc.interleavedBins(device_type)) {
363  CHECK(query_mem_desc.hasKeylessHash());
364  }
365 
366  const auto step = device_type == ExecutorDeviceType::GPU &&
367  query_mem_desc.threadsShareMemory() &&
368  query_mem_desc.isGroupBy()
369  ? executor->blockSize()
370  : size_t(1);
371  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
372  query_mem_desc.hasKeylessHash()
373  ? query_mem_desc.getEntryCount()
374  : size_t(0);
375  const auto actual_group_buffer_size =
376  group_buffer_size + index_buffer_qw * sizeof(int64_t);
377  CHECK_GE(actual_group_buffer_size, group_buffer_size);
378 
379  if (query_mem_desc.hasVarlenOutput()) {
380  const auto varlen_buffer_elem_size_opt = query_mem_desc.varlenOutputBufferElemSize();
381  CHECK(varlen_buffer_elem_size_opt); // TODO(adb): relax
382  auto const varlen_buffer_sz =
383  query_mem_desc.getEntryCount() * varlen_buffer_elem_size_opt.value();
384  auto varlen_output_buffer =
385  reinterpret_cast<int64_t*>(row_set_mem_owner_->allocate(varlen_buffer_sz));
386  num_buffers_ += 1;
387  group_by_buffers_.push_back(varlen_output_buffer);
388  }
389 
390  if (query_mem_desc.threadsCanReuseGroupByBuffers()) {
391  // Sanity checks, intra-thread buffer reuse should only
392  // occur on CPU for group-by queries, which also means
393  // that only one group-by buffer should be allocated
394  // (multiple-buffer allocation only occurs for GPU)
395  CHECK(device_type == ExecutorDeviceType::CPU);
396  CHECK(query_mem_desc.isGroupBy());
397  CHECK_EQ(group_buffers_count, size_t(1));
398  }
399 
400  // Group-by buffer reuse assumes 1 group-by-buffer per query step
401  // Multiple group-by-buffers should only be used on GPU,
402  // whereas buffer reuse only is done on CPU
403  CHECK(group_buffers_count <= 1 || !query_mem_desc.threadsCanReuseGroupByBuffers());
404  for (size_t i = 0; i < group_buffers_count; i += step) {
405  auto group_by_info =
406  alloc_group_by_buffer(actual_group_buffer_size,
407  render_allocator_map,
408  thread_idx_,
409  row_set_mem_owner_.get(),
410  query_mem_desc.threadsCanReuseGroupByBuffers());
411 
412  auto group_by_buffer = group_by_info.first;
413  const bool was_cached = group_by_info.second;
414  if (!was_cached) {
415  if (!query_mem_desc.lazyInitGroups(device_type)) {
416  if (group_by_buffer_template) {
417  memcpy(group_by_buffer + index_buffer_qw,
418  group_by_buffer_template,
419  group_buffer_size);
420  } else {
421  initGroupByBuffer(group_by_buffer + index_buffer_qw,
422  ra_exe_unit,
423  query_mem_desc,
424  agg_op_metadata,
425  device_type,
426  output_columnar,
427  executor);
428  }
429  }
430  }
431 
432  size_t old_size = group_by_buffers_.size();
433  group_by_buffers_.resize(old_size + std::max(size_t(1), step), nullptr);
434  group_by_buffers_[old_size] = group_by_buffer;
435 
436  const bool use_target_exprs_union =
437  ra_exe_unit.union_all && get_input_idx(ra_exe_unit, outer_table_key);
438  const auto& target_exprs = use_target_exprs_union ? ra_exe_unit.target_exprs_union
439  : ra_exe_unit.target_exprs;
440  const auto column_frag_offsets = get_col_frag_offsets(target_exprs, frag_offsets);
441  const auto column_frag_sizes =
442  get_consistent_frags_sizes(target_exprs, consistent_frag_sizes);
443 
444  old_size = result_sets_.size();
445  result_sets_.resize(old_size + std::max(size_t(1), step));
446  result_sets_[old_size] =
447  std::make_unique<ResultSet>(target_exprs_to_infos(target_exprs, query_mem_desc),
448  executor->getColLazyFetchInfo(target_exprs),
449  col_buffers,
450  column_frag_offsets,
451  column_frag_sizes,
452  device_type,
453  device_id,
454  thread_idx,
455  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc),
456  row_set_mem_owner_,
457  executor->blockSize(),
458  executor->gridSize());
459  result_sets_[old_size]->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
460  executor->plan_state_->init_agg_vals_,
461  getVarlenOutputInfo());
462  }
463 }

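The buffer-sizing branch at source lines 331-344 is worth isolating: under the bump allocator, KernelPerFragment dispatch sizes the buffer from the fragment's row count, any other dispatch mode falls back to the maximum allocation size, and without the bump allocator the descriptor computes the exact byte count. A standalone restatement with plain types (the function and parameter names are illustrative, not the real API):

#include <cstddef>
#include <cstdint>

// Sketch of the group-buffer sizing decision made in the constructor above.
size_t group_buffer_size(bool use_bump_allocator,
                         bool kernel_per_fragment,
                         int64_t num_rows,
                         size_t row_size,                  // getRowSize()
                         size_t max_allocation_size,       // g_max_memory_allocation_size
                         size_t descriptor_buffer_bytes) { // getBufferSizeBytes(...)
  if (use_bump_allocator) {
    return kernel_per_fragment ? num_rows * row_size
                               : max_allocation_size / row_size;
  }
  return descriptor_buffer_bytes;
}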

QueryMemoryInitializer::QueryMemoryInitializer ( const TableFunctionExecutionUnit &  exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const int  device_id,
const ExecutorDeviceType  device_type,
const int64_t  num_rows,
const std::vector< std::vector< const int8_t * >> &  col_buffers,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
DeviceAllocator *  device_allocator,
const Executor *  executor 
)

Definition at line 466 of file QueryMemoryInitializer.cpp.

477  : num_rows_(num_rows)
478  , row_set_mem_owner_(row_set_mem_owner)

Member Function Documentation

int64_t QueryMemoryInitializer::allocateCountDistinctBitmap ( const size_t  bitmap_byte_sz)
private

Definition at line 912 of file QueryMemoryInitializer.cpp.

References CHECK, count_distinct_bitmap_host_crt_ptr_, count_distinct_bitmap_host_mem_ptr_, row_set_mem_owner_, and thread_idx_.

Referenced by allocateCountDistinctBuffers(), and initColumnsPerRow().

912  {
913  if (count_distinct_bitmap_host_mem_ptr_) {
914  CHECK(count_distinct_bitmap_host_crt_ptr_);
915  auto ptr = count_distinct_bitmap_host_crt_ptr_;
916  count_distinct_bitmap_host_crt_ptr_ += bitmap_byte_sz;
917  row_set_mem_owner_->addCountDistinctBuffer(
918  ptr, bitmap_byte_sz, /*physical_buffer=*/false);
919  return reinterpret_cast<int64_t>(ptr);
920  }
921  return reinterpret_cast<int64_t>(
922  row_set_mem_owner_->allocateCountDistinctBuffer(bitmap_byte_sz, thread_idx_));
923 }


void QueryMemoryInitializer::allocateCountDistinctBuffers ( const QueryMemoryDescriptor &  query_mem_desc,
const RelAlgExecutionUnit &  ra_exe_unit 
)
private

Definition at line 890 of file QueryMemoryInitializer.cpp.

References allocateCountDistinctBitmap(), allocateCountDistinctSet(), Bitmap, CountDistinctDescriptor::bitmapPaddedSizeBytes(), CHECK, g_bigint_count, get_target_info(), QueryMemoryDescriptor::getCountDistinctDescriptor(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), init_agg_vals_, is_distinct_target(), RelAlgExecutionUnit::target_exprs, and UnorderedSet.

Referenced by QueryMemoryInitializer().

892  {
893  for (size_t target_idx = 0; target_idx < ra_exe_unit.target_exprs.size();
894  ++target_idx) {
895  const auto target_expr = ra_exe_unit.target_exprs[target_idx];
896  const auto agg_info = get_target_info(target_expr, g_bigint_count);
897  if (is_distinct_target(agg_info)) {
898  const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
899  const auto& count_distinct_desc =
900  query_mem_desc.getCountDistinctDescriptor(target_idx);
901  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
902  const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
903  init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
904  } else {
905  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::UnorderedSet);
906  init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
907  }
908  }
909  }
910 }


void QueryMemoryInitializer::allocateCountDistinctGpuMem ( const QueryMemoryDescriptor &  query_mem_desc)
private

Definition at line 835 of file QueryMemoryInitializer.cpp.

References Allocator::alloc(), Bitmap, CHECK, count_distinct_bitmap_device_mem_ptr_, count_distinct_bitmap_host_crt_ptr_, count_distinct_bitmap_host_mem_ptr_, count_distinct_bitmap_mem_size_, QueryMemoryDescriptor::countDistinctDescriptorsLogicallyEmpty(), device_allocator_, QueryMemoryDescriptor::getCountDistinctDescriptor(), QueryMemoryDescriptor::getCountDistinctDescriptorsSize(), QueryMemoryDescriptor::getEntryCount(), Invalid, row_set_mem_owner_, thread_idx_, and DeviceAllocator::zeroDeviceMem().

Referenced by QueryMemoryInitializer().

836  {
837  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
838  return;
839  }
840  CHECK(device_allocator_);
841 
842  size_t total_bytes_per_entry{0};
843  const size_t num_count_distinct_descs =
844  query_mem_desc.getCountDistinctDescriptorsSize();
845  for (size_t i = 0; i < num_count_distinct_descs; i++) {
846  const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
847  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
848  continue;
849  }
850  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
851  total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
852  }
853 
854  count_distinct_bitmap_mem_size_ =
855  total_bytes_per_entry * query_mem_desc.getEntryCount();
856  count_distinct_bitmap_device_mem_ptr_ = reinterpret_cast<CUdeviceptr>(
857  device_allocator_->alloc(count_distinct_bitmap_mem_size_));
858  device_allocator_->zeroDeviceMem(
859  reinterpret_cast<int8_t*>(count_distinct_bitmap_device_mem_ptr_),
860  count_distinct_bitmap_mem_size_);
861  count_distinct_bitmap_host_crt_ptr_ = count_distinct_bitmap_host_mem_ptr_ =
862  row_set_mem_owner_->allocateCountDistinctBuffer(count_distinct_bitmap_mem_size_, thread_idx_);
863 }

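The device allocation above is the sum of the padded bitmap sizes over all bitmap-backed count-distinct slots, multiplied by the entry count, and the region is then zeroed before kernels run. The arithmetic in isolation (an illustrative helper, not the real API):

#include <cstddef>
#include <vector>

// Sketch: total device bytes for count-distinct bitmaps, per the loop above.
size_t count_distinct_gpu_bytes(const std::vector<size_t>& bitmap_padded_bytes,
                                size_t entry_count) {
  size_t total_bytes_per_entry = 0;
  for (size_t sz : bitmap_padded_bytes) {  // one element per Bitmap descriptor
    total_bytes_per_entry += sz;
  }
  return total_bytes_per_entry * entry_count;
}

// For example, bitmaps of 512 and 1024 padded bytes over 4096 entries:
// count_distinct_gpu_bytes({512, 1024}, 4096) == 6291456 bytes (6 MiB).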

int64_t QueryMemoryInitializer::allocateCountDistinctSet ( )
private

Definition at line 925 of file QueryMemoryInitializer.cpp.

References row_set_mem_owner_.

Referenced by allocateCountDistinctBuffers(), and initColumnsPerRow().

925  {
926  auto count_distinct_set = new CountDistinctSet();
927  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
928  return reinterpret_cast<int64_t>(count_distinct_set);
929 }

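The returned int64_t is simply the new set's address: it is stored in init_agg_vals_ and cast back to a set pointer inside the aggregate update path. A self-contained sketch of that round trip, with std::unordered_set standing in for the robin_hood-based CountDistinctSet:

#include <cstdint>
#include <unordered_set>

int main() {
  auto* distinct_set = new std::unordered_set<int64_t>();  // stand-in for CountDistinctSet
  const int64_t slot_val = reinterpret_cast<int64_t>(distinct_set);  // what lands in init_agg_vals_
  // ... later, in the aggregate update path:
  auto* recovered = reinterpret_cast<std::unordered_set<int64_t>*>(slot_val);
  recovered->insert(42);
  delete distinct_set;  // the real code instead hands ownership to RowSetMemoryOwner
  return 0;
}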

void QueryMemoryInitializer::allocateModeBuffer ( const QueryMemoryDescriptor &  query_mem_desc,
const RelAlgExecutionUnit &  ra_exe_unit 
)
private

Definition at line 946 of file QueryMemoryInitializer.cpp.

References CHECK_LE, CHECK_LT, RelAlgExecutionUnit::eachAggTarget(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), init_agg_vals_, kMODE, row_set_mem_owner_, and RelAlgExecutionUnit::target_exprs.

Referenced by QueryMemoryInitializer().

948  {
949  size_t const slot_count = query_mem_desc.getSlotCount();
950  CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count);
951  ra_exe_unit.eachAggTarget<kMODE>([&](Analyzer::AggExpr const*,
952  size_t const target_idx) {
953  size_t const agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
954  CHECK_LT(agg_col_idx, slot_count);
955  AggMode* agg_mode = row_set_mem_owner_->allocateMode();
956  init_agg_vals_[agg_col_idx] = reinterpret_cast<int64_t>(agg_mode);
957  });
958 }

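eachAggTarget<kMODE> invokes the lambda once per MODE target with its target index, which getSlotIndexForSingleSlotCol() then maps to the output slot whose init value becomes the AggMode object's address. A sketch of the visitation pattern over a plain list of aggregate kinds (the enum and helper are hypothetical stand-ins for the Analyzer types):

#include <cstddef>
#include <functional>
#include <vector>

enum class AggKind { kCount, kMode, kApproxQuantile };  // hypothetical stand-in

// Visit every target of the requested kind, as eachAggTarget<...> does.
void each_target_of_kind(const std::vector<AggKind>& targets,
                         AggKind kind,
                         const std::function<void(size_t)>& fn) {
  for (size_t target_idx = 0; target_idx < targets.size(); ++target_idx) {
    if (targets[target_idx] == kind) {
      fn(target_idx);  // the caller maps target_idx to its single output slot
    }
  }
}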

void QueryMemoryInitializer::allocateTDigestsBuffer ( const QueryMemoryDescriptor &  query_mem_desc,
const RelAlgExecutionUnit &  ra_exe_unit 
)
private

Definition at line 981 of file QueryMemoryInitializer.cpp.

References CHECK, CHECK_EQ, CHECK_LE, CHECK_LT, RelAlgExecutionUnit::eachAggTarget(), QueryMemoryDescriptor::getApproxQuantileDescriptors(), QueryMemoryDescriptor::getLogicalSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), init_agg_vals_, kAPPROX_QUANTILE, row_set_mem_owner_, RelAlgExecutionUnit::target_exprs, and thread_idx_.

Referenced by QueryMemoryInitializer().

983  {
984  size_t const slot_count = query_mem_desc.getSlotCount();
985  CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count);
986 
987  auto const& descs = query_mem_desc.getApproxQuantileDescriptors();
988  size_t approx_quantile_descriptors_idx = 0u;
989  ra_exe_unit.eachAggTarget<kAPPROX_QUANTILE>([&](Analyzer::AggExpr const* const agg_expr,
990  size_t const target_idx) {
991  size_t const agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
992  CHECK_LT(agg_col_idx, slot_count);
993  CHECK_EQ(static_cast<int8_t>(sizeof(int64_t)),
994  query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx));
995  auto const q_expr =
996  dynamic_cast<Analyzer::Constant const*>(agg_expr->get_arg1().get());
997  CHECK(q_expr);
998  auto const q = q_expr->get_constval().doubleval;
999  auto const& desc = descs.at(approx_quantile_descriptors_idx++);
1000  init_agg_vals_[agg_col_idx] =
1001  reinterpret_cast<int64_t>(row_set_mem_owner_->initTDigest(thread_idx_, desc, q));
1002  });
1003 }


void QueryMemoryInitializer::applyStreamingTopNOffsetCpu ( const QueryMemoryDescriptor &  query_mem_desc,
const RelAlgExecutionUnit &  ra_exe_unit 
)
private

Definition at line 1372 of file QueryMemoryInitializer.cpp.

References CHECK_EQ, CPU, streaming_top_n::get_rows_copy_from_heaps(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), SortInfo::limit, SortInfo::offset, and RelAlgExecutionUnit::sort_info.

1374  {
1375  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1376  CHECK_EQ(group_by_buffers_.size(), buffer_start_idx + 1);
1377 
1378  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
1379  group_by_buffers_[buffer_start_idx],
1380  query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
1381  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit.value_or(0),
1382  1);
1383  CHECK_EQ(rows_copy.size(),
1384  query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
1385  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
1386 }

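The heap passed to get_rows_copy_from_heaps() holds offset + limit entries, with an absent LIMIT contributing zero, so LIMIT 10 OFFSET 5 keeps 15 candidate rows per heap. A one-function sketch (illustrative name):

#include <cstddef>
#include <optional>

// Rows a streaming top-n heap must retain, as computed above.
size_t top_n_heap_entries(size_t offset, std::optional<size_t> limit) {
  return offset + limit.value_or(0);
}

// top_n_heap_entries(5, 10) == 15; top_n_heap_entries(0, std::nullopt) == 0.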

void QueryMemoryInitializer::applyStreamingTopNOffsetGpu ( Data_Namespace::DataMgr *  data_mgr,
const QueryMemoryDescriptor &  query_mem_desc,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const RelAlgExecutionUnit &  ra_exe_unit,
const unsigned  total_thread_count,
const int  device_id 
)
private

Definition at line 1388 of file QueryMemoryInitializer.cpp.

References CHECK_EQ, GpuGroupByBuffers::data, QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), num_buffers_, and UNREACHABLE.

1394  {
1395 #ifdef HAVE_CUDA
1396  CHECK_EQ(group_by_buffers_.size(), num_buffers_);
1397  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1398 
1399  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
1400  data_mgr,
1401  reinterpret_cast<int64_t*>(gpu_group_by_buffers.data),
1402  ra_exe_unit,
1403  query_mem_desc,
1404  total_thread_count,
1405  device_id);
1406  CHECK_EQ(
1407  rows_copy.size(),
1408  static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
1409  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
1410 #else
1411  UNREACHABLE();
1412 #endif
1413 }


std::vector< int64_t > QueryMemoryInitializer::calculateCountDistinctBufferSize ( const QueryMemoryDescriptor &  query_mem_desc,
const RelAlgExecutionUnit &  ra_exe_unit 
) const
private

Definition at line 865 of file QueryMemoryInitializer.cpp.

References Bitmap, CountDistinctDescriptor::bitmapPaddedSizeBytes(), CHECK, g_bigint_count, get_target_info(), QueryMemoryDescriptor::getCountDistinctDescriptor(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), is_distinct_target(), RelAlgExecutionUnit::target_exprs, and UnorderedSet.

Referenced by QueryMemoryInitializer().

867  {
868  const size_t agg_col_count{query_mem_desc.getSlotCount()};
869  std::vector<int64_t> agg_bitmap_size(agg_col_count);
870  for (size_t target_idx = 0; target_idx < ra_exe_unit.target_exprs.size();
871  ++target_idx) {
872  const auto target_expr = ra_exe_unit.target_exprs[target_idx];
873  const auto agg_info = get_target_info(target_expr, g_bigint_count);
874  if (is_distinct_target(agg_info)) {
875  const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
876  const auto& count_distinct_desc =
877  query_mem_desc.getCountDistinctDescriptor(target_idx);
878  if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
879  const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
880  agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
881  } else {
882  CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::UnorderedSet);
883  agg_bitmap_size[agg_col_idx] = -1;
884  }
885  }
886  }
887  return agg_bitmap_size;
888 }

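The returned vector is indexed by output slot: a positive entry is the padded byte size of that slot's bitmap, -1 marks an UnorderedSet implementation, and 0 means the slot carries no count-distinct state. A small decoding sketch (illustrative helper):

#include <cstdint>
#include <cstdio>
#include <vector>

// Decode the per-slot values produced by calculateCountDistinctBufferSize().
void describe_count_distinct_slots(const std::vector<int64_t>& agg_bitmap_size) {
  for (size_t slot = 0; slot < agg_bitmap_size.size(); ++slot) {
    const int64_t sz = agg_bitmap_size[slot];
    if (sz > 0) {
      std::printf("slot %zu: bitmap, %lld padded bytes\n", slot, static_cast<long long>(sz));
    } else if (sz == -1) {
      std::printf("slot %zu: unordered set\n", slot);
    }  // sz == 0: no count-distinct state in this slot
  }
}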

void QueryMemoryInitializer::compactProjectionBuffersCpu ( const QueryMemoryDescriptor &  query_mem_desc,
const size_t  projection_count 
)
private

Definition at line 1295 of file QueryMemoryInitializer.cpp.

References CHECK, anonymous_namespace{QueryMemoryInitializer.cpp}::compact_projection_buffer_for_cpu_columnar(), QueryMemoryDescriptor::getEntryCount(), group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), and result_sets_.

1297  {
1298  const auto num_allocated_rows =
1299  std::min(projection_count, query_mem_desc.getEntryCount());
1300  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1301 
1302  // copy the results from the main buffer into projection_buffer
1303  compact_projection_buffer_for_cpu_columnar(
1304  query_mem_desc,
1305  reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
1306  num_allocated_rows);
1307 
1308  // update the entry count for the result set, and its underlying storage
1309  CHECK(!result_sets_.empty());
1310  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
1311 }


void QueryMemoryInitializer::compactProjectionBuffersGpu ( const QueryMemoryDescriptor &  query_mem_desc,
Data_Namespace::DataMgr *  data_mgr,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const size_t  projection_count,
const int  device_id 
)
private

Definition at line 1313 of file QueryMemoryInitializer.cpp.

References CHECK, copy_projection_buffer_from_gpu_columnar(), QueryMemoryDescriptor::getEntryCount(), group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), and result_sets_.

1318  {
1319  // store total number of allocated rows:
1320  const auto num_allocated_rows =
1321  std::min(projection_count, query_mem_desc.getEntryCount());
1322 
1323  // copy the results from the main buffer into projection_buffer
1324  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
1325  copy_projection_buffer_from_gpu_columnar(
1326  data_mgr,
1327  gpu_group_by_buffers,
1328  query_mem_desc,
1329  reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
1330  num_allocated_rows,
1331  device_id);
1332 
1333  // update the entry count for the result set, and its underlying storage
1334  CHECK(!result_sets_.empty());
1335  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
1336 }


size_t QueryMemoryInitializer::computeNumberOfBuffers ( const QueryMemoryDescriptor &  query_mem_desc,
const ExecutorDeviceType  device_type,
const Executor *  executor 
) const
private

Definition at line 1251 of file QueryMemoryInitializer.cpp.

References QueryMemoryDescriptor::blocksShareMemory(), and CPU.

1254  {
1255  return device_type == ExecutorDeviceType::CPU
1256  ? 1
1257  : executor->blockSize() *
1258  (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
1259 }

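So a CPU query always gets one buffer, while a GPU query gets blockSize() buffers when blocks share memory and blockSize() * gridSize() buffers when each block needs its own. Restated with plain parameters and illustrative numbers:

#include <cstddef>

// Mirror of the computation above, with plain inputs (illustrative only).
size_t number_of_buffers(bool is_cpu, bool blocks_share_memory,
                         size_t block_size, size_t grid_size) {
  return is_cpu ? 1 : block_size * (blocks_share_memory ? 1 : grid_size);
}

// e.g. block_size = 128, grid_size = 16:
//   blocks share memory -> 128 buffers; otherwise -> 2048 buffers.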

void QueryMemoryInitializer::copyFromTableFunctionGpuBuffers ( Data_Namespace::DataMgr *  data_mgr,
const QueryMemoryDescriptor &  query_mem_desc,
const size_t  entry_count,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const int  device_id,
const unsigned  block_size_x,
const unsigned  grid_size_x 
)

Definition at line 1214 of file QueryMemoryInitializer.cpp.

References align_to_int64(), CHECK_LE, GpuGroupByBuffers::data, GpuGroupByBuffers::entry_count, QueryMemoryDescriptor::getBufferColSlotCount(), QueryMemoryDescriptor::getColSlotContext(), getQueryEngineCudaStreamForDevice(), ColSlotContext::getSlotInfo(), group_by_buffers_, and SlotSize::logical_size.

1221  {
1222  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
1223 
1224  int8_t* dev_buffer = gpu_group_by_buffers.data;
1225  int8_t* host_buffer = reinterpret_cast<int8_t*>(group_by_buffers_[0]);
1226 
1227  const size_t original_entry_count = gpu_group_by_buffers.entry_count;
1228  CHECK_LE(entry_count, original_entry_count);
1229  size_t output_device_col_offset{0};
1230  size_t output_host_col_offset{0};
1231 
1232  const auto col_slot_context = query_mem_desc.getColSlotContext();
1233 
1234  auto allocator = std::make_unique<CudaAllocator>(
1235  data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));
1236 
1237  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
1238  const size_t col_width = col_slot_context.getSlotInfo(col_idx).logical_size;
1239  const size_t output_device_col_size = original_entry_count * col_width;
1240  const size_t output_host_col_size = entry_count * col_width;
1241  allocator->copyFromDevice(host_buffer + output_host_col_offset,
1242  dev_buffer + output_device_col_offset,
1243  output_host_col_size);
1244  output_device_col_offset =
1245  align_to_int64(output_device_col_offset + output_device_col_size);
1246  output_host_col_offset =
1247  align_to_int64(output_host_col_offset + output_host_col_size);
1248  }
1249 }

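Each per-column copy advances the device and host offsets by that column's byte footprint rounded up to the next 8-byte boundary, so every column stays int64-aligned. A sketch of the offset walk, with align_to_int64 reimplemented locally:

#include <cstddef>
#include <vector>

// Round up to the next multiple of 8 bytes, as align_to_int64 does.
size_t align_to_int64_local(size_t n) {
  return (n + 7) & ~size_t(7);
}

// Host-side start offsets for a columnar copy of entry_count rows per column.
std::vector<size_t> column_start_offsets(const std::vector<size_t>& col_widths,
                                         size_t entry_count) {
  std::vector<size_t> offsets;
  size_t offset = 0;
  for (size_t width : col_widths) {
    offsets.push_back(offset);
    offset = align_to_int64_local(offset + width * entry_count);
  }
  return offsets;
}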

void QueryMemoryInitializer::copyGroupByBuffersFromGpu ( DeviceAllocator &  device_allocator,
const QueryMemoryDescriptor &  query_mem_desc,
const size_t  entry_count,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const RelAlgExecutionUnit *  ra_exe_unit,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const int  device_id,
const bool  prepend_index_buffer 
) const

Definition at line 1338 of file QueryMemoryInitializer.cpp.

References copy_group_by_buffers_from_gpu(), GpuGroupByBuffers::data, streaming_top_n::get_heap_size(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getRowSize(), GPU, group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), SortInfo::limit, anonymous_namespace{Utm.h}::n, SortInfo::offset, RelAlgExecutionUnit::sort_info, and QueryMemoryDescriptor::useStreamingTopN().

1347  {
1348  const auto thread_count = block_size_x * grid_size_x;
1349 
1350  size_t total_buff_size{0};
1351  if (ra_exe_unit && query_mem_desc.useStreamingTopN()) {
1352  const size_t n =
1353  ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit.value_or(0);
1354  total_buff_size =
1355  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
1356  } else {
1357  total_buff_size =
1358  query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
1359  }
1360  copy_group_by_buffers_from_gpu(device_allocator,
1361  group_by_buffers_,
1362  total_buff_size,
1363  gpu_group_by_buffers.data,
1364  query_mem_desc,
1365  block_size_x,
1366  grid_size_x,
1367  device_id,
1368  prepend_index_buffer,
1369  query_mem_desc.hasVarlenOutput());
1370 }


GpuGroupByBuffers QueryMemoryInitializer::createAndInitializeGroupByBufferGpu ( const RelAlgExecutionUnit &  ra_exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const int8_t *  init_agg_vals_dev_ptr,
const int  device_id,
const ExecutorDispatchMode  dispatch_mode,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const int8_t  warp_size,
const bool  can_sort_on_gpu,
const bool  output_columnar,
RenderAllocator *  render_allocator 
)
private

Definition at line 1059 of file QueryMemoryInitializer.cpp.

References Allocator::alloc(), CHECK, CHECK_EQ, DeviceAllocator::copyToDevice(), create_dev_group_by_buffers(), device_allocator_, RenderAllocator::getAllocatedSize(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEffectiveKeyWidth(), QueryMemoryDescriptor::getEntryCount(), getGroupByBuffersSize(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getRowSize(), QueryMemoryDescriptor::getSlotCount(), GPU, group_by_buffers_, QueryMemoryDescriptor::hasKeylessHash(), QueryMemoryDescriptor::hasVarlenOutput(), init_columnar_group_by_buffer_on_device(), init_group_by_buffer_on_device(), QueryMemoryDescriptor::interleavedBins(), QueryMemoryDescriptor::lazyInitGroups(), SortInfo::limit, anonymous_namespace{Utm.h}::n, num_rows_, SortInfo::offset, prepareTopNHeapsDevBuffer(), row_set_mem_owner_, RelAlgExecutionUnit::sort_info, thread_idx_, QueryMemoryDescriptor::threadsShareMemory(), UNREACHABLE, RelAlgExecutionUnit::use_bump_allocator, QueryMemoryDescriptor::useStreamingTopN(), varlen_output_buffer_, varlen_output_buffer_host_ptr_, varlen_output_info_, and QueryMemoryDescriptor::varlenOutputBufferElemSize().

1070  {
1071 #ifdef HAVE_CUDA
1072  if (query_mem_desc.useStreamingTopN()) {
1073  if (render_allocator) {
1075  }
1076  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit.value_or(0);
1077  CHECK(!output_columnar);
1078 
1079  return prepareTopNHeapsDevBuffer(
1080  query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
1081  }
1082 
1083  auto dev_group_by_buffers =
1084  create_dev_group_by_buffers(device_allocator_,
1085  group_by_buffers_,
1086  query_mem_desc,
1087  block_size_x,
1088  grid_size_x,
1089  device_id,
1090  dispatch_mode,
1091  num_rows_,
1092  can_sort_on_gpu,
1093  false,
1094  ra_exe_unit.use_bump_allocator,
1095  query_mem_desc.hasVarlenOutput(),
1096  render_allocator);
1097  if (query_mem_desc.hasVarlenOutput()) {
1098  CHECK(dev_group_by_buffers.varlen_output_buffer);
1099  varlen_output_buffer_ =
1100  reinterpret_cast<CUdeviceptr>(dev_group_by_buffers.varlen_output_buffer);
1101  CHECK(query_mem_desc.varlenOutputBufferElemSize());
1102  const size_t varlen_output_buf_bytes =
1103  query_mem_desc.getEntryCount() *
1104  query_mem_desc.varlenOutputBufferElemSize().value();
1105  varlen_output_buffer_host_ptr_ =
1106  row_set_mem_owner_->allocate(varlen_output_buf_bytes, thread_idx_);
1107  CHECK(varlen_output_info_);
1108  varlen_output_info_->gpu_start_address = static_cast<int64_t>(varlen_output_buffer_);
1109  varlen_output_info_->cpu_buffer_ptr = varlen_output_buffer_host_ptr_;
1110  }
1111  if (render_allocator) {
1112  CHECK_EQ(size_t(0), render_allocator->getAllocatedSize() % 8);
1113  }
1114  if (query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU)) {
1115  CHECK(!render_allocator);
1116 
1117  const size_t step{query_mem_desc.threadsShareMemory() ? block_size_x : 1};
1118  size_t groups_buffer_size{query_mem_desc.getBufferSizeBytes(
1119  ExecutorDeviceType::GPU, dev_group_by_buffers.entry_count)};
1120  auto group_by_dev_buffer = dev_group_by_buffers.data;
1121  const size_t col_count = query_mem_desc.getSlotCount();
1122  int8_t* col_widths_dev_ptr{nullptr};
1123  if (output_columnar) {
1124  std::vector<int8_t> compact_col_widths(col_count);
1125  for (size_t idx = 0; idx < col_count; ++idx) {
1126  compact_col_widths[idx] = query_mem_desc.getPaddedSlotWidthBytes(idx);
1127  }
1128  col_widths_dev_ptr = device_allocator_->alloc(col_count * sizeof(int8_t));
1129  device_allocator_->copyToDevice(
1130  col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
1131  }
1132  const int8_t warp_count =
1133  query_mem_desc.interleavedBins(ExecutorDeviceType::GPU) ? warp_size : 1;
1134  const auto num_group_by_buffers =
1135  getGroupByBuffersSize() - (query_mem_desc.hasVarlenOutput() ? 1 : 0);
1136  for (size_t i = 0; i < num_group_by_buffers; i += step) {
1137  if (output_columnar) {
1138  init_columnar_group_by_buffer_on_device(
1139  reinterpret_cast<int64_t*>(group_by_dev_buffer),
1140  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
1141  dev_group_by_buffers.entry_count,
1142  query_mem_desc.getGroupbyColCount(),
1143  col_count,
1144  col_widths_dev_ptr,
1145  /*need_padding = */ true,
1146  query_mem_desc.hasKeylessHash(),
1147  sizeof(int64_t),
1148  block_size_x,
1149  grid_size_x);
1150  } else {
1151  init_group_by_buffer_on_device(
1152  reinterpret_cast<int64_t*>(group_by_dev_buffer),
1153  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
1154  dev_group_by_buffers.entry_count,
1155  query_mem_desc.getGroupbyColCount(),
1156  query_mem_desc.getEffectiveKeyWidth(),
1157  query_mem_desc.getRowSize() / sizeof(int64_t),
1158  query_mem_desc.hasKeylessHash(),
1159  warp_count,
1160  block_size_x,
1161  grid_size_x);
1162  }
1163  group_by_dev_buffer += groups_buffer_size;
1164  }
1165  }
1166  return dev_group_by_buffers;
1167 #else
1168  UNREACHABLE();
1169  return {};
1170 #endif
1171 }


int64_t QueryMemoryInitializer::getAggInitValForIndex ( const size_t  index) const
inline

Definition at line 111 of file QueryMemoryInitializer.h.

References CHECK_LT, and init_agg_vals_.

111  {
112  CHECK_LT(index, init_agg_vals_.size());
113  return init_agg_vals_[index];
114  }
const auto QueryMemoryInitializer::getCountDistinctBitmapBytes ( ) const
inline

Definition at line 87 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_mem_size_.

87  {
88  return count_distinct_bitmap_mem_size_;
89  }
const auto QueryMemoryInitializer::getCountDistinctBitmapDevicePtr ( ) const
inline

Definition at line 79 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_device_mem_ptr_.

79  {
80  return count_distinct_bitmap_device_mem_ptr_;
81  }
const auto QueryMemoryInitializer::getCountDistinctBitmapHostPtr ( ) const
inline

Definition at line 83 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_host_mem_ptr_.

83  {
84  return count_distinct_bitmap_host_mem_ptr_;
85  }
const auto QueryMemoryInitializer::getGroupByBuffersPtr ( )
inline

Definition at line 116 of file QueryMemoryInitializer.h.

References group_by_buffers_.

116  {
117  return reinterpret_cast<int64_t**>(group_by_buffers_.data());
118  }
const auto QueryMemoryInitializer::getGroupByBuffersSize ( ) const
inline

Definition at line 120 of file QueryMemoryInitializer.h.

References group_by_buffers_.

Referenced by createAndInitializeGroupByBufferGpu().

120 { return group_by_buffers_.size(); }


const auto QueryMemoryInitializer::getNumBuffers ( ) const
inline

Definition at line 122 of file QueryMemoryInitializer.h.

References CHECK_EQ, group_by_buffers_, and num_buffers_.

122  {
123  CHECK_EQ(group_by_buffers_.size(), num_buffers_);
124  return num_buffers_;
125  }
ResultSet* QueryMemoryInitializer::getResultSet ( const size_t  index) const
inline

Definition at line 96 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

96  {
97  CHECK_LT(index, result_sets_.size());
98  return result_sets_[index].get();
99  }
std::unique_ptr<ResultSet> QueryMemoryInitializer::getResultSetOwned ( const size_t  index)
inline

Definition at line 101 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

101  {
102  CHECK_LT(index, result_sets_.size());
103  return std::move(result_sets_[index]);
104  }
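A standalone sketch of the ownership contract implied by getResultSet(), getResultSetOwned(), and resetResultSet() (ResultSetStub is a stand-in type, not the engine's ResultSet): getResultSetOwned() moves the unique_ptr out of result_sets_, leaving a null slot behind.

#include <cassert>
#include <memory>
#include <vector>

struct ResultSetStub {};

int main() {
  std::vector<std::unique_ptr<ResultSetStub>> result_sets;
  result_sets.push_back(std::make_unique<ResultSetStub>());

  auto owned = std::move(result_sets[0]);  // getResultSetOwned(0)
  assert(owned && !result_sets[0]);        // the source slot now holds nullptr

  result_sets[0].reset();                  // resetResultSet(0): no-op here
  return 0;
}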
const auto QueryMemoryInitializer::getVarlenOutputHostPtr ( ) const
inline

Definition at line 92 of file QueryMemoryInitializer.h.

References varlen_output_buffer_host_ptr_.

92  { return varlen_output_buffer_host_ptr_; }
std::shared_ptr< VarlenOutputInfo > QueryMemoryInitializer::getVarlenOutputInfo ( )
private

Definition at line 1415 of file QueryMemoryInitializer.cpp.

References varlen_output_buffer_, varlen_output_buffer_host_ptr_, and varlen_output_info_.

Referenced by QueryMemoryInitializer().

1415  {
1416  if (varlen_output_info_) {
1417  return varlen_output_info_;
1418  }
1419 
1420  // shared_ptr so that both the ResultSet and QMI can hold on to the varlen info object
1421  // and update it as needed
1422  varlen_output_info_ = std::make_shared<VarlenOutputInfo>(VarlenOutputInfo{
1423  static_cast<int64_t>(varlen_output_buffer_), varlen_output_buffer_host_ptr_});
1424  return varlen_output_info_;
1425 }

const auto QueryMemoryInitializer::getVarlenOutputPtr ( ) const
inline

Definition at line 94 of file QueryMemoryInitializer.h.

References varlen_output_buffer_.

94 { return varlen_output_buffer_; }
void QueryMemoryInitializer::initColumnarGroups ( const QueryMemoryDescriptor query_mem_desc,
int64_t *  groups_buffer,
const std::vector< int64_t > &  init_vals,
const Executor executor,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 704 of file QueryMemoryInitializer.cpp.

References align_to_int64(), CHECK, CHECK_LT, EMPTY_KEY_64, g_bigint_count, get_target_info(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getQueryDescriptionType(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::hasKeylessHash(), is_distinct_target(), heavyai::Projection, and RelAlgExecutionUnit::target_exprs.

Referenced by initGroupByBuffer().

709  {
710  CHECK(groups_buffer);
711 
712  for (const auto target_expr : ra_exe_unit.target_exprs) {
713  const auto agg_info = get_target_info(target_expr, g_bigint_count);
714  CHECK(!is_distinct_target(agg_info));
715  }
716  const int32_t agg_col_count = query_mem_desc.getSlotCount();
717  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
718 
719  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
720  if (!query_mem_desc.hasKeylessHash()) {
721  const size_t key_count{query_mem_desc.getGroupbyColCount()};
722  for (size_t i = 0; i < key_count; ++i) {
723  buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
724  EMPTY_KEY_64,
725  groups_buffer_entry_count);
726  }
727  }
728 
729  if (query_mem_desc.getQueryDescriptionType() != QueryDescriptionType::Projection) {
730  // initializing all aggregate columns:
731  int32_t init_val_idx = 0;
732  for (int32_t i = 0; i < agg_col_count; ++i) {
733  if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
734  CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
735  switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
736  case 1:
737  buffer_ptr = initColumnarBuffer<int8_t>(
738  buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
739  break;
740  case 2:
741  buffer_ptr =
742  initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
743  init_vals[init_val_idx++],
744  groups_buffer_entry_count);
745  break;
746  case 4:
747  buffer_ptr =
748  initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
749  init_vals[init_val_idx++],
750  groups_buffer_entry_count);
751  break;
752  case 8:
753  buffer_ptr =
754  initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
755  init_vals[init_val_idx++],
756  groups_buffer_entry_count);
757  break;
758  case 0:
759  break;
760  default:
761  CHECK(false);
762  }
763 
764  buffer_ptr = align_to_int64(buffer_ptr);
765  }
766  }
767  }
768 }
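initColumnarBuffer<T>() is defined elsewhere in QueryMemoryInitializer.cpp; a minimal sketch of its apparent contract as used above (an assumption, not the engine's exact definition): fill one column's run of entries with an init value and return the pointer advanced past the run, so successive key and aggregate columns can be initialized back to back.

#include <cstdint>
#include <vector>

template <typename T>
int8_t* init_columnar_buffer_sketch(T* buffer_ptr, const T init_val,
                                    const uint32_t entry_count) {
  for (uint32_t i = 0; i < entry_count; ++i) {
    buffer_ptr[i] = init_val;  // fill this column's run of entries
  }
  // Advance past the run so the next column starts right after it.
  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
}

int main() {
  std::vector<int64_t> column(16);
  init_columnar_buffer_sketch<int64_t>(
      column.data(), -1, static_cast<uint32_t>(column.size()));
  return 0;
}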

void QueryMemoryInitializer::initColumnsPerRow ( const QueryMemoryDescriptor query_mem_desc,
int8_t *  row_ptr,
const std::vector< int64_t > &  init_vals,
const TargetAggOpsMetadata agg_op_metadata 
)
private

Definition at line 770 of file QueryMemoryInitializer.cpp.

References allocateCountDistinctBitmap(), allocateCountDistinctSet(), CHECK, CHECK_EQ, CHECK_LT, CHECK_NE, QueryMemoryInitializer::TargetAggOpsMetadata::count_distinct_buf_size, QueryMemoryDescriptor::getApproxQuantileDescriptors(), QueryMemoryDescriptor::getNextColOffInBytesRowOnly(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryInitializer::TargetAggOpsMetadata::has_count_distinct, QueryMemoryInitializer::TargetAggOpsMetadata::has_mode, QueryMemoryInitializer::TargetAggOpsMetadata::has_tdigest, QueryMemoryDescriptor::isGroupBy(), QueryMemoryInitializer::TargetAggOpsMetadata::mode_index_set, QueryMemoryInitializer::TargetAggOpsMetadata::quantile_params, row_set_mem_owner_, and thread_idx_.

Referenced by initRowGroups().

774  {
775  int8_t* col_ptr = row_ptr;
776  size_t init_vec_idx = 0;
777  size_t approx_quantile_descriptors_idx = 0;
778  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
779  col_ptr += query_mem_desc.getNextColOffInBytesRowOnly(col_ptr, col_idx++)) {
780  int64_t init_val{0};
781  if (query_mem_desc.isGroupBy()) {
782  if (agg_op_metadata.has_count_distinct &&
783  agg_op_metadata.count_distinct_buf_size[col_idx]) {
784  // COUNT DISTINCT / APPROX_COUNT_DISTINCT
785  // create a data structure for count_distinct operator per entries
786  const int64_t bm_sz{agg_op_metadata.count_distinct_buf_size[col_idx]};
787  CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
788  sizeof(int64_t));
789  init_val =
790  bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
791  CHECK_NE(init_val, 0);
792  ++init_vec_idx;
793  } else if (agg_op_metadata.has_tdigest &&
794  agg_op_metadata.quantile_params[col_idx]) {
795  auto const q = *agg_op_metadata.quantile_params[col_idx];
796  auto const& descs = query_mem_desc.getApproxQuantileDescriptors();
797  auto const& desc = descs.at(approx_quantile_descriptors_idx++);
798  init_val = reinterpret_cast<int64_t>(
799  row_set_mem_owner_->initTDigest(thread_idx_, desc, q));
800  CHECK_NE(init_val, 0);
801  ++init_vec_idx;
802  } else if (agg_op_metadata.has_mode &&
803  agg_op_metadata.mode_index_set.count(col_idx)) {
804  init_val = reinterpret_cast<int64_t>(row_set_mem_owner_->allocateMode());
805  CHECK_NE(init_val, 0);
806  ++init_vec_idx;
807  }
808  }
809  auto const col_slot_width = query_mem_desc.getPaddedSlotWidthBytes(col_idx);
810  if (init_val == 0 && col_slot_width > 0) {
811  CHECK_LT(init_vec_idx, init_vals.size());
812  init_val = init_vals[init_vec_idx++];
813  }
814  switch (col_slot_width) {
815  case 1:
816  *col_ptr = static_cast<int8_t>(init_val);
817  break;
818  case 2:
819  *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
820  break;
821  case 4:
822  *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
823  break;
824  case 8:
825  *reinterpret_cast<int64_t*>(col_ptr) = init_val;
826  break;
827  case 0:
828  continue;
829  default:
830  CHECK(false);
831  }
832  }
833 }
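Note the trick in the group-by branch above: for COUNT DISTINCT, APPROX_QUANTILE, and MODE the 8-byte slot is seeded with the address of a per-entry accumulator rather than a numeric init value. A standalone sketch of that pointer-in-slot pattern (std::set stands in for the engine's count-distinct set; real ownership lives in RowSetMemoryOwner):

#include <cstdint>
#include <set>

int main() {
  auto* distinct_set = new std::set<int64_t>();
  // The accumulator's address is stored in the 8-byte aggregate slot.
  int64_t slot = reinterpret_cast<int64_t>(distinct_set);

  // Later, aggregation code recovers the accumulator from the slot value.
  auto* recovered = reinterpret_cast<std::set<int64_t>*>(slot);
  recovered->insert(42);

  delete distinct_set;
  return 0;
}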

void QueryMemoryInitializer::initGroupByBuffer ( int64_t *  buffer,
const RelAlgExecutionUnit ra_exe_unit,
const QueryMemoryDescriptor query_mem_desc,
TargetAggOpsMetadata agg_expr_metadata,
const ExecutorDeviceType  device_type,
const bool  output_columnar,
const Executor executor 
)
private

Definition at line 568 of file QueryMemoryInitializer.cpp.

References streaming_top_n::get_rows_offset_of_heaps(), QueryMemoryDescriptor::getEntryCount(), GPU, init_agg_vals_, initColumnarGroups(), initRowGroups(), QueryMemoryDescriptor::interleavedBins(), SortInfo::limit, anonymous_namespace{Utm.h}::n, SortInfo::offset, RelAlgExecutionUnit::sort_info, and QueryMemoryDescriptor::useStreamingTopN().

Referenced by QueryMemoryInitializer().

575  {
576  if (output_columnar) {
577  initColumnarGroups(query_mem_desc, buffer, init_agg_vals_, executor, ra_exe_unit);
578  } else {
579  auto rows_ptr = buffer;
580  auto actual_entry_count = query_mem_desc.getEntryCount();
581  const auto thread_count = device_type == ExecutorDeviceType::GPU
582  ? executor->blockSize() * executor->gridSize()
583  : 1;
584  auto warp_size =
585  query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
586  if (query_mem_desc.useStreamingTopN()) {
587  const auto node_count_size = thread_count * sizeof(int64_t);
588  memset(rows_ptr, 0, node_count_size);
589  const auto n =
590  ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit.value_or(0);
591  const auto rows_offset = streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
592  memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
593  rows_ptr += rows_offset / sizeof(int64_t);
594  actual_entry_count = n * thread_count;
595  warp_size = 1;
596  }
597  initRowGroups(query_mem_desc,
598  rows_ptr,
599  init_agg_vals_,
600  agg_op_metadata,
601  actual_entry_count,
602  warp_size,
603  executor,
604  ra_exe_unit);
605  }
606 }
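For the streaming top-n path, the buffer prefix holds one int64 node counter per thread (zeroed) followed by a heap-index area (filled with -1) before the row storage begins. A standalone sketch of that layout arithmetic (the rows_offset formula here is an assumption that matches the memset calls above; the real value comes from streaming_top_n::get_rows_offset_of_heaps()):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

int main() {
  const size_t thread_count = 4;
  const size_t n = 10;  // sort offset + limit
  const size_t node_count_size = thread_count * sizeof(int64_t);
  const size_t rows_offset =
      node_count_size + thread_count * n * sizeof(int64_t);  // assumed layout

  std::vector<int64_t> buffer(rows_offset / sizeof(int64_t) + n * thread_count);
  // Zero the per-thread node counters, then mark the heap-index area empty.
  std::memset(buffer.data(), 0, node_count_size);
  std::memset(buffer.data() + thread_count, -1, rows_offset - node_count_size);
  return 0;
}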

QueryMemoryInitializer::ModeIndexSet QueryMemoryInitializer::initializeModeIndexSet ( const QueryMemoryDescriptor query_mem_desc,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 931 of file QueryMemoryInitializer.cpp.

References CHECK_LE, CHECK_LT, RelAlgExecutionUnit::eachAggTarget(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), kMODE, and RelAlgExecutionUnit::target_exprs.

Referenced by QueryMemoryInitializer().

933  {
934  size_t const slot_count = query_mem_desc.getSlotCount();
935  CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count);
936  ModeIndexSet mode_index_set;
937  ra_exe_unit.eachAggTarget<kMODE>([&](Analyzer::AggExpr const*,
938  size_t const target_idx) {
939  size_t const agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
940  CHECK_LT(agg_col_idx, slot_count);
941  mode_index_set.emplace(agg_col_idx);
942  });
943  return mode_index_set;
944 }
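A standalone analogue of the eachAggTarget<kMODE> traversal above (string views stand in for Analyzer expressions, and the slot index is taken to equal the target index, whereas the engine maps it through getSlotIndexForSingleSlotCol()):

#include <cstddef>
#include <string_view>
#include <unordered_set>
#include <vector>

int main() {
  const std::vector<std::string_view> target_kinds{"SUM", "MODE", "COUNT", "MODE"};
  std::unordered_set<size_t> mode_index_set;
  for (size_t target_idx = 0; target_idx < target_kinds.size(); ++target_idx) {
    if (target_kinds[target_idx] == "MODE") {
      // The engine records the physical output slot, not the target index.
      mode_index_set.emplace(target_idx);
    }
  }
  return 0;
}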

std::vector< QueryMemoryInitializer::QuantileParam > QueryMemoryInitializer::initializeQuantileParams ( const QueryMemoryDescriptor query_mem_desc,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 961 of file QueryMemoryInitializer.cpp.

References CHECK, CHECK_EQ, CHECK_LE, CHECK_LT, RelAlgExecutionUnit::eachAggTarget(), QueryMemoryDescriptor::getLogicalSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), kAPPROX_QUANTILE, and RelAlgExecutionUnit::target_exprs.

Referenced by QueryMemoryInitializer().

963  {
964  size_t const slot_count = query_mem_desc.getSlotCount();
965  CHECK_LE(ra_exe_unit.target_exprs.size(), slot_count);
966  std::vector<QuantileParam> quantile_params(slot_count);
967  ra_exe_unit.eachAggTarget<kAPPROX_QUANTILE>([&](Analyzer::AggExpr const* const agg_expr,
968  size_t const target_idx) {
969  size_t const agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
970  CHECK_LT(agg_col_idx, slot_count);
971  CHECK_EQ(static_cast<int8_t>(sizeof(int64_t)),
972  query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx));
973  auto const q_expr =
974  dynamic_cast<Analyzer::Constant const*>(agg_expr->get_arg1().get());
975  CHECK(q_expr);
976  quantile_params[agg_col_idx] = q_expr->get_constval().doubleval;
977  });
978  return quantile_params;
979 }
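The result shape is worth noting: quantile_params has one entry per output slot, and only slots owned by an APPROX_QUANTILE target hold a value. A standalone sketch of how downstream code (initColumnsPerRow() above) can test slot occupancy:

#include <cstddef>
#include <optional>
#include <vector>

int main() {
  const size_t slot_count = 4;
  std::vector<std::optional<double>> quantile_params(slot_count);
  quantile_params[2] = 0.5;  // e.g. APPROX_QUANTILE(x, 0.5) landing in slot 2

  for (size_t i = 0; i < slot_count; ++i) {
    if (quantile_params[i]) {
      // ... seed a t-digest for this slot, as initColumnsPerRow() does ...
    }
  }
  return 0;
}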

void QueryMemoryInitializer::initRowGroups ( const QueryMemoryDescriptor query_mem_desc,
int64_t *  groups_buffer,
const std::vector< int64_t > &  init_vals,
TargetAggOpsMetadata agg_expr_metadata,
const int32_t  groups_buffer_entry_count,
const size_t  warp_size,
const Executor executor,
const RelAlgExecutionUnit ra_exe_unit 
)
private

Definition at line 608 of file QueryMemoryInitializer.cpp.

References CHECK, cpu_threads(), result_set::fill_empty_key(), ResultSet::fixupQueryMemoryDescriptor(), g_optimize_row_initialization, QueryMemoryDescriptor::getAvailableCpuThreads(), QueryMemoryDescriptor::getColOffInBytes(), QueryMemoryDescriptor::getEffectiveKeyWidth(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getRowSize(), QueryMemoryInitializer::TargetAggOpsMetadata::has_count_distinct, QueryMemoryInitializer::TargetAggOpsMetadata::has_mode, QueryMemoryInitializer::TargetAggOpsMetadata::has_tdigest, QueryMemoryDescriptor::hasKeylessHash(), initColumnsPerRow(), and threading_serial::parallel_for().

Referenced by initGroupByBuffer().

615  {
616  const size_t key_count{query_mem_desc.getGroupbyColCount()};
617  const size_t row_size{query_mem_desc.getRowSize()};
618  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};
619 
620  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
621  const auto query_mem_desc_fixedup =
622  ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);
623  auto const key_sz = query_mem_desc.getEffectiveKeyWidth();
624  // not COUNT DISTINCT / APPROX_COUNT_DISTINCT / APPROX_QUANTILE
625  // we use the default implementation in those agg ops
626  if (!(agg_op_metadata.has_count_distinct || agg_op_metadata.has_mode ||
627  agg_op_metadata.has_tdigest) &&
628  g_optimize_row_initialization) {
629  std::vector<int8_t> sample_row(row_size - col_base_off);
630  auto const num_available_cpu_threads =
631  std::min(query_mem_desc.getAvailableCpuThreads(),
632  static_cast<size_t>(std::max(cpu_threads(), 1)));
633  tbb::task_arena initialization_arena(num_available_cpu_threads);
634 
635  initColumnsPerRow(
636  query_mem_desc_fixedup, sample_row.data(), init_vals, agg_op_metadata);
637 
638  if (query_mem_desc.hasKeylessHash()) {
639  CHECK(warp_size >= 1);
640  CHECK(key_count == 1 || warp_size == 1);
641  initialization_arena.execute([&] {
642  threading::parallel_for(
643  tbb::blocked_range<size_t>(0, groups_buffer_entry_count * warp_size),
644  [&](const tbb::blocked_range<size_t>& r) {
645  auto cur_row_buf = buffer_ptr + (row_size * r.begin());
646  for (size_t i = r.begin(); i != r.end(); ++i, cur_row_buf += row_size) {
647  memcpy(cur_row_buf + col_base_off, sample_row.data(), sample_row.size());
648  }
649  });
650  });
651  return;
652  }
653  initialization_arena.execute([&] {
654  threading::parallel_for(
655  tbb::blocked_range<size_t>(0, groups_buffer_entry_count),
656  [&](const tbb::blocked_range<size_t>& r) {
657  auto cur_row_buf = buffer_ptr + (row_size * r.begin());
658  for (size_t i = r.begin(); i != r.end(); ++i, cur_row_buf += row_size) {
659  memcpy(cur_row_buf + col_base_off, sample_row.data(), sample_row.size());
660  result_set::fill_empty_key(cur_row_buf, key_count, key_sz);
661  }
662  });
663  });
664  } else {
665  // todo(yoonmin): allow parallelization of `initColumnsPerRow`
666  if (query_mem_desc.hasKeylessHash()) {
667  CHECK(warp_size >= 1);
668  CHECK(key_count == 1 || warp_size == 1);
669  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
670  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
671  ++bin, buffer_ptr += row_size) {
672  initColumnsPerRow(query_mem_desc_fixedup,
673  &buffer_ptr[col_base_off],
674  init_vals,
675  agg_op_metadata);
676  }
677  }
678  return;
679  }
680 
681  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
682  ++bin, buffer_ptr += row_size) {
683  result_set::fill_empty_key(
684  buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
685  initColumnsPerRow(
686  query_mem_desc_fixedup, &buffer_ptr[col_base_off], init_vals, agg_op_metadata);
687  }
688  }
689 }
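A standalone sketch of the g_optimize_row_initialization fast path above: initialize a single sample row once, then replicate it into every bin with memcpy under tbb::parallel_for (row size and entry count here are arbitrary stand-ins):

#include <cstdint>
#include <cstring>
#include <vector>
#include <tbb/blocked_range.h>
#include <tbb/parallel_for.h>

int main() {
  const size_t row_size = 64;
  const size_t entry_count = 1024;

  std::vector<int8_t> sample_row(row_size, 0);  // initColumnsPerRow() result
  std::vector<int8_t> groups_buffer(row_size * entry_count);

  // Blast the prebuilt row into every bin instead of re-deriving each slot.
  tbb::parallel_for(tbb::blocked_range<size_t>(0, entry_count),
                    [&](const tbb::blocked_range<size_t>& r) {
                      auto* cur = groups_buffer.data() + row_size * r.begin();
                      for (size_t i = r.begin(); i != r.end();
                           ++i, cur += row_size) {
                        std::memcpy(cur, sample_row.data(), row_size);
                      }
                    });
  return 0;
}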

GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer ( const QueryMemoryDescriptor query_mem_desc,
const int8_t *  init_agg_vals_dev_ptr,
const size_t  n,
const int  device_id,
const unsigned  block_size_x,
const unsigned  grid_size_x 
)
private

Definition at line 1005 of file QueryMemoryInitializer.cpp.

References Allocator::alloc(), CHECK, DeviceAllocator::copyToDevice(), device_allocator_, streaming_top_n::get_heap_size(), streaming_top_n::get_rows_offset_of_heaps(), QueryMemoryDescriptor::getEffectiveKeyWidth(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getRowSize(), GPU, QueryMemoryDescriptor::hasKeylessHash(), init_group_by_buffer_on_device(), QueryMemoryDescriptor::lazyInitGroups(), anonymous_namespace{Utm.h}::n, DeviceAllocator::setDeviceMem(), UNREACHABLE, and DeviceAllocator::zeroDeviceMem().

Referenced by createAndInitializeGroupByBufferGpu().

1011  {
1012 #ifdef HAVE_CUDA
1014  const auto thread_count = block_size_x * grid_size_x;
1015  const auto total_buff_size =
1016  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
1017  int8_t* dev_buffer = device_allocator_->alloc(total_buff_size);
1018 
1019  std::vector<int8_t*> dev_buffers(thread_count);
1020 
1021  for (size_t i = 0; i < thread_count; ++i) {
1022  dev_buffers[i] = dev_buffer;
1023  }
1024 
1025  auto dev_ptr = device_allocator_->alloc(thread_count * sizeof(int8_t*));
1026  device_allocator_->copyToDevice(
1027  dev_ptr, dev_buffers.data(), thread_count * sizeof(int8_t*));
1028 
1029  CHECK(query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU));
1030 
1031  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(dev_buffer),
1032  thread_count * sizeof(int64_t));
1033 
1034  device_allocator_->setDeviceMem(
1035  reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
1036  (unsigned char)-1,
1037  thread_count * n * sizeof(int64_t));
1038 
1039  init_group_by_buffer_on_device(
1040  reinterpret_cast<int64_t*>(
1041  dev_buffer + streaming_top_n::get_rows_offset_of_heaps(n, thread_count)),
1042  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
1043  n * thread_count,
1044  query_mem_desc.getGroupbyColCount(),
1045  query_mem_desc.getEffectiveKeyWidth(),
1046  query_mem_desc.getRowSize() / sizeof(int64_t),
1047  query_mem_desc.hasKeylessHash(),
1048  1,
1049  block_size_x,
1050  grid_size_x);
1051 
1052  return {dev_ptr, dev_buffer};
1053 #else
1054  UNREACHABLE();
1055  return {};
1056 #endif
1057 }
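A back-of-the-envelope sketch of the per-device allocation this function carves up, matching the zeroDeviceMem/setDeviceMem calls above (the exact formula lives in streaming_top_n::get_heap_size(), so treat this as an assumption): per GPU thread, one int64 node counter, n heap-key slots, and n output rows.

#include <cstddef>
#include <cstdint>
#include <cstdio>

size_t heap_size_sketch(const size_t row_size, const size_t n,
                        const size_t thread_count) {
  const size_t node_counters = thread_count * sizeof(int64_t);  // zeroed
  const size_t heap_keys = thread_count * n * sizeof(int64_t);  // set to -1
  const size_t rows = thread_count * n * row_size;              // initialized rows
  return node_counters + heap_keys + rows;
}

int main() {
  std::printf("%zu\n",
              heap_size_sketch(/*row_size=*/32, /*n=*/10, /*thread_count=*/1024));
  return 0;
}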

void QueryMemoryInitializer::resetResultSet ( const size_t  index)
inline

Definition at line 106 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

106  {
107  CHECK_LT(index, result_sets_.size());
108  result_sets_[index].reset();
109  }
GpuGroupByBuffers QueryMemoryInitializer::setupTableFunctionGpuBuffers ( const QueryMemoryDescriptor query_mem_desc,
const int  device_id,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const bool  zero_initialize_buffers 
)

Definition at line 1173 of file QueryMemoryInitializer.cpp.

References align_to_int64(), Allocator::alloc(), CHECK, CHECK_GT, DeviceAllocator::copyToDevice(), device_allocator_, QueryMemoryDescriptor::getBufferColSlotCount(), QueryMemoryDescriptor::getColSlotContext(), ColSlotContext::getSlotInfo(), SlotSize::logical_size, num_rows_, and DeviceAllocator::zeroDeviceMem().

1178  {
1179  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
1180  CHECK_GT(num_columns, size_t(0));
1181  size_t total_group_by_buffer_size{0};
1182  const auto col_slot_context = query_mem_desc.getColSlotContext();
1183 
1184  std::vector<size_t> col_byte_offsets;
1185  col_byte_offsets.reserve(num_columns);
1186 
1187  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
1188  const size_t col_width = col_slot_context.getSlotInfo(col_idx).logical_size;
1189  size_t group_buffer_size = num_rows_ * col_width;
1190  col_byte_offsets.emplace_back(total_group_by_buffer_size);
1191  total_group_by_buffer_size =
1192  align_to_int64(total_group_by_buffer_size + group_buffer_size);
1193  }
1194 
1195  int8_t* dev_buffers_allocation{nullptr};
1196  dev_buffers_allocation = device_allocator_->alloc(total_group_by_buffer_size);
1197  CHECK(dev_buffers_allocation);
1198  if (zero_initialize_buffers) {
1199  device_allocator_->zeroDeviceMem(dev_buffers_allocation, total_group_by_buffer_size);
1200  }
1201 
1202  auto dev_buffers_mem = dev_buffers_allocation;
1203  std::vector<int8_t*> dev_buffers(num_columns);
1204  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
1205  dev_buffers[col_idx] = dev_buffers_allocation + col_byte_offsets[col_idx];
1206  }
1207  auto dev_ptrs = device_allocator_->alloc(num_columns * sizeof(CUdeviceptr));
1208  device_allocator_->copyToDevice(
1209  dev_ptrs, dev_buffers.data(), num_columns * sizeof(CUdeviceptr));
1210 
1211  return {dev_ptrs, dev_buffers_mem, (size_t)num_rows_};
1212 }
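A standalone sketch of the offset computation above: each column's run of num_rows values starts where the previous one ended, rounded up to an 8-byte boundary, so every column buffer within the single device allocation is int64-aligned (align_to_int64_sketch is a local stand-in for the engine's align_to_int64()).

#include <cstddef>
#include <vector>

size_t align_to_int64_sketch(size_t addr) {
  return (addr + 7) & ~size_t(7);  // round up to the next 8-byte boundary
}

int main() {
  const std::vector<size_t> col_widths{4, 8, 1, 2};  // logical slot sizes
  const size_t num_rows = 1000;

  size_t total = 0;
  std::vector<size_t> col_byte_offsets;
  for (const size_t w : col_widths) {
    col_byte_offsets.push_back(total);
    total = align_to_int64_sketch(total + num_rows * w);
  }
  // `total` is the size of the one device allocation; each column's base
  // pointer is the allocation start plus col_byte_offsets[i].
  return 0;
}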

Friends And Related Function Documentation

friend class Executor
friend

Definition at line 273 of file QueryMemoryInitializer.h.

friend class QueryExecutionContext
friend

Definition at line 274 of file QueryMemoryInitializer.h.

Member Data Documentation

CUdeviceptr QueryMemoryInitializer::count_distinct_bitmap_device_mem_ptr_
private
int8_t* QueryMemoryInitializer::count_distinct_bitmap_host_crt_ptr_
private
int8_t* QueryMemoryInitializer::count_distinct_bitmap_host_mem_ptr_
private
size_t QueryMemoryInitializer::count_distinct_bitmap_mem_size_
private
DeviceAllocator* QueryMemoryInitializer::device_allocator_ {nullptr}
private
std::vector<int64_t> QueryMemoryInitializer::init_agg_vals_
private
size_t QueryMemoryInitializer::num_buffers_
private
const int64_t QueryMemoryInitializer::num_rows_
private
std::vector<std::unique_ptr<ResultSet> > QueryMemoryInitializer::result_sets_
private
std::vector<Data_Namespace::AbstractBuffer*> QueryMemoryInitializer::temporary_buffers_
private

Definition at line 269 of file QueryMemoryInitializer.h.

CUdeviceptr QueryMemoryInitializer::varlen_output_buffer_
private
int8_t* QueryMemoryInitializer::varlen_output_buffer_host_ptr_
private
std::shared_ptr<VarlenOutputInfo> QueryMemoryInitializer::varlen_output_info_
private

The documentation for this class was generated from the following files: