OmniSciDB  ba1bac9284
QueryMemoryInitializer Class Reference

#include <QueryMemoryInitializer.h>


Public Member Functions

 QueryMemoryInitializer (const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int device_id, const ExecutorDeviceType device_type, const ExecutorDispatchMode dispatch_mode, const bool output_columnar, const bool sort_on_gpu, const int64_t num_rows, const std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, RenderAllocatorMap *render_allocator_map, RenderInfo *render_info, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, DeviceAllocator *gpu_allocator, const size_t thread_idx, const Executor *executor)
 
 QueryMemoryInitializer (const TableFunctionExecutionUnit &exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int device_id, const ExecutorDeviceType device_type, const int64_t num_rows, const std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, DeviceAllocator *device_allocator, const Executor *executor)
 
const auto getCountDistinctBitmapPtr () const
 
const auto getCountDistinctHostPtr () const
 
const auto getCountDistinctBitmapBytes () const
 
ResultSet * getResultSet (const size_t index) const
 
std::unique_ptr< ResultSet > getResultSetOwned (const size_t index)
 
void resetResultSet (const size_t index)
 
int64_t getAggInitValForIndex (const size_t index) const
 
const auto getGroupByBuffersPtr ()
 
const auto getGroupByBuffersSize () const
 
const auto getNumBuffers () const
 
void copyGroupByBuffersFromGpu (Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const size_t entry_count, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit *ra_exe_unit, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer) const
 

Private Member Functions

void initGroupByBuffer (int64_t *buffer, const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const bool output_columnar, const Executor *executor)
 
void initRowGroups (const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const int32_t groups_buffer_entry_count, const size_t warp_size, const Executor *executor)
 
void initColumnarGroups (const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const Executor *executor)
 
void initColumnsPerRow (const QueryMemoryDescriptor &query_mem_desc, int8_t *row_ptr, const std::vector< int64_t > &init_vals, const std::vector< int64_t > &bitmap_sizes, const std::vector< bool > &tdigest_deferred)
 
void allocateCountDistinctGpuMem (const QueryMemoryDescriptor &query_mem_desc)
 
std::vector< int64_t > allocateCountDistinctBuffers (const QueryMemoryDescriptor &query_mem_desc, const bool deferred, const Executor *executor)
 
int64_t allocateCountDistinctBitmap (const size_t bitmap_byte_sz)
 
int64_t allocateCountDistinctSet ()
 
std::vector< bool > allocateTDigests (const QueryMemoryDescriptor &query_mem_desc, const bool deferred, const Executor *executor)
 
size_t computeNumberOfBuffers (const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const Executor *executor) const
 
void compactProjectionBuffersCpu (const QueryMemoryDescriptor &query_mem_desc, const size_t projection_count)
 
void compactProjectionBuffersGpu (const QueryMemoryDescriptor &query_mem_desc, Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_group_by_buffers, const size_t projection_count, const int device_id)
 
void applyStreamingTopNOffsetCpu (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
 
void applyStreamingTopNOffsetGpu (Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit &ra_exe_unit, const unsigned total_thread_count, const int device_id)
 

Private Attributes

const int64_t num_rows_
 
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
 
std::vector< std::unique_ptr< ResultSet > > result_sets_
 
std::vector< int64_t > init_agg_vals_
 
const size_t num_buffers_
 
std::vector< int64_t * > group_by_buffers_
 
CUdeviceptr count_distinct_bitmap_mem_
 
size_t count_distinct_bitmap_mem_bytes_
 
int8_t * count_distinct_bitmap_crt_ptr_
 
int8_t * count_distinct_bitmap_host_mem_
 
DeviceAllocator * device_allocator_ {nullptr}
 
std::vector< Data_Namespace::AbstractBuffer * > temporary_buffers_
 
const size_t thread_idx_
 

Friends

class Executor
 
class QueryExecutionContext
 

Detailed Description

Definition at line 35 of file QueryMemoryInitializer.h.

Constructor & Destructor Documentation

QueryMemoryInitializer::QueryMemoryInitializer ( const RelAlgExecutionUnit &  ra_exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const int  device_id,
const ExecutorDeviceType  device_type,
const ExecutorDispatchMode  dispatch_mode,
const bool  output_columnar,
const bool  sort_on_gpu,
const int64_t  num_rows,
const std::vector< std::vector< const int8_t * >> &  col_buffers,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
RenderAllocatorMap *  render_allocator_map,
RenderInfo *  render_info,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
DeviceAllocator *  gpu_allocator,
const size_t  thread_idx,
const Executor *  executor 
)

Definition at line 155 of file QueryMemoryInitializer.cpp.

References anonymous_namespace{QueryMemoryInitializer.cpp}::alloc_group_by_buffer(), allocateCountDistinctBuffers(), allocateCountDistinctGpuMem(), allocateTDigests(), CHECK, CHECK_GE, anonymous_namespace{QueryMemoryInitializer.cpp}::check_total_bitmap_memory(), RelAlgExecutionUnit::estimator, ResultSet::fixupQueryMemoryDescriptor(), g_max_memory_allocation_size, anonymous_namespace{QueryMemoryInitializer.cpp}::get_col_frag_offsets(), anonymous_namespace{QueryMemoryInitializer.cpp}::get_consistent_frags_sizes(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), GPU, group_by_buffers_, QueryMemoryDescriptor::hasKeylessHash(), initGroupByBuffer(), QueryMemoryDescriptor::interleavedBins(), QueryMemoryDescriptor::isGroupBy(), KernelPerFragment, QueryMemoryDescriptor::lazyInitGroups(), num_buffers_, result_sets_, row_set_mem_owner_, RelAlgExecutionUnit::target_exprs, target_exprs_to_infos(), thread_idx_, QueryMemoryDescriptor::threadsShareMemory(), RelAlgExecutionUnit::use_bump_allocator, and RenderInfo::useCudaBuffers().

    : num_rows_(num_rows)
    , row_set_mem_owner_(row_set_mem_owner)
    , init_agg_vals_(executor->plan_state_->init_agg_vals_)
    , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
    , count_distinct_bitmap_mem_(0)
    , count_distinct_bitmap_mem_bytes_(0)
    , count_distinct_bitmap_crt_ptr_(nullptr)
    , count_distinct_bitmap_host_mem_(nullptr)
    , device_allocator_(device_allocator)
    , thread_idx_(thread_idx) {
  CHECK(!sort_on_gpu || output_columnar);

  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
  if (consistent_frag_sizes.empty()) {
    // No fragments in the input, no underlying buffers will be needed.
    return;
  }
  if (!ra_exe_unit.use_bump_allocator) {
    check_total_bitmap_memory(query_mem_desc);
  }
  if (device_type == ExecutorDeviceType::GPU) {
    allocateCountDistinctGpuMem(query_mem_desc);
  }

  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
    allocateCountDistinctBuffers(query_mem_desc, false, executor);
    allocateTDigests(query_mem_desc, false, executor);
    if (render_info && render_info->useCudaBuffers()) {
      return;
    }
  }

  if (ra_exe_unit.estimator) {
    return;
  }

  const auto thread_count = device_type == ExecutorDeviceType::GPU
                                ? executor->blockSize() * executor->gridSize()
                                : 1;

  size_t group_buffer_size{0};
  if (ra_exe_unit.use_bump_allocator) {
    // For kernel per fragment execution, just allocate a buffer equivalent to the size
    // of the fragment
    if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
      group_buffer_size = num_rows * query_mem_desc.getRowSize();
    } else {
      // otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size
      group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
    }
  } else {
    group_buffer_size =
        query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
  }
  CHECK_GE(group_buffer_size, size_t(0));

  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
  int64_t* group_by_buffer_template{nullptr};
  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
    group_by_buffer_template = reinterpret_cast<int64_t*>(
        row_set_mem_owner_->allocate(group_buffer_size, thread_idx_));
    initGroupByBuffer(group_by_buffer_template,
                      ra_exe_unit,
                      query_mem_desc,
                      device_type,
                      output_columnar,
                      executor);
  }

  if (query_mem_desc.interleavedBins(device_type)) {
    CHECK(query_mem_desc.hasKeylessHash());
  }

  const auto step = device_type == ExecutorDeviceType::GPU &&
                            query_mem_desc.threadsShareMemory() &&
                            query_mem_desc.isGroupBy()
                        ? executor->blockSize()
                        : size_t(1);
  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
                                       query_mem_desc.hasKeylessHash()
                                   ? query_mem_desc.getEntryCount()
                                   : size_t(0);
  const auto actual_group_buffer_size =
      group_buffer_size + index_buffer_qw * sizeof(int64_t);
  CHECK_GE(actual_group_buffer_size, group_buffer_size);

  for (size_t i = 0; i < group_buffers_count; i += step) {
    auto group_by_buffer = alloc_group_by_buffer(actual_group_buffer_size,
                                                 render_allocator_map,
                                                 thread_idx_,
                                                 row_set_mem_owner_.get());
    if (!query_mem_desc.lazyInitGroups(device_type)) {
      if (group_by_buffer_template) {
        memcpy(group_by_buffer + index_buffer_qw,
               group_by_buffer_template,
               group_buffer_size);
      } else {
        initGroupByBuffer(group_by_buffer + index_buffer_qw,
                          ra_exe_unit,
                          query_mem_desc,
                          device_type,
                          output_columnar,
                          executor);
      }
    }
    group_by_buffers_.push_back(group_by_buffer);
    for (size_t j = 1; j < step; ++j) {
      group_by_buffers_.push_back(nullptr);
    }
    const auto column_frag_offsets =
        get_col_frag_offsets(ra_exe_unit.target_exprs, frag_offsets);
    const auto column_frag_sizes =
        get_consistent_frags_sizes(ra_exe_unit.target_exprs, consistent_frag_sizes);
    result_sets_.emplace_back(
        new ResultSet(target_exprs_to_infos(ra_exe_unit.target_exprs, query_mem_desc),
                      executor->getColLazyFetchInfo(ra_exe_unit.target_exprs),
                      col_buffers,
                      column_frag_offsets,
                      column_frag_sizes,
                      device_type,
                      device_id,
                      ResultSet::fixupQueryMemoryDescriptor(query_mem_desc),
                      row_set_mem_owner_,
                      executor->getCatalog(),
                      executor->blockSize(),
                      executor->gridSize()));
    result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
                                         executor->plan_state_->init_agg_vals_);
    for (size_t j = 1; j < step; ++j) {
      result_sets_.emplace_back(nullptr);
    }
  }
}

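On GPU, when threadsShareMemory() holds for a group-by query, the allocation loop above advances by executor->blockSize() and pads the skipped slots with nullptr, so group_by_buffers_ keeps one entry per GPU thread while only one physical buffer per block exists. A minimal sketch of that bookkeeping, with hypothetical sizes (the real values come from the Executor and the QueryMemoryDescriptor at runtime):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    int main() {
      // Hypothetical configuration: 8 threads per block, 16 blocks, and
      // blocks that do not share memory, so computeNumberOfBuffers() == 128.
      const size_t block_size = 8;
      const size_t grid_size = 16;
      const size_t num_buffers = block_size * grid_size;  // 128
      const size_t step = block_size;  // GPU && threadsShareMemory() && isGroupBy()

      static int64_t dummy_buffer[1];  // stands in for alloc_group_by_buffer()
      std::vector<int64_t*> group_by_buffers;
      for (size_t i = 0; i < num_buffers; i += step) {
        group_by_buffers.push_back(dummy_buffer);  // one real buffer per block
        for (size_t j = 1; j < step; ++j) {
          group_by_buffers.push_back(nullptr);     // placeholders for block peers
        }
      }
      // group_by_buffers.size() == 128, but only 16 physical allocations.
      return group_by_buffers.size() == num_buffers ? 0 : 1;
    }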

QueryMemoryInitializer::QueryMemoryInitializer ( const TableFunctionExecutionUnit &  exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const int  device_id,
const ExecutorDeviceType  device_type,
const int64_t  num_rows,
const std::vector< std::vector< const int8_t * >> &  col_buffers,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
DeviceAllocator *  device_allocator,
const Executor *  executor 
)

Definition at line 307 of file QueryMemoryInitializer.cpp.

    : num_rows_(num_rows)
    , row_set_mem_owner_(row_set_mem_owner)

Member Function Documentation

int64_t QueryMemoryInitializer::allocateCountDistinctBitmap ( const size_t  bitmap_byte_sz)
private

Definition at line 697 of file QueryMemoryInitializer.cpp.

References CHECK, count_distinct_bitmap_crt_ptr_, count_distinct_bitmap_host_mem_, row_set_mem_owner_, and thread_idx_.

Referenced by allocateCountDistinctBuffers(), and initColumnsPerRow().

{
  if (count_distinct_bitmap_host_mem_) {
    CHECK(count_distinct_bitmap_crt_ptr_);
    auto ptr = count_distinct_bitmap_crt_ptr_;
    count_distinct_bitmap_crt_ptr_ += bitmap_byte_sz;
    row_set_mem_owner_->addCountDistinctBuffer(
        ptr, bitmap_byte_sz, /*physical_buffer=*/false);
    return reinterpret_cast<int64_t>(ptr);
  }
  return reinterpret_cast<int64_t>(
      row_set_mem_owner_->allocateCountDistinctBuffer(bitmap_byte_sz, thread_idx_));
}

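When count_distinct_bitmap_host_mem_ is set (see allocateCountDistinctGpuMem()), the function above is a plain bump allocation out of one pre-reserved, pre-zeroed host slab: hand out the current pointer, then advance it by the bitmap size. A self-contained sketch of that pattern, with illustrative names rather than the engine's types:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Toy bump allocator mirroring how count_distinct_bitmap_crt_ptr_
    // walks through the slab one bitmap_byte_sz at a time.
    class BitmapSlab {
     public:
      explicit BitmapSlab(const size_t total_bytes)
          : storage_(total_bytes, 0), crt_ptr_(storage_.data()) {}

      int8_t* allocateBitmap(const size_t bitmap_byte_sz) {
        int8_t* ptr = crt_ptr_;
        crt_ptr_ += bitmap_byte_sz;  // no per-bitmap metadata, just advance
        return ptr;
      }

     private:
      std::vector<int8_t> storage_;  // pre-zeroed backing memory
      int8_t* crt_ptr_;              // next free byte
    };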

std::vector< int64_t > QueryMemoryInitializer::allocateCountDistinctBuffers ( const QueryMemoryDescriptor &  query_mem_desc,
const bool  deferred,
const Executor *  executor 
)
private

Definition at line 651 of file QueryMemoryInitializer.cpp.

References allocateCountDistinctBitmap(), allocateCountDistinctSet(), Bitmap, CHECK, CHECK_EQ, CHECK_GE, CHECK_LT, g_bigint_count, get_target_info(), QueryMemoryDescriptor::getCountDistinctDescriptor(), QueryMemoryDescriptor::getLogicalSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), init_agg_vals_, Invalid, is_distinct_target(), kAPPROX_COUNT_DISTINCT, kCOUNT, and StdSet.

Referenced by initRowGroups(), and QueryMemoryInitializer().

{
  const size_t agg_col_count{query_mem_desc.getSlotCount()};
  std::vector<int64_t> agg_bitmap_size(deferred ? agg_col_count : 0);

  CHECK_GE(agg_col_count, executor->plan_state_->target_exprs_.size());
  for (size_t target_idx = 0; target_idx < executor->plan_state_->target_exprs_.size();
       ++target_idx) {
    const auto target_expr = executor->plan_state_->target_exprs_[target_idx];
    const auto agg_info = get_target_info(target_expr, g_bigint_count);
    if (is_distinct_target(agg_info)) {
      CHECK(agg_info.is_agg &&
            (agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT));
      CHECK(!agg_info.sql_type.is_varlen());

      const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
      CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);

      CHECK_EQ(static_cast<size_t>(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)),
               sizeof(int64_t));
      const auto& count_distinct_desc =
          query_mem_desc.getCountDistinctDescriptor(target_idx);
      CHECK(count_distinct_desc.impl_type_ != CountDistinctImplType::Invalid);
      if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
        const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
        if (deferred) {
          agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
        } else {
          init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
        }
      } else {
        CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::StdSet);
        if (deferred) {
          agg_bitmap_size[agg_col_idx] = -1;
        } else {
          init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
        }
      }
    }
  }

  return agg_bitmap_size;
}


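The deferred flag selects between the two caller-visible modes. A condensed illustration of the contract (comments only; not engine code):

    // deferred == false (constructor path, non-grouped queries):
    //   allocate each bitmap/set now and patch its address into
    //   init_agg_vals_[slot]; the returned vector is empty.
    //
    // deferred == true (initRowGroups() path):
    //   allocate nothing; return one entry per slot holding the bitmap
    //   size in bytes, or -1 for a CountDistinctImplType::StdSet slot,
    //   so initColumnsPerRow() can allocate per output row later.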

void QueryMemoryInitializer::allocateCountDistinctGpuMem ( const QueryMemoryDescriptor &  query_mem_desc)
private

Definition at line 619 of file QueryMemoryInitializer.cpp.

References Allocator::alloc(), Bitmap, CHECK, count_distinct_bitmap_crt_ptr_, count_distinct_bitmap_host_mem_, count_distinct_bitmap_mem_, count_distinct_bitmap_mem_bytes_, QueryMemoryDescriptor::countDistinctDescriptorsLogicallyEmpty(), device_allocator_, QueryMemoryDescriptor::getCountDistinctDescriptor(), QueryMemoryDescriptor::getCountDistinctDescriptorsSize(), QueryMemoryDescriptor::getEntryCount(), Invalid, row_set_mem_owner_, thread_idx_, and DeviceAllocator::zeroDeviceMem().

Referenced by QueryMemoryInitializer().

{
  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
    return;
  }
  CHECK(device_allocator_);

  size_t total_bytes_per_entry{0};
  const size_t num_count_distinct_descs =
      query_mem_desc.getCountDistinctDescriptorsSize();
  for (size_t i = 0; i < num_count_distinct_descs; i++) {
    const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
    if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
      continue;
    }
    CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
    total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
  }

  count_distinct_bitmap_mem_bytes_ =
      total_bytes_per_entry * query_mem_desc.getEntryCount();
  count_distinct_bitmap_mem_ = reinterpret_cast<CUdeviceptr>(
      device_allocator_->alloc(count_distinct_bitmap_mem_bytes_));
  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(count_distinct_bitmap_mem_),
                                   count_distinct_bitmap_mem_bytes_);

  count_distinct_bitmap_host_mem_ = row_set_mem_owner_->allocateCountDistinctBuffer(
      count_distinct_bitmap_mem_bytes_, thread_idx_);
  count_distinct_bitmap_crt_ptr_ = count_distinct_bitmap_host_mem_;
}



int64_t QueryMemoryInitializer::allocateCountDistinctSet ( )
private

Definition at line 710 of file QueryMemoryInitializer.cpp.

References row_set_mem_owner_.

Referenced by allocateCountDistinctBuffers(), and initColumnsPerRow().

{
  auto count_distinct_set = new std::set<int64_t>();
  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
  return reinterpret_cast<int64_t>(count_distinct_set);
}


std::vector< bool > QueryMemoryInitializer::allocateTDigests ( const QueryMemoryDescriptor &  query_mem_desc,
const bool  deferred,
const Executor *  executor 
)
private

Definition at line 716 of file QueryMemoryInitializer.cpp.

References CHECK_EQ, CHECK_GE, CHECK_LT, QueryMemoryDescriptor::getLogicalSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), init_agg_vals_, kAPPROX_MEDIAN, and row_set_mem_owner_.

Referenced by initRowGroups(), and QueryMemoryInitializer().

{
  size_t const slot_count = query_mem_desc.getSlotCount();
  size_t const ntargets = executor->plan_state_->target_exprs_.size();
  CHECK_GE(slot_count, ntargets);
  std::vector<bool> tdigest_deferred(deferred ? slot_count : 0);

  for (size_t target_idx = 0; target_idx < ntargets; ++target_idx) {
    auto const target_expr = executor->plan_state_->target_exprs_[target_idx];
    if (auto const agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr)) {
      if (agg_expr->get_aggtype() == kAPPROX_MEDIAN) {
        size_t const agg_col_idx =
            query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
        CHECK_LT(agg_col_idx, slot_count);
        CHECK_EQ(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx),
                 static_cast<int8_t>(sizeof(int64_t)));
        if (deferred) {
          tdigest_deferred[agg_col_idx] = true;
        } else {
          // allocate for APPROX_MEDIAN only when slot is used
          init_agg_vals_[agg_col_idx] =
              reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest());
        }
      }
    }
  }
  return tdigest_deferred;
}



void QueryMemoryInitializer::applyStreamingTopNOffsetCpu ( const QueryMemoryDescriptor &  query_mem_desc,
const RelAlgExecutionUnit &  ra_exe_unit 
)
private

Definition at line 1083 of file QueryMemoryInitializer.cpp.

References CHECK_EQ, CPU, streaming_top_n::get_rows_copy_from_heaps(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), group_by_buffers_, SortInfo::limit, SortInfo::offset, and RelAlgExecutionUnit::sort_info.

{
  CHECK_EQ(group_by_buffers_.size(), size_t(1));

  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
      group_by_buffers_[0],
      query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
      ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit,
      1);
  CHECK_EQ(rows_copy.size(),
           query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
}


void QueryMemoryInitializer::applyStreamingTopNOffsetGpu ( Data_Namespace::DataMgr *  data_mgr,
const QueryMemoryDescriptor &  query_mem_desc,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const RelAlgExecutionUnit &  ra_exe_unit,
const unsigned  total_thread_count,
const int  device_id 
)
private

Definition at line 1098 of file QueryMemoryInitializer.cpp.

References CHECK_EQ, QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), group_by_buffers_, num_buffers_, GpuGroupByBuffers::second, and UNREACHABLE.

{
#ifdef HAVE_CUDA
  CHECK_EQ(group_by_buffers_.size(), num_buffers_);

  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
      data_mgr,
      reinterpret_cast<int64_t*>(gpu_group_by_buffers.second),
      ra_exe_unit,
      query_mem_desc,
      total_thread_count,
      device_id);
  CHECK_EQ(
      rows_copy.size(),
      static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
  memcpy(group_by_buffers_[0], &rows_copy[0], rows_copy.size());
#else
  UNREACHABLE();
#endif
}


void QueryMemoryInitializer::compactProjectionBuffersCpu ( const QueryMemoryDescriptor &  query_mem_desc,
const size_t  projection_count 
)
private

Definition at line 1010 of file QueryMemoryInitializer.cpp.

References CHECK, anonymous_namespace{QueryMemoryInitializer.cpp}::compact_projection_buffer_for_cpu_columnar(), QueryMemoryDescriptor::getEntryCount(), group_by_buffers_, and result_sets_.

{
  const auto num_allocated_rows =
      std::min(projection_count, query_mem_desc.getEntryCount());

  // copy the results from the main buffer into projection_buffer
  compact_projection_buffer_for_cpu_columnar(
      query_mem_desc,
      reinterpret_cast<int8_t*>(group_by_buffers_[0]),
      num_allocated_rows);

  // update the entry count for the result set, and its underlying storage
  CHECK(!result_sets_.empty());
  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
}


void QueryMemoryInitializer::compactProjectionBuffersGpu ( const QueryMemoryDescriptor &  query_mem_desc,
Data_Namespace::DataMgr *  data_mgr,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const size_t  projection_count,
const int  device_id 
)
private

Definition at line 1027 of file QueryMemoryInitializer.cpp.

References CHECK, copy_projection_buffer_from_gpu_columnar(), QueryMemoryDescriptor::getEntryCount(), group_by_buffers_, and result_sets_.

{
  // store total number of allocated rows:
  const auto num_allocated_rows =
      std::min(projection_count, query_mem_desc.getEntryCount());

  // copy the results from the main buffer into projection_buffer
  copy_projection_buffer_from_gpu_columnar(
      data_mgr,
      gpu_group_by_buffers,
      query_mem_desc,
      reinterpret_cast<int8_t*>(group_by_buffers_[0]),
      num_allocated_rows,
      device_id);

  // update the entry count for the result set, and its underlying storage
  CHECK(!result_sets_.empty());
  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
}


size_t QueryMemoryInitializer::computeNumberOfBuffers ( const QueryMemoryDescriptor &  query_mem_desc,
const ExecutorDeviceType  device_type,
const Executor *  executor 
) const
private

Definition at line 966 of file QueryMemoryInitializer.cpp.

References QueryMemoryDescriptor::blocksShareMemory(), and CPU.

{
  return device_type == ExecutorDeviceType::CPU
             ? 1
             : executor->blockSize() *
                   (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
}

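Spelled out, the three cases, with a hypothetical executor of blockSize() == 8 and gridSize() == 16:

    // device_type == CPU                         -> 1
    // device_type == GPU,  blocksShareMemory()   -> blockSize() * 1          ==   8
    // device_type == GPU, !blocksShareMemory()   -> blockSize() * gridSize() == 128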

void QueryMemoryInitializer::copyGroupByBuffersFromGpu ( Data_Namespace::DataMgr *  data_mgr,
const QueryMemoryDescriptor &  query_mem_desc,
const size_t  entry_count,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const RelAlgExecutionUnit *  ra_exe_unit,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const int  device_id,
const bool  prepend_index_buffer 
) const

Definition at line 1051 of file QueryMemoryInitializer.cpp.

References copy_group_by_buffers_from_gpu(), streaming_top_n::get_heap_size(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getRowSize(), GPU, group_by_buffers_, SortInfo::limit, SortInfo::offset, GpuGroupByBuffers::second, RelAlgExecutionUnit::sort_info, and QueryMemoryDescriptor::useStreamingTopN().

{
  const auto thread_count = block_size_x * grid_size_x;

  size_t total_buff_size{0};
  if (ra_exe_unit && query_mem_desc.useStreamingTopN()) {
    const size_t n = ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit;
    total_buff_size =
        streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
  } else {
    total_buff_size =
        query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
  }
  copy_group_by_buffers_from_gpu(data_mgr,
                                 group_by_buffers_,
                                 total_buff_size,
                                 gpu_group_by_buffers.second,
                                 query_mem_desc,
                                 block_size_x,
                                 grid_size_x,
                                 device_id,
                                 prepend_index_buffer);
}


int64_t QueryMemoryInitializer::getAggInitValForIndex ( const size_t  index) const
inline

Definition at line 90 of file QueryMemoryInitializer.h.

References CHECK_LT, and init_agg_vals_.

{
  CHECK_LT(index, init_agg_vals_.size());
  return init_agg_vals_[index];
}
const auto QueryMemoryInitializer::getCountDistinctBitmapBytes ( ) const
inline

Definition at line 71 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_mem_bytes_.

{
  return count_distinct_bitmap_mem_bytes_;
}
const auto QueryMemoryInitializer::getCountDistinctBitmapPtr ( ) const
inline

Definition at line 67 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_mem_.

{
  return count_distinct_bitmap_mem_;
}

const auto QueryMemoryInitializer::getCountDistinctHostPtr ( ) const
inline

Definition at line 69 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_host_mem_.

{
  return count_distinct_bitmap_host_mem_;
}

const auto QueryMemoryInitializer::getGroupByBuffersPtr ( )
inline

Definition at line 95 of file QueryMemoryInitializer.h.

References group_by_buffers_.

{
  return reinterpret_cast<int64_t**>(group_by_buffers_.data());
}
const auto QueryMemoryInitializer::getGroupByBuffersSize ( ) const
inline

Definition at line 99 of file QueryMemoryInitializer.h.

References group_by_buffers_.

{ return group_by_buffers_.size(); }
const auto QueryMemoryInitializer::getNumBuffers ( ) const
inline

Definition at line 101 of file QueryMemoryInitializer.h.

References CHECK_EQ, group_by_buffers_, and num_buffers_.

{
  CHECK_EQ(num_buffers_, group_by_buffers_.size());
  return num_buffers_;
}
ResultSet* QueryMemoryInitializer::getResultSet ( const size_t  index) const
inline

Definition at line 75 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

{
  CHECK_LT(index, result_sets_.size());
  return result_sets_[index].get();
}
std::unique_ptr<ResultSet> QueryMemoryInitializer::getResultSetOwned ( const size_t  index)
inline

Definition at line 80 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

{
  CHECK_LT(index, result_sets_.size());
  return std::move(result_sets_[index]);
}
void QueryMemoryInitializer::initColumnarGroups ( const QueryMemoryDescriptor &  query_mem_desc,
int64_t *  groups_buffer,
const std::vector< int64_t > &  init_vals,
const Executor *  executor 
)
private

Definition at line 505 of file QueryMemoryInitializer.cpp.

References align_to_int64(), CHECK, CHECK_LT, EMPTY_KEY_64, g_bigint_count, get_target_info(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getQueryDescriptionType(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::hasKeylessHash(), is_distinct_target(), and Projection.

Referenced by initGroupByBuffer().

{
  CHECK(groups_buffer);
  for (const auto target_expr : executor->plan_state_->target_exprs_) {
    const auto agg_info = get_target_info(target_expr, g_bigint_count);
    CHECK(!is_distinct_target(agg_info));
  }
  const int32_t agg_col_count = query_mem_desc.getSlotCount();
  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);

  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
  if (!query_mem_desc.hasKeylessHash()) {
    const size_t key_count{query_mem_desc.getGroupbyColCount()};
    for (size_t i = 0; i < key_count; ++i) {
      buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                               EMPTY_KEY_64,
                                               groups_buffer_entry_count);
    }
  }

  if (query_mem_desc.getQueryDescriptionType() != QueryDescriptionType::Projection) {
    // initializing all aggregate columns:
    int32_t init_val_idx = 0;
    for (int32_t i = 0; i < agg_col_count; ++i) {
      if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
        CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
        switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
          case 1:
            buffer_ptr = initColumnarBuffer<int8_t>(
                buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
            break;
          case 2:
            buffer_ptr =
                initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
                                            init_vals[init_val_idx++],
                                            groups_buffer_entry_count);
            break;
          case 4:
            buffer_ptr =
                initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
                                            init_vals[init_val_idx++],
                                            groups_buffer_entry_count);
            break;
          case 8:
            buffer_ptr =
                initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                            init_vals[init_val_idx++],
                                            groups_buffer_entry_count);
            break;
          case 0:
            break;
          default:
            CHECK(false);
        }

        buffer_ptr = align_to_int64(buffer_ptr);
      }
    }
  }
}


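initColumnarBuffer<T>() is not reproduced on this page. Judging from its uses above (fill one columnar slot with a constant, then return the advanced write pointer, which the caller re-aligns to an int64_t boundary), a plausible minimal implementation would be:

    #include <cstddef>
    #include <cstdint>

    // Sketch only: fills entry_count entries of one columnar slot with
    // init_val and returns a byte pointer just past the column.
    template <typename T>
    int8_t* initColumnarBuffer(T* buffer_ptr, const T init_val, const size_t entry_count) {
      for (size_t i = 0; i < entry_count; ++i) {
        buffer_ptr[i] = init_val;
      }
      return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
    }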

void QueryMemoryInitializer::initColumnsPerRow ( const QueryMemoryDescriptor &  query_mem_desc,
int8_t *  row_ptr,
const std::vector< int64_t > &  init_vals,
const std::vector< int64_t > &  bitmap_sizes,
const std::vector< bool > &  tdigest_deferred 
)
private

Definition at line 569 of file QueryMemoryInitializer.cpp.

References allocateCountDistinctBitmap(), allocateCountDistinctSet(), CHECK, CHECK_EQ, CHECK_LT, QueryMemoryDescriptor::getNextColOffInBytesRowOnly(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::isGroupBy(), and row_set_mem_owner_.

Referenced by initRowGroups().

{
  int8_t* col_ptr = row_ptr;
  size_t init_vec_idx = 0;
  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
       col_ptr += query_mem_desc.getNextColOffInBytesRowOnly(col_ptr, col_idx++)) {
    const int64_t bm_sz{bitmap_sizes[col_idx]};
    int64_t init_val{0};
    if (bm_sz && query_mem_desc.isGroupBy()) {
      // COUNT DISTINCT / APPROX_COUNT_DISTINCT
      CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
               sizeof(int64_t));
      init_val =
          bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
      ++init_vec_idx;
    } else if (query_mem_desc.isGroupBy() && tdigest_deferred[col_idx]) {
      // allocate for APPROX_MEDIAN only when slot is used
      init_val = reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest());
      ++init_vec_idx;
    } else {
      if (query_mem_desc.getPaddedSlotWidthBytes(col_idx) > 0) {
        CHECK_LT(init_vec_idx, init_vals.size());
        init_val = init_vals[init_vec_idx++];
      }
    }
    switch (query_mem_desc.getPaddedSlotWidthBytes(col_idx)) {
      case 1:
        *col_ptr = static_cast<int8_t>(init_val);
        break;
      case 2:
        *reinterpret_cast<int16_t*>(col_ptr) = static_cast<int16_t>(init_val);
        break;
      case 4:
        *reinterpret_cast<int32_t*>(col_ptr) = static_cast<int32_t>(init_val);
        break;
      case 8:
        *reinterpret_cast<int64_t*>(col_ptr) = init_val;
        break;
      case 0:
        continue;
      default:
        CHECK(false);
    }
  }
}



void QueryMemoryInitializer::initGroupByBuffer ( int64_t *  buffer,
const RelAlgExecutionUnit &  ra_exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const ExecutorDeviceType  device_type,
const bool  output_columnar,
const Executor *  executor 
)
private

Definition at line 378 of file QueryMemoryInitializer.cpp.

References streaming_top_n::get_rows_offset_of_heaps(), QueryMemoryDescriptor::getEntryCount(), GPU, init_agg_vals_, initColumnarGroups(), initRowGroups(), QueryMemoryDescriptor::interleavedBins(), SortInfo::limit, SortInfo::offset, RelAlgExecutionUnit::sort_info, and QueryMemoryDescriptor::useStreamingTopN().

Referenced by QueryMemoryInitializer().

{
  if (output_columnar) {
    initColumnarGroups(query_mem_desc, buffer, init_agg_vals_, executor);
  } else {
    auto rows_ptr = buffer;
    auto actual_entry_count = query_mem_desc.getEntryCount();
    const auto thread_count = device_type == ExecutorDeviceType::GPU
                                  ? executor->blockSize() * executor->gridSize()
                                  : 1;
    auto warp_size =
        query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
    if (query_mem_desc.useStreamingTopN()) {
      const auto node_count_size = thread_count * sizeof(int64_t);
      memset(rows_ptr, 0, node_count_size);
      const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
      const auto rows_offset = streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
      memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
      rows_ptr += rows_offset / sizeof(int64_t);
      actual_entry_count = n * thread_count;
      warp_size = 1;
    }
    initRowGroups(query_mem_desc,
                  rows_ptr,
                  init_agg_vals_,
                  actual_entry_count,
                  warp_size,
                  executor);
  }
}


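In the streaming top-N branch above, the two memsets imply the following head layout of the row-wise buffer (offsets in bytes; n = sort_info.offset + sort_info.limit):

    // [0, thread_count * 8)            per-thread node counts, zero-initialized
    // [thread_count * 8, rows_offset)  heap index slots, filled with -1 (empty)
    // [rows_offset, ...)               row storage; initRowGroups() then
    //                                  initializes n * thread_count entries
    // where rows_offset = streaming_top_n::get_rows_offset_of_heaps(n, thread_count)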

void QueryMemoryInitializer::initRowGroups ( const QueryMemoryDescriptor &  query_mem_desc,
int64_t *  groups_buffer,
const std::vector< int64_t > &  init_vals,
const int32_t  groups_buffer_entry_count,
const size_t  warp_size,
const Executor *  executor 
)
private

Definition at line 414 of file QueryMemoryInitializer.cpp.

References gpu_enabled::accumulate(), allocateCountDistinctBuffers(), allocateTDigests(), CHECK, result_set::fill_empty_key(), ResultSet::fixupQueryMemoryDescriptor(), g_optimize_row_initialization, QueryMemoryDescriptor::getColOffInBytes(), QueryMemoryDescriptor::getEffectiveKeyWidth(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getRowSize(), QueryMemoryDescriptor::hasKeylessHash(), and initColumnsPerRow().

Referenced by initGroupByBuffer().

{
  const size_t key_count{query_mem_desc.getGroupbyColCount()};
  const size_t row_size{query_mem_desc.getRowSize()};
  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};

  auto agg_bitmap_size = allocateCountDistinctBuffers(query_mem_desc, true, executor);
  auto tdigest_deferred = allocateTDigests(query_mem_desc, true, executor);
  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);

  const auto query_mem_desc_fixedup =
      ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);

  // not COUNT DISTINCT / APPROX_COUNT_DISTINCT / APPROX_MEDIAN
  // we fall back to the default implementation in those cases
  if (!std::accumulate(agg_bitmap_size.begin(), agg_bitmap_size.end(), 0) &&
      !std::accumulate(tdigest_deferred.begin(), tdigest_deferred.end(), 0) &&
      g_optimize_row_initialization) {
    std::vector<int8_t> sample_row(row_size - col_base_off);

    initColumnsPerRow(query_mem_desc_fixedup,
                      sample_row.data(),
                      init_vals,
                      agg_bitmap_size,
                      tdigest_deferred);

    if (query_mem_desc.hasKeylessHash()) {
      CHECK(warp_size >= 1);
      CHECK(key_count == 1 || warp_size == 1);
      for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
        for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
             ++bin, buffer_ptr += row_size) {
          memcpy(buffer_ptr + col_base_off, sample_row.data(), sample_row.size());
        }
      }
      return;
    }

    for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
         ++bin, buffer_ptr += row_size) {
      memcpy(buffer_ptr + col_base_off, sample_row.data(), sample_row.size());
      result_set::fill_empty_key(
          buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
    }
  } else {
    if (query_mem_desc.hasKeylessHash()) {
      CHECK(warp_size >= 1);
      CHECK(key_count == 1 || warp_size == 1);
      for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
        for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
             ++bin, buffer_ptr += row_size) {
          initColumnsPerRow(query_mem_desc_fixedup,
                            &buffer_ptr[col_base_off],
                            init_vals,
                            agg_bitmap_size,
                            tdigest_deferred);
        }
      }
      return;
    }

    for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
         ++bin, buffer_ptr += row_size) {
      result_set::fill_empty_key(
          buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
      initColumnsPerRow(query_mem_desc_fixedup,
                        &buffer_ptr[col_base_off],
                        init_vals,
                        agg_bitmap_size,
                        tdigest_deferred);
    }
  }
}


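The g_optimize_row_initialization fast path builds one sample row and replicates it with memcpy instead of re-deriving every column for each bin. Reduced to its essentials (a sketch, not the engine's code; key handling omitted):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Initialize one template row, then stamp it across every bin. This is
    // the shape of the optimized branch in initRowGroups(); the slow branch
    // instead calls initColumnsPerRow() once per bin.
    void fill_bins_from_template(int8_t* buffer,
                                 const std::vector<int8_t>& sample_row,
                                 const size_t row_size,
                                 const size_t entry_count) {
      for (size_t bin = 0; bin < entry_count; ++bin, buffer += row_size) {
        std::memcpy(buffer, sample_row.data(), sample_row.size());
      }
    }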

void QueryMemoryInitializer::resetResultSet ( const size_t  index)
inline

Definition at line 85 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

{
  CHECK_LT(index, result_sets_.size());
  result_sets_[index].reset();
}

Friends And Related Function Documentation

friend class Executor
friend

Definition at line 235 of file QueryMemoryInitializer.h.

friend class QueryExecutionContext
friend

Definition at line 236 of file QueryMemoryInitializer.h.

Member Data Documentation

int8_t* QueryMemoryInitializer::count_distinct_bitmap_crt_ptr_
private
int8_t* QueryMemoryInitializer::count_distinct_bitmap_host_mem_
private
CUdeviceptr QueryMemoryInitializer::count_distinct_bitmap_mem_
private
size_t QueryMemoryInitializer::count_distinct_bitmap_mem_bytes_
private
DeviceAllocator* QueryMemoryInitializer::device_allocator_ {nullptr}
private

Definition at line 230 of file QueryMemoryInitializer.h.

Referenced by allocateCountDistinctGpuMem().

std::vector<int64_t> QueryMemoryInitializer::init_agg_vals_
private
const size_t QueryMemoryInitializer::num_buffers_
private
const int64_t QueryMemoryInitializer::num_rows_
private

Definition at line 216 of file QueryMemoryInitializer.h.

std::vector<std::unique_ptr<ResultSet> > QueryMemoryInitializer::result_sets_
private
std::shared_ptr<RowSetMemoryOwner> QueryMemoryInitializer::row_set_mem_owner_
private
std::vector<Data_Namespace::AbstractBuffer*> QueryMemoryInitializer::temporary_buffers_
private

Definition at line 231 of file QueryMemoryInitializer.h.

const size_t QueryMemoryInitializer::thread_idx_
private

The documentation for this class was generated from the following files:

QueryMemoryInitializer.h
QueryMemoryInitializer.cpp