OmniSciDB  16c4e035a1
QueryMemoryInitializer Class Reference

#include <QueryMemoryInitializer.h>


Public Member Functions

 QueryMemoryInitializer (const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int device_id, const ExecutorDeviceType device_type, const ExecutorDispatchMode dispatch_mode, const bool output_columnar, const bool sort_on_gpu, const int outer_table_id, const int64_t num_rows, const std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, RenderAllocatorMap *render_allocator_map, RenderInfo *render_info, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, DeviceAllocator *gpu_allocator, const size_t thread_idx, const Executor *executor)
 
 QueryMemoryInitializer (const TableFunctionExecutionUnit &exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int device_id, const ExecutorDeviceType device_type, const int64_t num_rows, const std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, DeviceAllocator *device_allocator, const Executor *executor)
 
const auto getCountDistinctBitmapPtr () const
 
const auto getCountDistinctHostPtr () const
 
const auto getCountDistinctBitmapBytes () const
 
const auto getVarlenOutputHostPtr () const
 
const auto getVarlenOutputPtr () const
 
ResultSet * getResultSet (const size_t index) const
 
std::unique_ptr< ResultSet > getResultSetOwned (const size_t index)
 
void resetResultSet (const size_t index)
 
int64_t getAggInitValForIndex (const size_t index) const
 
const auto getGroupByBuffersPtr ()
 
const auto getGroupByBuffersSize () const
 
const auto getNumBuffers () const
 
GpuGroupByBuffers setupTableFunctionGpuBuffers (const QueryMemoryDescriptor &query_mem_desc, const int device_id, const unsigned block_size_x, const unsigned grid_size_x)
 
void copyFromTableFunctionGpuBuffers (Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const size_t entry_count, const GpuGroupByBuffers &gpu_group_by_buffers, const int device_id, const unsigned block_size_x, const unsigned grid_size_x)
 
void copyGroupByBuffersFromGpu (DeviceAllocator &device_allocator, const QueryMemoryDescriptor &query_mem_desc, const size_t entry_count, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit *ra_exe_unit, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer) const
 

Private Types

using QuantileParam = std::optional< double >
 

Private Member Functions

void initGroupByBuffer (int64_t *buffer, const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const bool output_columnar, const Executor *executor)
 
void initRowGroups (const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const int32_t groups_buffer_entry_count, const size_t warp_size, const Executor *executor)
 
void initColumnarGroups (const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const Executor *executor)
 
void initColumnsPerRow (const QueryMemoryDescriptor &query_mem_desc, int8_t *row_ptr, const std::vector< int64_t > &init_vals, const std::vector< int64_t > &bitmap_sizes, const std::vector< QuantileParam > &quantile_params)
 
void allocateCountDistinctGpuMem (const QueryMemoryDescriptor &query_mem_desc)
 
std::vector< int64_t > allocateCountDistinctBuffers (const QueryMemoryDescriptor &query_mem_desc, const bool deferred, const Executor *executor)
 
int64_t allocateCountDistinctBitmap (const size_t bitmap_byte_sz)
 
int64_t allocateCountDistinctSet ()
 
std::vector< QuantileParam > allocateTDigests (const QueryMemoryDescriptor &query_mem_desc, const bool deferred, const Executor *executor)
 
GpuGroupByBuffers prepareTopNHeapsDevBuffer (const QueryMemoryDescriptor &query_mem_desc, const int8_t *init_agg_vals_dev_ptr, const size_t n, const int device_id, const unsigned block_size_x, const unsigned grid_size_x)
 
GpuGroupByBuffers createAndInitializeGroupByBufferGpu (const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int8_t *init_agg_vals_dev_ptr, const int device_id, const ExecutorDispatchMode dispatch_mode, const unsigned block_size_x, const unsigned grid_size_x, const int8_t warp_size, const bool can_sort_on_gpu, const bool output_columnar, RenderAllocator *render_allocator)
 
size_t computeNumberOfBuffers (const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const Executor *executor) const
 
void compactProjectionBuffersCpu (const QueryMemoryDescriptor &query_mem_desc, const size_t projection_count)
 
void compactProjectionBuffersGpu (const QueryMemoryDescriptor &query_mem_desc, Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_group_by_buffers, const size_t projection_count, const int device_id)
 
void applyStreamingTopNOffsetCpu (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
 
void applyStreamingTopNOffsetGpu (Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit &ra_exe_unit, const unsigned total_thread_count, const int device_id)
 
std::shared_ptr< VarlenOutputInfo > getVarlenOutputInfo ()
 

Private Attributes

const int64_t num_rows_
 
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
 
std::vector< std::unique_ptr< ResultSet > > result_sets_
 
std::vector< int64_t > init_agg_vals_
 
size_t num_buffers_
 
std::vector< int64_t * > group_by_buffers_
 
std::shared_ptr< VarlenOutputInfo > varlen_output_info_
 
CUdeviceptr varlen_output_buffer_
 
int8_t * varlen_output_buffer_host_ptr_
 
CUdeviceptr count_distinct_bitmap_mem_
 
size_t count_distinct_bitmap_mem_bytes_
 
int8_t * count_distinct_bitmap_crt_ptr_
 
int8_t * count_distinct_bitmap_host_mem_
 
DeviceAllocator * device_allocator_ {nullptr}
 
std::vector< Data_Namespace::AbstractBuffer * > temporary_buffers_
 
const size_t thread_idx_
 

Friends

class Executor
 
class QueryExecutionContext
 

Detailed Description

Definition at line 35 of file QueryMemoryInitializer.h.

Member Typedef Documentation

using QueryMemoryInitializer::QuantileParam = std::optional<double>
private

Definition at line 155 of file QueryMemoryInitializer.h.

Constructor & Destructor Documentation

QueryMemoryInitializer::QueryMemoryInitializer ( const RelAlgExecutionUnit &  ra_exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const int  device_id,
const ExecutorDeviceType  device_type,
const ExecutorDispatchMode  dispatch_mode,
const bool  output_columnar,
const bool  sort_on_gpu,
const int  outer_table_id,
const int64_t  num_rows,
const std::vector< std::vector< const int8_t * >> &  col_buffers,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
RenderAllocatorMap *  render_allocator_map,
RenderInfo *  render_info,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
DeviceAllocator *  gpu_allocator,
const size_t  thread_idx,
const Executor *  executor
)

Definition at line 165 of file QueryMemoryInitializer.cpp.

References anonymous_namespace{QueryMemoryInitializer.cpp}::alloc_group_by_buffer(), allocateCountDistinctBuffers(), allocateCountDistinctGpuMem(), allocateTDigests(), CHECK, CHECK_GE, anonymous_namespace{QueryMemoryInitializer.cpp}::check_total_bitmap_memory(), RelAlgExecutionUnit::estimator, ResultSet::fixupQueryMemoryDescriptor(), g_max_memory_allocation_size, anonymous_namespace{QueryMemoryInitializer.cpp}::get_col_frag_offsets(), anonymous_namespace{QueryMemoryInitializer.cpp}::get_consistent_frags_sizes(), anonymous_namespace{QueryMemoryInitializer.cpp}::get_input_idx(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), getVarlenOutputInfo(), GPU, group_by_buffers_, QueryMemoryDescriptor::hasKeylessHash(), QueryMemoryDescriptor::hasVarlenOutput(), i, initGroupByBuffer(), QueryMemoryDescriptor::interleavedBins(), QueryMemoryDescriptor::isGroupBy(), KernelPerFragment, QueryMemoryDescriptor::lazyInitGroups(), num_buffers_, result_sets_, row_set_mem_owner_, RelAlgExecutionUnit::target_exprs, target_exprs_to_infos(), RelAlgExecutionUnit::target_exprs_union, thread_idx_, QueryMemoryDescriptor::threadsShareMemory(), RelAlgExecutionUnit::union_all, RelAlgExecutionUnit::use_bump_allocator, RenderInfo::useCudaBuffers(), and QueryMemoryDescriptor::varlenOutputBufferElemSize().

  : num_rows_(num_rows)
  , row_set_mem_owner_(row_set_mem_owner)
  , init_agg_vals_(executor->plan_state_->init_agg_vals_)
  , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
  , varlen_output_buffer_(0)
  , varlen_output_buffer_host_ptr_(nullptr)
  , count_distinct_bitmap_mem_(0)
  , count_distinct_bitmap_mem_bytes_(0)
  , count_distinct_bitmap_crt_ptr_(nullptr)
  , count_distinct_bitmap_host_mem_(nullptr)
  , device_allocator_(device_allocator)
  , thread_idx_(thread_idx) {
  CHECK(!sort_on_gpu || output_columnar);

  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
  if (consistent_frag_sizes.empty()) {
    // No fragments in the input, no underlying buffers will be needed.
    return;
  }
  if (!ra_exe_unit.use_bump_allocator) {
    check_total_bitmap_memory(query_mem_desc);
  }
  if (device_type == ExecutorDeviceType::GPU) {
    allocateCountDistinctGpuMem(query_mem_desc);
  }

  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
    allocateCountDistinctBuffers(query_mem_desc, false, executor);
    allocateTDigests(query_mem_desc, false, executor);
    if (render_info && render_info->useCudaBuffers()) {
      return;
    }
  }

  if (ra_exe_unit.estimator) {
    return;
  }

  const auto thread_count = device_type == ExecutorDeviceType::GPU
                                ? executor->blockSize() * executor->gridSize()
                                : 1;

  size_t group_buffer_size{0};
  if (ra_exe_unit.use_bump_allocator) {
    // For kernel-per-fragment execution, just allocate a buffer equivalent to the size
    // of the fragment.
    if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
      group_buffer_size = num_rows * query_mem_desc.getRowSize();
    } else {
      // Otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size.
      group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
    }
  } else {
    group_buffer_size =
        query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
  }
  CHECK_GE(group_buffer_size, size_t(0));

  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
  int64_t* group_by_buffer_template{nullptr};
  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
    group_by_buffer_template = reinterpret_cast<int64_t*>(
        row_set_mem_owner_->allocate(group_buffer_size, thread_idx_));
    initGroupByBuffer(group_by_buffer_template,
                      ra_exe_unit,
                      query_mem_desc,
                      device_type,
                      output_columnar,
                      executor);
  }

  if (query_mem_desc.interleavedBins(device_type)) {
    CHECK(query_mem_desc.hasKeylessHash());
  }

  const auto step = device_type == ExecutorDeviceType::GPU &&
                            query_mem_desc.threadsShareMemory() &&
                            query_mem_desc.isGroupBy()
                        ? executor->blockSize()
                        : size_t(1);
  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
                                       query_mem_desc.hasKeylessHash()
                                   ? query_mem_desc.getEntryCount()
                                   : size_t(0);
  const auto actual_group_buffer_size =
      group_buffer_size + index_buffer_qw * sizeof(int64_t);
  CHECK_GE(actual_group_buffer_size, group_buffer_size);

  if (query_mem_desc.hasVarlenOutput()) {
    const auto varlen_buffer_elem_size_opt = query_mem_desc.varlenOutputBufferElemSize();
    CHECK(varlen_buffer_elem_size_opt);  // TODO(adb): relax
    auto varlen_output_buffer = reinterpret_cast<int64_t*>(row_set_mem_owner_->allocate(
        query_mem_desc.getEntryCount() * varlen_buffer_elem_size_opt.value()));
    num_buffers_ += 1;
    group_by_buffers_.push_back(varlen_output_buffer);
  }

  for (size_t i = 0; i < group_buffers_count; i += step) {
    auto group_by_buffer = alloc_group_by_buffer(actual_group_buffer_size,
                                                 render_allocator_map,
                                                 thread_idx_,
                                                 row_set_mem_owner_.get());
    if (!query_mem_desc.lazyInitGroups(device_type)) {
      if (group_by_buffer_template) {
        memcpy(group_by_buffer + index_buffer_qw,
               group_by_buffer_template,
               group_buffer_size);
      } else {
        initGroupByBuffer(group_by_buffer + index_buffer_qw,
                          ra_exe_unit,
                          query_mem_desc,
                          device_type,
                          output_columnar,
                          executor);
      }
    }
    group_by_buffers_.push_back(group_by_buffer);
    for (size_t j = 1; j < step; ++j) {
      group_by_buffers_.push_back(nullptr);
    }
    const bool use_target_exprs_union =
        ra_exe_unit.union_all && get_input_idx(ra_exe_unit, outer_table_id);
    const auto& target_exprs = use_target_exprs_union ? ra_exe_unit.target_exprs_union
                                                      : ra_exe_unit.target_exprs;
    const auto column_frag_offsets = get_col_frag_offsets(target_exprs, frag_offsets);
    const auto column_frag_sizes =
        get_consistent_frags_sizes(target_exprs, consistent_frag_sizes);

    result_sets_.emplace_back(
        new ResultSet(target_exprs_to_infos(target_exprs, query_mem_desc),
                      executor->getColLazyFetchInfo(target_exprs),
                      col_buffers,
                      column_frag_offsets,
                      column_frag_sizes,
                      device_type,
                      device_id,
                      ResultSet::fixupQueryMemoryDescriptor(query_mem_desc),
                      row_set_mem_owner_,
                      executor->getCatalog(),
                      executor->blockSize(),
                      executor->gridSize()));
    result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
                                         executor->plan_state_->init_agg_vals_,
                                         getVarlenOutputInfo());
    for (size_t j = 1; j < step; ++j) {
      result_sets_.emplace_back(nullptr);
    }
  }
}
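The tail of the constructor shows the buffer bookkeeping: computeNumberOfBuffers() decides how many group-by buffers a device run needs, and the step stride leaves nullptr placeholders so that only one physical buffer per block is allocated when threads share memory. A self-contained sketch of that arithmetic, with hypothetical block/grid sizes standing in for executor->blockSize() and executor->gridSize():

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const std::size_t block_size = 128;  // assumed executor->blockSize()
  const std::size_t grid_size = 16;    // assumed executor->gridSize()
  const bool blocks_share_memory = false;
  const bool threads_share_memory = true;

  // computeNumberOfBuffers(): one buffer on CPU, otherwise per-thread or per-block.
  const std::size_t num_buffers =
      block_size * (blocks_share_memory ? 1 : grid_size);

  // The constructor strides by `step`, so only one buffer per block is physical;
  // the rest are nullptr placeholders.
  const std::size_t step = threads_share_memory ? block_size : 1;

  std::vector<int64_t*> group_by_buffers;
  for (std::size_t i = 0; i < num_buffers; i += step) {
    group_by_buffers.push_back(new int64_t[16]{});  // stand-in allocation
    for (std::size_t j = 1; j < step; ++j) {
      group_by_buffers.push_back(nullptr);
    }
  }
  std::cout << "slots: " << group_by_buffers.size() << "\n";  // 2048
  for (auto* p : group_by_buffers) {
    delete[] p;
  }
}

With these assumed values, 16 physical buffers back 2048 logical slots.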

QueryMemoryInitializer::QueryMemoryInitializer ( const TableFunctionExecutionUnit &  exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const int  device_id,
const ExecutorDeviceType  device_type,
const int64_t  num_rows,
const std::vector< std::vector< const int8_t * >> &  col_buffers,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
DeviceAllocator *  device_allocator,
const Executor *  executor
)

Definition at line 334 of file QueryMemoryInitializer.cpp.

  : num_rows_(num_rows)
  , row_set_mem_owner_(row_set_mem_owner)
  // ...

Member Function Documentation

int64_t QueryMemoryInitializer::allocateCountDistinctBitmap ( const size_t  bitmap_byte_sz)
private

Definition at line 728 of file QueryMemoryInitializer.cpp.

References CHECK, count_distinct_bitmap_crt_ptr_, count_distinct_bitmap_host_mem_, row_set_mem_owner_, and thread_idx_.

Referenced by allocateCountDistinctBuffers(), and initColumnsPerRow().

{
  if (count_distinct_bitmap_host_mem_) {
    CHECK(count_distinct_bitmap_crt_ptr_);
    auto ptr = count_distinct_bitmap_crt_ptr_;
    count_distinct_bitmap_crt_ptr_ += bitmap_byte_sz;
    row_set_mem_owner_->addCountDistinctBuffer(
        ptr, bitmap_byte_sz, /*physical_buffer=*/false);
    return reinterpret_cast<int64_t>(ptr);
  }
  return reinterpret_cast<int64_t>(
      row_set_mem_owner_->allocateCountDistinctBuffer(bitmap_byte_sz, thread_idx_));
}
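The return value is a type-erased handle: the bitmap pointer is reinterpreted as an int64_t so it can be stored in an init_agg_vals_ slot and later cast back by the aggregate runtime. A self-contained sketch of that round trip, using a plain std::vector as a stand-in bitmap:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int8_t> bitmap(64, 0);  // stand-in for a count-distinct bitmap
  const int64_t handle = reinterpret_cast<int64_t>(bitmap.data());

  // ... later, an aggregate implementation recovers the pointer and sets a bit:
  auto* bits = reinterpret_cast<int8_t*>(handle);
  bits[3] |= 0x1;

  std::cout << static_cast<int>(bitmap[3]) << "\n";  // 1
}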

std::vector< int64_t > QueryMemoryInitializer::allocateCountDistinctBuffers ( const QueryMemoryDescriptor &  query_mem_desc,
const bool  deferred,
const Executor *  executor
)
private

Definition at line 682 of file QueryMemoryInitializer.cpp.

References allocateCountDistinctBitmap(), allocateCountDistinctSet(), Bitmap, CHECK, CHECK_EQ, CHECK_GE, CHECK_LT, g_bigint_count, get_target_info(), QueryMemoryDescriptor::getCountDistinctDescriptor(), QueryMemoryDescriptor::getLogicalSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), init_agg_vals_, Invalid, is_distinct_target(), kAPPROX_COUNT_DISTINCT, kCOUNT, and StdSet.

Referenced by initRowGroups(), and QueryMemoryInitializer().

{
  const size_t agg_col_count{query_mem_desc.getSlotCount()};
  std::vector<int64_t> agg_bitmap_size(deferred ? agg_col_count : 0);

  CHECK_GE(agg_col_count, executor->plan_state_->target_exprs_.size());
  for (size_t target_idx = 0; target_idx < executor->plan_state_->target_exprs_.size();
       ++target_idx) {
    const auto target_expr = executor->plan_state_->target_exprs_[target_idx];
    const auto agg_info = get_target_info(target_expr, g_bigint_count);
    if (is_distinct_target(agg_info)) {
      CHECK(agg_info.is_agg &&
            (agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT));
      CHECK(!agg_info.sql_type.is_varlen());

      const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
      CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);

      CHECK_EQ(static_cast<size_t>(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)),
               sizeof(int64_t));
      const auto& count_distinct_desc =
          query_mem_desc.getCountDistinctDescriptor(target_idx);
      CHECK(count_distinct_desc.impl_type_ != CountDistinctImplType::Invalid);
      if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
        const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
        if (deferred) {
          agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
        } else {
          init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
        }
      } else {
        CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::StdSet);
        if (deferred) {
          agg_bitmap_size[agg_col_idx] = -1;
        } else {
          init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
        }
      }
    }
  }

  return agg_bitmap_size;
}
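When deferred is true the function allocates nothing and instead returns one descriptor per slot: a positive value is the padded bitmap size in bytes, -1 marks a StdSet-based target, and 0 means the slot has no count-distinct aggregate. A small sketch of consuming that encoding (the function name here is illustrative):

#include <cstdint>
#include <iostream>
#include <vector>

void describe_slots(const std::vector<int64_t>& agg_bitmap_size) {
  for (size_t i = 0; i < agg_bitmap_size.size(); ++i) {
    if (agg_bitmap_size[i] > 0) {
      std::cout << "slot " << i << ": bitmap of " << agg_bitmap_size[i] << " bytes\n";
    } else if (agg_bitmap_size[i] == -1) {
      std::cout << "slot " << i << ": std::set based count distinct\n";
    } else {
      std::cout << "slot " << i << ": not a count-distinct slot\n";
    }
  }
}

int main() {
  describe_slots({0, 512, -1});
}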

void QueryMemoryInitializer::allocateCountDistinctGpuMem ( const QueryMemoryDescriptor &  query_mem_desc)
private

Definition at line 650 of file QueryMemoryInitializer.cpp.

References Allocator::alloc(), Bitmap, CHECK, count_distinct_bitmap_crt_ptr_, count_distinct_bitmap_host_mem_, count_distinct_bitmap_mem_, count_distinct_bitmap_mem_bytes_, QueryMemoryDescriptor::countDistinctDescriptorsLogicallyEmpty(), device_allocator_, QueryMemoryDescriptor::getCountDistinctDescriptor(), QueryMemoryDescriptor::getCountDistinctDescriptorsSize(), QueryMemoryDescriptor::getEntryCount(), i, Invalid, row_set_mem_owner_, thread_idx_, and DeviceAllocator::zeroDeviceMem().

Referenced by QueryMemoryInitializer().

{
  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
    return;
  }
  CHECK(device_allocator_);

  size_t total_bytes_per_entry{0};
  const size_t num_count_distinct_descs =
      query_mem_desc.getCountDistinctDescriptorsSize();
  for (size_t i = 0; i < num_count_distinct_descs; i++) {
    const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
    if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
      continue;
    }
    CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
    total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
  }

  count_distinct_bitmap_mem_bytes_ =
      total_bytes_per_entry * query_mem_desc.getEntryCount();
  count_distinct_bitmap_mem_ = reinterpret_cast<CUdeviceptr>(
      device_allocator_->alloc(count_distinct_bitmap_mem_bytes_));
  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(count_distinct_bitmap_mem_),
                                   count_distinct_bitmap_mem_bytes_);

  count_distinct_bitmap_host_mem_ = row_set_mem_owner_->allocateCountDistinctBuffer(
      count_distinct_bitmap_mem_bytes_, thread_idx_);
  count_distinct_bitmap_crt_ptr_ = count_distinct_bitmap_host_mem_;
}
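The device allocation is sized per group-by entry: every bitmap-backed count-distinct target contributes its padded bitmap size, and the sum is scaled by the entry count. A worked sketch of the arithmetic with assumed sizes:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const std::vector<std::size_t> bitmap_padded_bytes = {128, 512};  // two targets
  const std::size_t entry_count = 4096;  // assumed query_mem_desc.getEntryCount()

  std::size_t total_bytes_per_entry = 0;
  for (const auto sz : bitmap_padded_bytes) {
    total_bytes_per_entry += sz;
  }
  std::cout << "device allocation: " << total_bytes_per_entry * entry_count
            << " bytes\n";  // 640 * 4096 = 2621440
}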

int64_t QueryMemoryInitializer::allocateCountDistinctSet ( )
private

Definition at line 741 of file QueryMemoryInitializer.cpp.

References row_set_mem_owner_.

Referenced by allocateCountDistinctBuffers(), and initColumnsPerRow().

{
  auto count_distinct_set = new std::set<int64_t>();
  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
  return reinterpret_cast<int64_t>(count_distinct_set);
}

std::vector< QueryMemoryInitializer::QuantileParam > QueryMemoryInitializer::allocateTDigests ( const QueryMemoryDescriptor &  query_mem_desc,
const bool  deferred,
const Executor *  executor
)
private

Definition at line 748 of file QueryMemoryInitializer.cpp.

References CHECK_EQ, CHECK_GE, CHECK_LT, QueryMemoryDescriptor::getLogicalSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), init_agg_vals_, kAPPROX_QUANTILE, and row_set_mem_owner_.

Referenced by initRowGroups(), and QueryMemoryInitializer().

{
  size_t const slot_count = query_mem_desc.getSlotCount();
  size_t const ntargets = executor->plan_state_->target_exprs_.size();
  CHECK_GE(slot_count, ntargets);
  std::vector<QuantileParam> quantile_params(deferred ? slot_count : 0);

  for (size_t target_idx = 0; target_idx < ntargets; ++target_idx) {
    auto const target_expr = executor->plan_state_->target_exprs_[target_idx];
    if (auto const agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr)) {
      if (agg_expr->get_aggtype() == kAPPROX_QUANTILE) {
        size_t const agg_col_idx =
            query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
        CHECK_LT(agg_col_idx, slot_count);
        CHECK_EQ(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx),
                 static_cast<int8_t>(sizeof(int64_t)));
        auto const q = agg_expr->get_arg1()->get_constval().doubleval;
        if (deferred) {
          quantile_params[agg_col_idx] = q;
        } else {
          // allocate for APPROX_QUANTILE only when slot is used
          init_agg_vals_[agg_col_idx] =
              reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest(q));
        }
      }
    }
  }
  return quantile_params;
}
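QuantileParam is std::optional<double>, so in deferred mode each slot either carries the quantile argument q of an APPROX_QUANTILE target or stays empty. A minimal sketch of that convention:

#include <iostream>
#include <optional>
#include <vector>

using QuantileParam = std::optional<double>;  // mirrors the private typedef above

int main() {
  std::vector<QuantileParam> quantile_params(3);  // deferred: one entry per slot
  quantile_params[1] = 0.5;  // assumed APPROX_QUANTILE(x, 0.5) target in slot 1

  for (size_t i = 0; i < quantile_params.size(); ++i) {
    if (quantile_params[i]) {
      std::cout << "slot " << i << ": t-digest for q=" << *quantile_params[i] << "\n";
    }
  }
}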

void QueryMemoryInitializer::applyStreamingTopNOffsetCpu ( const QueryMemoryDescriptor &  query_mem_desc,
const RelAlgExecutionUnit &  ra_exe_unit
)
private

Definition at line 1140 of file QueryMemoryInitializer.cpp.

References CHECK_EQ, CPU, streaming_top_n::get_rows_copy_from_heaps(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), SortInfo::limit, SortInfo::offset, and RelAlgExecutionUnit::sort_info.

{
  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
  CHECK_EQ(group_by_buffers_.size(), buffer_start_idx + 1);

  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
      group_by_buffers_[buffer_start_idx],
      query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
      ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit,
      1);
  CHECK_EQ(rows_copy.size(),
           query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
}

void QueryMemoryInitializer::applyStreamingTopNOffsetGpu ( Data_Namespace::DataMgr *  data_mgr,
const QueryMemoryDescriptor &  query_mem_desc,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const RelAlgExecutionUnit &  ra_exe_unit,
const unsigned  total_thread_count,
const int  device_id
)
private

Definition at line 1156 of file QueryMemoryInitializer.cpp.

References CHECK_EQ, GpuGroupByBuffers::data, QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), num_buffers_, and UNREACHABLE.

{
#ifdef HAVE_CUDA
  CHECK_EQ(group_by_buffers_.size(), num_buffers_);
  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;

  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
      data_mgr,
      reinterpret_cast<int64_t*>(gpu_group_by_buffers.data),
      ra_exe_unit,
      query_mem_desc,
      total_thread_count,
      device_id);
  CHECK_EQ(
      rows_copy.size(),
      static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
#else
  UNREACHABLE();
#endif
}

void QueryMemoryInitializer::compactProjectionBuffersCpu ( const QueryMemoryDescriptor &  query_mem_desc,
const size_t  projection_count
)
private

Definition at line 1064 of file QueryMemoryInitializer.cpp.

References CHECK, anonymous_namespace{QueryMemoryInitializer.cpp}::compact_projection_buffer_for_cpu_columnar(), QueryMemoryDescriptor::getEntryCount(), group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), and result_sets_.

{
  const auto num_allocated_rows =
      std::min(projection_count, query_mem_desc.getEntryCount());
  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;

  // copy the results from the main buffer into projection_buffer
  compact_projection_buffer_for_cpu_columnar(
      query_mem_desc,
      reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
      num_allocated_rows);

  // update the entry count for the result set, and its underlying storage
  CHECK(!result_sets_.empty());
  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
}

void QueryMemoryInitializer::compactProjectionBuffersGpu ( const QueryMemoryDescriptor &  query_mem_desc,
Data_Namespace::DataMgr *  data_mgr,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const size_t  projection_count,
const int  device_id
)
private

Definition at line 1082 of file QueryMemoryInitializer.cpp.

References CHECK, copy_projection_buffer_from_gpu_columnar(), QueryMemoryDescriptor::getEntryCount(), group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), and result_sets_.

{
  // store total number of allocated rows:
  const auto num_allocated_rows =
      std::min(projection_count, query_mem_desc.getEntryCount());

  // copy the results from the main buffer into projection_buffer
  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
  copy_projection_buffer_from_gpu_columnar(
      data_mgr,
      gpu_group_by_buffers,
      query_mem_desc,
      reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
      num_allocated_rows,
      device_id);

  // update the entry count for the result set, and its underlying storage
  CHECK(!result_sets_.empty());
  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
}

size_t QueryMemoryInitializer::computeNumberOfBuffers ( const QueryMemoryDescriptor &  query_mem_desc,
const ExecutorDeviceType  device_type,
const Executor *  executor
) const
private

Definition at line 1020 of file QueryMemoryInitializer.cpp.

References QueryMemoryDescriptor::blocksShareMemory(), and CPU.

{
  return device_type == ExecutorDeviceType::CPU
             ? 1
             : executor->blockSize() *
                   (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
}


void QueryMemoryInitializer::copyFromTableFunctionGpuBuffers ( Data_Namespace::DataMgr *  data_mgr,
const QueryMemoryDescriptor &  query_mem_desc,
const size_t  entry_count,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const int  device_id,
const unsigned  block_size_x,
const unsigned  grid_size_x
)

Definition at line 984 of file QueryMemoryInitializer.cpp.

References align_to_int64(), CHECK_LE, Data_Namespace::DataMgr::createGpuAllocator(), GpuGroupByBuffers::data, GpuGroupByBuffers::entry_count, QueryMemoryDescriptor::getBufferColSlotCount(), QueryMemoryDescriptor::getColSlotContext(), and group_by_buffers_.

{
  const size_t num_columns = query_mem_desc.getBufferColSlotCount();

  int8_t* dev_buffer = gpu_group_by_buffers.data;
  int8_t* host_buffer = reinterpret_cast<int8_t*>(group_by_buffers_[0]);

  const size_t original_entry_count = gpu_group_by_buffers.entry_count;
  CHECK_LE(entry_count, original_entry_count);
  size_t output_device_col_offset{0};
  size_t output_host_col_offset{0};

  const auto col_slot_context = query_mem_desc.getColSlotContext();

  auto allocator = data_mgr->createGpuAllocator(device_id);

  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
    const size_t col_width = col_slot_context.getSlotInfo(col_idx).logical_size;
    const size_t output_device_col_size = original_entry_count * col_width;
    const size_t output_host_col_size = entry_count * col_width;
    allocator->copyFromDevice(host_buffer + output_host_col_offset,
                              dev_buffer + output_device_col_offset,
                              output_host_col_size);
    output_device_col_offset =
        align_to_int64(output_device_col_offset + output_device_col_size);
    output_host_col_offset =
        align_to_int64(output_host_col_offset + output_host_col_size);
  }
}
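Both offsets advance by the column's raw byte size rounded up to an 8-byte boundary, which is what align_to_int64 does. A standalone sketch of the offset walk (align_to_int64 re-implemented here for illustration; widths are assumed):

#include <cstddef>
#include <iostream>
#include <vector>

// Illustrative re-implementation: round up to the next 8-byte boundary.
std::size_t align_to_int64(std::size_t n) {
  return (n + 7) & ~std::size_t(7);
}

int main() {
  const std::size_t entry_count = 100;
  const std::vector<std::size_t> col_widths = {4, 8, 2};  // assumed logical sizes

  std::size_t offset = 0;
  for (const auto w : col_widths) {
    std::cout << "column starts at byte " << offset << "\n";  // 0, 400, 1200
    offset = align_to_int64(offset + entry_count * w);
  }
  std::cout << "total buffer bytes: " << offset << "\n";  // 1400
}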

void QueryMemoryInitializer::copyGroupByBuffersFromGpu ( DeviceAllocator &  device_allocator,
const QueryMemoryDescriptor &  query_mem_desc,
const size_t  entry_count,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const RelAlgExecutionUnit *  ra_exe_unit,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const int  device_id,
const bool  prepend_index_buffer
) const

Definition at line 1107 of file QueryMemoryInitializer.cpp.

References copy_group_by_buffers_from_gpu(), GpuGroupByBuffers::data, streaming_top_n::get_heap_size(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getRowSize(), GPU, group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), SortInfo::limit, anonymous_namespace{Utm.h}::n, SortInfo::offset, RelAlgExecutionUnit::sort_info, and QueryMemoryDescriptor::useStreamingTopN().

{
  const auto thread_count = block_size_x * grid_size_x;

  size_t total_buff_size{0};
  if (ra_exe_unit && query_mem_desc.useStreamingTopN()) {
    const size_t n = ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit;
    total_buff_size =
        streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
  } else {
    total_buff_size =
        query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
  }
  copy_group_by_buffers_from_gpu(device_allocator,
                                 group_by_buffers_,
                                 total_buff_size,
                                 gpu_group_by_buffers.data,
                                 query_mem_desc,
                                 block_size_x,
                                 grid_size_x,
                                 device_id,
                                 prepend_index_buffer,
                                 query_mem_desc.hasVarlenOutput());
}

GpuGroupByBuffers QueryMemoryInitializer::createAndInitializeGroupByBufferGpu ( const RelAlgExecutionUnit &  ra_exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const int8_t *  init_agg_vals_dev_ptr,
const int  device_id,
const ExecutorDispatchMode  dispatch_mode,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const int8_t  warp_size,
const bool  can_sort_on_gpu,
const bool  output_columnar,
RenderAllocator *  render_allocator
)
private

Definition at line 833 of file QueryMemoryInitializer.cpp.

References Allocator::alloc(), CHECK, CHECK_EQ, DeviceAllocator::copyToDevice(), create_dev_group_by_buffers(), device_allocator_, RenderAllocator::getAllocatedSize(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEffectiveKeyWidth(), QueryMemoryDescriptor::getEntryCount(), getGroupByBuffersSize(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getRowSize(), QueryMemoryDescriptor::getSlotCount(), GPU, group_by_buffers_, QueryMemoryDescriptor::hasKeylessHash(), QueryMemoryDescriptor::hasVarlenOutput(), i, init_columnar_group_by_buffer_on_device(), init_group_by_buffer_on_device(), QueryMemoryDescriptor::interleavedBins(), QueryMemoryDescriptor::lazyInitGroups(), SortInfo::limit, anonymous_namespace{Utm.h}::n, num_rows_, SortInfo::offset, prepareTopNHeapsDevBuffer(), row_set_mem_owner_, RelAlgExecutionUnit::sort_info, thread_idx_, QueryMemoryDescriptor::threadsShareMemory(), UNREACHABLE, RelAlgExecutionUnit::use_bump_allocator, QueryMemoryDescriptor::useStreamingTopN(), varlen_output_buffer_, varlen_output_buffer_host_ptr_, varlen_output_info_, and QueryMemoryDescriptor::varlenOutputBufferElemSize().

{
#ifdef HAVE_CUDA
  if (query_mem_desc.useStreamingTopN()) {
    if (render_allocator) {
      throw StreamingTopNNotSupportedInRenderQuery();
    }
    const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
    CHECK(!output_columnar);

    return prepareTopNHeapsDevBuffer(
        query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
  }

  auto dev_group_by_buffers =
      create_dev_group_by_buffers(device_allocator_,
                                  group_by_buffers_,
                                  query_mem_desc,
                                  block_size_x,
                                  grid_size_x,
                                  device_id,
                                  dispatch_mode,
                                  num_rows_,
                                  can_sort_on_gpu,
                                  false,
                                  ra_exe_unit.use_bump_allocator,
                                  query_mem_desc.hasVarlenOutput(),
                                  render_allocator);
  if (query_mem_desc.hasVarlenOutput()) {
    CHECK(dev_group_by_buffers.varlen_output_buffer);
    varlen_output_buffer_ =
        reinterpret_cast<CUdeviceptr>(dev_group_by_buffers.varlen_output_buffer);
    CHECK(query_mem_desc.varlenOutputBufferElemSize());
    const size_t varlen_output_buf_bytes =
        query_mem_desc.getEntryCount() *
        query_mem_desc.varlenOutputBufferElemSize().value();
    varlen_output_buffer_host_ptr_ =
        row_set_mem_owner_->allocate(varlen_output_buf_bytes, thread_idx_);
    CHECK(varlen_output_info_);
    varlen_output_info_->gpu_start_address = static_cast<int64_t>(varlen_output_buffer_);
    varlen_output_info_->cpu_buffer_ptr = varlen_output_buffer_host_ptr_;
  }
  if (render_allocator) {
    CHECK_EQ(size_t(0), render_allocator->getAllocatedSize() % 8);
  }
  if (query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU)) {
    CHECK(!render_allocator);

    const size_t step{query_mem_desc.threadsShareMemory() ? block_size_x : 1};
    size_t groups_buffer_size{query_mem_desc.getBufferSizeBytes(
        ExecutorDeviceType::GPU, dev_group_by_buffers.entry_count)};
    auto group_by_dev_buffer = dev_group_by_buffers.data;
    const size_t col_count = query_mem_desc.getSlotCount();
    int8_t* col_widths_dev_ptr{nullptr};
    if (output_columnar) {
      std::vector<int8_t> compact_col_widths(col_count);
      for (size_t idx = 0; idx < col_count; ++idx) {
        compact_col_widths[idx] = query_mem_desc.getPaddedSlotWidthBytes(idx);
      }
      col_widths_dev_ptr = device_allocator_->alloc(col_count * sizeof(int8_t));
      device_allocator_->copyToDevice(
          col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
    }
    const int8_t warp_count =
        query_mem_desc.interleavedBins(ExecutorDeviceType::GPU) ? warp_size : 1;
    const auto num_group_by_buffers =
        getGroupByBuffersSize() - (query_mem_desc.hasVarlenOutput() ? 1 : 0);
    for (size_t i = 0; i < num_group_by_buffers; i += step) {
      if (output_columnar) {
        init_columnar_group_by_buffer_on_device(
            reinterpret_cast<int64_t*>(group_by_dev_buffer),
            reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
            dev_group_by_buffers.entry_count,
            query_mem_desc.getGroupbyColCount(),
            col_count,
            col_widths_dev_ptr,
            /*need_padding = */ true,
            query_mem_desc.hasKeylessHash(),
            sizeof(int64_t),
            block_size_x,
            grid_size_x);
      } else {
        init_group_by_buffer_on_device(
            reinterpret_cast<int64_t*>(group_by_dev_buffer),
            reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
            dev_group_by_buffers.entry_count,
            query_mem_desc.getGroupbyColCount(),
            query_mem_desc.getEffectiveKeyWidth(),
            query_mem_desc.getRowSize() / sizeof(int64_t),
            query_mem_desc.hasKeylessHash(),
            warp_count,
            block_size_x,
            grid_size_x);
      }
      group_by_dev_buffer += groups_buffer_size;
    }
  }
  return dev_group_by_buffers;
#else
  UNREACHABLE();
  return {};
#endif
}

int64_t QueryMemoryInitializer::getAggInitValForIndex ( const size_t  index) const
inline

Definition at line 96 of file QueryMemoryInitializer.h.

References CHECK_LT, and init_agg_vals_.

{
  CHECK_LT(index, init_agg_vals_.size());
  return init_agg_vals_[index];
}
const auto QueryMemoryInitializer::getCountDistinctBitmapBytes ( ) const
inline

Definition at line 72 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_mem_bytes_.

{
  return count_distinct_bitmap_mem_bytes_;
}
const auto QueryMemoryInitializer::getCountDistinctBitmapPtr ( ) const
inline

Definition at line 68 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_mem_.

{
  return count_distinct_bitmap_mem_;
}

const auto QueryMemoryInitializer::getCountDistinctHostPtr ( ) const
inline

Definition at line 70 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_host_mem_.

{
  return count_distinct_bitmap_host_mem_;
}

const auto QueryMemoryInitializer::getGroupByBuffersPtr ( )
inline

Definition at line 101 of file QueryMemoryInitializer.h.

References group_by_buffers_.

{
  return reinterpret_cast<int64_t**>(group_by_buffers_.data());
}
const auto QueryMemoryInitializer::getGroupByBuffersSize ( ) const
inline

Definition at line 105 of file QueryMemoryInitializer.h.

References group_by_buffers_.

Referenced by createAndInitializeGroupByBufferGpu().

{ return group_by_buffers_.size(); }


const auto QueryMemoryInitializer::getNumBuffers ( ) const
inline

Definition at line 107 of file QueryMemoryInitializer.h.

References CHECK_EQ, group_by_buffers_, and num_buffers_.

{
  CHECK_EQ(num_buffers_, group_by_buffers_.size());
  return num_buffers_;
}
ResultSet* QueryMemoryInitializer::getResultSet ( const size_t  index) const
inline

Definition at line 81 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

{
  CHECK_LT(index, result_sets_.size());
  return result_sets_[index].get();
}
std::unique_ptr<ResultSet> QueryMemoryInitializer::getResultSetOwned ( const size_t  index)
inline

Definition at line 86 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

{
  CHECK_LT(index, result_sets_.size());
  return std::move(result_sets_[index]);
}
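getResultSetOwned() moves the unique_ptr out of result_sets_, leaving a null slot behind that resetResultSet() can subsequently clear. A generic, self-contained sketch of the pattern, with int standing in for ResultSet:

#include <iostream>
#include <memory>
#include <utility>
#include <vector>

int main() {
  std::vector<std::unique_ptr<int>> result_sets;
  result_sets.emplace_back(std::make_unique<int>(42));

  std::unique_ptr<int> owned = std::move(result_sets[0]);  // transfer ownership out
  std::cout << std::boolalpha << (result_sets[0] == nullptr) << "\n";  // true
  std::cout << *owned << "\n";  // 42; lives independently of the container
}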
const auto QueryMemoryInitializer::getVarlenOutputHostPtr ( ) const
inline

Definition at line 77 of file QueryMemoryInitializer.h.

References varlen_output_buffer_host_ptr_.

{
  return varlen_output_buffer_host_ptr_;
}

std::shared_ptr< VarlenOutputInfo > QueryMemoryInitializer::getVarlenOutputInfo ( )
private

Definition at line 1183 of file QueryMemoryInitializer.cpp.

References varlen_output_buffer_, varlen_output_buffer_host_ptr_, and varlen_output_info_.

Referenced by QueryMemoryInitializer().

{
  if (varlen_output_info_) {
    return varlen_output_info_;
  }

  // shared_ptr so that both the ResultSet and QMI can hold on to the varlen info object
  // and update it as needed
  varlen_output_info_ = std::make_shared<VarlenOutputInfo>(VarlenOutputInfo{
      static_cast<int64_t>(varlen_output_buffer_), varlen_output_buffer_host_ptr_});
  return varlen_output_info_;
}
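The construct-once-then-share pattern lets both this object and any ResultSet observe later updates, such as the GPU start address filled in by createAndInitializeGroupByBufferGpu(). A simplified, self-contained sketch of the same pattern:

#include <cstdint>
#include <iostream>
#include <memory>

// Simplified stand-in for VarlenOutputInfo.
struct Info {
  int64_t gpu_start_address{0};
};

struct Holder {
  std::shared_ptr<Info> info_;
  std::shared_ptr<Info> getInfo() {
    if (info_) {
      return info_;
    }
    info_ = std::make_shared<Info>();  // construct once, then share
    return info_;
  }
};

int main() {
  Holder h;
  auto consumer = h.getInfo();               // e.g. handed to a ResultSet
  h.getInfo()->gpu_start_address = 0xdead;   // later update through the holder
  std::cout << std::hex << consumer->gpu_start_address << "\n";  // dead
}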


const auto QueryMemoryInitializer::getVarlenOutputPtr ( ) const
inline

Definition at line 79 of file QueryMemoryInitializer.h.

References varlen_output_buffer_.

{ return varlen_output_buffer_; }
void QueryMemoryInitializer::initColumnarGroups ( const QueryMemoryDescriptor &  query_mem_desc,
int64_t *  groups_buffer,
const std::vector< int64_t > &  init_vals,
const Executor *  executor
)
private

Definition at line 530 of file QueryMemoryInitializer.cpp.

References align_to_int64(), CHECK, CHECK_LT, EMPTY_KEY_64, g_bigint_count, get_target_info(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getQueryDescriptionType(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::hasKeylessHash(), i, is_distinct_target(), Projection, and TableFunction.

Referenced by initGroupByBuffer().

{
  CHECK(groups_buffer);
  if (query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::TableFunction) {
    // As an optimization we don't init table function buffers as we expect outputs to
    // be dense
    return;
  }
  for (const auto target_expr : executor->plan_state_->target_exprs_) {
    const auto agg_info = get_target_info(target_expr, g_bigint_count);
    CHECK(!is_distinct_target(agg_info));
  }
  const int32_t agg_col_count = query_mem_desc.getSlotCount();
  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);

  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
  if (!query_mem_desc.hasKeylessHash()) {
    const size_t key_count{query_mem_desc.getGroupbyColCount()};
    for (size_t i = 0; i < key_count; ++i) {
      buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                               EMPTY_KEY_64,
                                               groups_buffer_entry_count);
    }
  }

  if (query_mem_desc.getQueryDescriptionType() != QueryDescriptionType::Projection) {
    // initializing all aggregate columns:
    int32_t init_val_idx = 0;
    for (int32_t i = 0; i < agg_col_count; ++i) {
      if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
        CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
        switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
          case 1:
            buffer_ptr = initColumnarBuffer<int8_t>(
                buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
            break;
          case 2:
            buffer_ptr =
                initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
                                            init_vals[init_val_idx++],
                                            groups_buffer_entry_count);
            break;
          case 4:
            buffer_ptr =
                initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
                                            init_vals[init_val_idx++],
                                            groups_buffer_entry_count);
            break;
          case 8:
            buffer_ptr =
                initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                            init_vals[init_val_idx++],
                                            groups_buffer_entry_count);
            break;
          case 0:
            break;
          default:
            CHECK(false);
        }

        buffer_ptr = align_to_int64(buffer_ptr);
      }
    }
  }
}
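initColumnarBuffer<T> is not shown on this page; it fills one column's run of entries with an init value and returns the pointer advanced past that run. A plausible self-contained sketch of such a helper, under that assumption:

#include <cstdint>
#include <iostream>
#include <vector>

// Assumed shape of the initColumnarBuffer<T> helper referenced above: fill
// `entry_count` values of width T with `init_val`, return the advanced pointer.
template <typename T>
int8_t* init_columnar_buffer(T* buffer_ptr, const T init_val, const uint32_t entry_count) {
  static_assert(sizeof(T) <= sizeof(int64_t), "Unsupported slot width");
  for (uint32_t i = 0; i < entry_count; ++i) {
    buffer_ptr[i] = init_val;
  }
  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
}

int main() {
  std::vector<int8_t> buffer(4 * sizeof(int32_t) + 4 * sizeof(int64_t));
  auto* ptr = buffer.data();
  ptr = init_columnar_buffer<int32_t>(reinterpret_cast<int32_t*>(ptr), -1, 4);
  ptr = init_columnar_buffer<int64_t>(reinterpret_cast<int64_t*>(ptr), 0, 4);
  std::cout << (ptr - buffer.data()) << " bytes initialized\n";  // 48
}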

void QueryMemoryInitializer::initColumnsPerRow ( const QueryMemoryDescriptor &  query_mem_desc,
int8_t *  row_ptr,
const std::vector< int64_t > &  init_vals,
const std::vector< int64_t > &  bitmap_sizes,
const std::vector< QuantileParam > &  quantile_params
)
private

Definition at line 599 of file QueryMemoryInitializer.cpp.

References allocateCountDistinctBitmap(), allocateCountDistinctSet(), CHECK, CHECK_EQ, CHECK_LT, QueryMemoryDescriptor::getNextColOffInBytesRowOnly(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::isGroupBy(), and row_set_mem_owner_.

Referenced by initRowGroups().

{
  int8_t* col_ptr = row_ptr;
  size_t init_vec_idx = 0;
  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
       col_ptr += query_mem_desc.getNextColOffInBytesRowOnly(col_ptr, col_idx++)) {
    const int64_t bm_sz{bitmap_sizes[col_idx]};
    int64_t init_val{0};
    if (bm_sz && query_mem_desc.isGroupBy()) {
      // COUNT DISTINCT / APPROX_COUNT_DISTINCT
      CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
               sizeof(int64_t));
      init_val =
          bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
      ++init_vec_idx;
    } else if (query_mem_desc.isGroupBy() && quantile_params[col_idx]) {
      auto const q = *quantile_params[col_idx];
      // allocate for APPROX_QUANTILE only when slot is used
      init_val = reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest(q));
      ++init_vec_idx;
    } else {
      if (query_mem_desc.getPaddedSlotWidthBytes(col_idx) > 0) {
        CHECK_LT(init_vec_idx, init_vals.size());
        init_val = init_vals[init_vec_idx++];
      }
    }
    switch (query_mem_desc.getPaddedSlotWidthBytes(col_idx)) {
      case 1:
        *col_ptr = static_cast<int8_t>(init_val);
        break;
      case 2:
        *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
        break;
      case 4:
        *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
        break;
      case 8:
        *reinterpret_cast<int64_t*>(col_ptr) = init_val;
        break;
      case 0:
        continue;
      default:
        CHECK(false);
    }
  }
}

void QueryMemoryInitializer::initGroupByBuffer ( int64_t *  buffer,
const RelAlgExecutionUnit &  ra_exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const ExecutorDeviceType  device_type,
const bool  output_columnar,
const Executor *  executor
)
private

Definition at line 402 of file QueryMemoryInitializer.cpp.

References streaming_top_n::get_rows_offset_of_heaps(), QueryMemoryDescriptor::getEntryCount(), GPU, init_agg_vals_, initColumnarGroups(), initRowGroups(), QueryMemoryDescriptor::interleavedBins(), SortInfo::limit, anonymous_namespace{Utm.h}::n, SortInfo::offset, RelAlgExecutionUnit::sort_info, and QueryMemoryDescriptor::useStreamingTopN().

Referenced by QueryMemoryInitializer().

408  {
409  if (output_columnar) {
410  initColumnarGroups(query_mem_desc, buffer, init_agg_vals_, executor);
411  } else {
412  auto rows_ptr = buffer;
413  auto actual_entry_count = query_mem_desc.getEntryCount();
414  const auto thread_count = device_type == ExecutorDeviceType::GPU
415  ? executor->blockSize() * executor->gridSize()
416  : 1;
417  auto warp_size =
418  query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
419  if (query_mem_desc.useStreamingTopN()) {
420  const auto node_count_size = thread_count * sizeof(int64_t);
421  memset(rows_ptr, 0, node_count_size);
422  const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
423  const auto rows_offset = streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
424  memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
425  rows_ptr += rows_offset / sizeof(int64_t);
426  actual_entry_count = n * thread_count;
427  warp_size = 1;
428  }
429  initRowGroups(query_mem_desc,
430  rows_ptr,
431  init_agg_vals_,
432  actual_entry_count,
433  warp_size,
434  executor);
435  }
436 }
size_t get_rows_offset_of_heaps(const size_t n, const size_t thread_count)
const size_t limit
std::vector< int64_t > init_agg_vals_
void initColumnarGroups(const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const Executor *executor)
bool interleavedBins(const ExecutorDeviceType) const
const size_t offset
void initRowGroups(const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const int32_t groups_buffer_entry_count, const size_t warp_size, const Executor *executor)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
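
In the streaming top-n branch above, the group-by buffer is laid out as [per-thread node counts][per-thread heap slots][row storage]: the counts are zeroed, the heap slots are filled with -1 (empty), and the row pointer is advanced past both. The host-side sketch below replays that arithmetic, assuming get_rows_offset_of_heaps(n, thread_count) evaluates to (1 + n) * thread_count * sizeof(int64_t), which is the layout the two memsets imply (an assumption, not a verified reading of streaming_top_n).

#include <cstdint>
#include <cstring>
#include <vector>

// Assumed layout helper: one node count per thread, followed by n heap
// slots per thread, all int64_t.
size_t rows_offset_of_heaps(size_t n, size_t thread_count) {
  return (1 + n) * thread_count * sizeof(int64_t);
}

int main() {
  const size_t thread_count = 4;  // blockSize() * gridSize() on GPU, 1 on CPU
  const size_t n = 10;            // sort offset + limit
  const size_t row_quad = 3;      // row size in int64_t units (illustrative)

  std::vector<int64_t> buffer((1 + n + n * row_quad) * thread_count);
  int64_t* rows_ptr = buffer.data();

  const size_t node_count_size = thread_count * sizeof(int64_t);
  std::memset(rows_ptr, 0, node_count_size);  // heap node counts start at 0
  const size_t rows_offset = rows_offset_of_heaps(n, thread_count);
  // Mark every heap slot as empty, exactly as the second memset above does.
  std::memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
  rows_ptr += rows_offset / sizeof(int64_t);  // row storage begins here
  const size_t actual_entry_count = n * thread_count;  // n rows per thread
  (void)rows_ptr;
  (void)actual_entry_count;
}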

void QueryMemoryInitializer::initRowGroups ( const QueryMemoryDescriptor query_mem_desc,
int64_t *  groups_buffer,
const std::vector< int64_t > &  init_vals,
const int32_t  groups_buffer_entry_count,
const size_t  warp_size,
const Executor executor 
)
private

Definition at line 438 of file QueryMemoryInitializer.cpp.

References allocateCountDistinctBuffers(), allocateTDigests(), CHECK, result_set::fill_empty_key(), ResultSet::fixupQueryMemoryDescriptor(), g_optimize_row_initialization, QueryMemoryDescriptor::getColOffInBytes(), QueryMemoryDescriptor::getEffectiveKeyWidth(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getRowSize(), QueryMemoryDescriptor::hasKeylessHash(), and initColumnsPerRow().

Referenced by initGroupByBuffer().

443  {
444  const size_t key_count{query_mem_desc.getGroupbyColCount()};
445  const size_t row_size{query_mem_desc.getRowSize()};
446  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};
447 
448  auto agg_bitmap_size = allocateCountDistinctBuffers(query_mem_desc, true, executor);
449  auto quantile_params = allocateTDigests(query_mem_desc, true, executor);
450  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
451 
452  const auto query_mem_desc_fixedup =
453      ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);
454 
455  auto const is_true = [](auto const& x) { return static_cast<bool>(x); };
456  // not COUNT DISTINCT / APPROX_COUNT_DISTINCT / APPROX_QUANTILE
457  // we fall back to the default implementation in those cases
458  if (!std::any_of(agg_bitmap_size.begin(), agg_bitmap_size.end(), is_true) &&
459  !std::any_of(quantile_params.begin(), quantile_params.end(), is_true) &&
460  g_optimize_row_initialization) {
461  std::vector<int8_t> sample_row(row_size - col_base_off);
462 
463  initColumnsPerRow(query_mem_desc_fixedup,
464  sample_row.data(),
465  init_vals,
466  agg_bitmap_size,
467  quantile_params);
468 
469  if (query_mem_desc.hasKeylessHash()) {
470  CHECK(warp_size >= 1);
471  CHECK(key_count == 1 || warp_size == 1);
472  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
473  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
474  ++bin, buffer_ptr += row_size) {
475  memcpy(buffer_ptr + col_base_off, sample_row.data(), sample_row.size());
476  }
477  }
478  return;
479  }
480 
481  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
482  ++bin, buffer_ptr += row_size) {
483  memcpy(buffer_ptr + col_base_off, sample_row.data(), sample_row.size());
484  result_set::fill_empty_key(
485  buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
486  }
487  } else {
488  if (query_mem_desc.hasKeylessHash()) {
489  CHECK(warp_size >= 1);
490  CHECK(key_count == 1 || warp_size == 1);
491  for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
492  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
493  ++bin, buffer_ptr += row_size) {
494  initColumnsPerRow(query_mem_desc_fixedup,
495  &buffer_ptr[col_base_off],
496  init_vals,
497  agg_bitmap_size,
498  quantile_params);
499  }
500  }
501  return;
502  }
503 
504  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
505  ++bin, buffer_ptr += row_size) {
506  result_set::fill_empty_key(
507  buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
508  initColumnsPerRow(query_mem_desc_fixedup,
509  &buffer_ptr[col_base_off],
510  init_vals,
511  agg_bitmap_size,
512  quantile_params);
513  }
514  }
515 }
void initColumnsPerRow(const QueryMemoryDescriptor &query_mem_desc, int8_t *row_ptr, const std::vector< int64_t > &init_vals, const std::vector< int64_t > &bitmap_sizes, const std::vector< QuantileParam > &quantile_params)
std::vector< int64_t > allocateCountDistinctBuffers(const QueryMemoryDescriptor &query_mem_desc, const bool deferred, const Executor *executor)
size_t getEffectiveKeyWidth() const
std::vector< QuantileParam > allocateTDigests(const QueryMemoryDescriptor &query_mem_desc, const bool deferred, const Executor *executor)
size_t getGroupbyColCount() const
void fill_empty_key(void *key_ptr, const size_t key_count, const size_t key_width)
static QueryMemoryDescriptor fixupQueryMemoryDescriptor(const QueryMemoryDescriptor &)
Definition: ResultSet.cpp:642
#define CHECK(condition)
Definition: Logger.h:211
bool g_optimize_row_initialization
Definition: Execute.cpp:97
size_t getColOffInBytes(const size_t col_idx) const

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
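
The g_optimize_row_initialization fast path works because a row with no count-distinct bitmap and no t-digest contains only plain values: one prototype row can be initialized once and memcpy'd into every bin, instead of running the slot-by-slot initializer per bin. A simplified sketch of the two strategies, with a hypothetical init_row() standing in for initColumnsPerRow():

#include <cstdint>
#include <cstring>
#include <vector>

// Hypothetical per-row initializer standing in for initColumnsPerRow().
void init_row(int8_t* row_ptr, size_t size) {
  std::memset(row_ptr, 0, size);  // the real code writes per-slot init values
}

int main() {
  const size_t entry_count = 1024;
  const size_t row_size = 48;      // key columns + aggregate slots, in bytes
  const size_t col_base_off = 16;  // offset of the first aggregate slot
  std::vector<int8_t> groups_buffer(entry_count * row_size);
  int8_t* buffer_ptr = groups_buffer.data();

  const bool rows_are_pod = true;  // no bitmaps / t-digests in any slot
  if (rows_are_pod) {
    // Fast path: initialize one sample row, then replicate it into each bin.
    std::vector<int8_t> sample_row(row_size - col_base_off);
    init_row(sample_row.data(), sample_row.size());
    for (size_t bin = 0; bin < entry_count; ++bin, buffer_ptr += row_size) {
      std::memcpy(buffer_ptr + col_base_off, sample_row.data(), sample_row.size());
    }
  } else {
    // Slow path: every bin gets its own slot-by-slot initialization, since
    // pointer-valued slots need a distinct allocation per row.
    for (size_t bin = 0; bin < entry_count; ++bin, buffer_ptr += row_size) {
      init_row(buffer_ptr + col_base_off, row_size - col_base_off);
    }
  }
}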

GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer ( const QueryMemoryDescriptor query_mem_desc,
const int8_t *  init_agg_vals_dev_ptr,
const size_t  n,
const int  device_id,
const unsigned  block_size_x,
const unsigned  grid_size_x 
)
private

Definition at line 779 of file QueryMemoryInitializer.cpp.

References Allocator::alloc(), CHECK, DeviceAllocator::copyToDevice(), device_allocator_, streaming_top_n::get_heap_size(), streaming_top_n::get_rows_offset_of_heaps(), QueryMemoryDescriptor::getEffectiveKeyWidth(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getRowSize(), GPU, QueryMemoryDescriptor::hasKeylessHash(), init_group_by_buffer_on_device(), QueryMemoryDescriptor::lazyInitGroups(), DeviceAllocator::setDeviceMem(), UNREACHABLE, and DeviceAllocator::zeroDeviceMem().

Referenced by createAndInitializeGroupByBufferGpu().

785  {
786 #ifdef HAVE_CUDA
787  CHECK(query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU));
788  const auto thread_count = block_size_x * grid_size_x;
789  const auto total_buff_size =
790  streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
791  int8_t* dev_buffer = device_allocator_->alloc(total_buff_size);
792 
793  std::vector<int8_t*> dev_buffers(thread_count);
794 
795  for (size_t i = 0; i < thread_count; ++i) {
796  dev_buffers[i] = dev_buffer;
797  }
798 
799  auto dev_ptr = device_allocator_->alloc(thread_count * sizeof(int8_t*));
800  device_allocator_->copyToDevice(
801  dev_ptr, dev_buffers.data(), thread_count * sizeof(int8_t*));
802 
803  CHECK(query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU));
804 
805  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(dev_buffer),
806  thread_count * sizeof(int64_t));
807 
808  device_allocator_->setDeviceMem(
809  reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
810  (unsigned char)-1,
811  thread_count * n * sizeof(int64_t));
812 
813  init_group_by_buffer_on_device(
814  reinterpret_cast<int64_t*>(
815  dev_buffer + streaming_top_n::get_rows_offset_of_heaps(n, thread_count)),
816  reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
817  n * thread_count,
818  query_mem_desc.getGroupbyColCount(),
819  query_mem_desc.getEffectiveKeyWidth(),
820  query_mem_desc.getRowSize() / sizeof(int64_t),
821  query_mem_desc.hasKeylessHash(),
822  1,
823  block_size_x,
824  grid_size_x);
825 
826  return {dev_ptr, dev_buffer};
827 #else
828  UNREACHABLE();
829  return {};
830 #endif
831 }
DeviceAllocator * device_allocator_
size_t get_rows_offset_of_heaps(const size_t n, const size_t thread_count)
#define UNREACHABLE()
Definition: Logger.h:255
virtual int8_t * alloc(const size_t num_bytes)=0
size_t getEffectiveKeyWidth() const
void init_group_by_buffer_on_device(int64_t *groups_buffer, const int64_t *init_vals, const uint32_t groups_buffer_entry_count, const uint32_t key_count, const uint32_t key_width, const uint32_t row_size_quad, const bool keyless, const int8_t warp_size, const size_t block_size_x, const size_t grid_size_x)
virtual void copyToDevice(void *device_dst, const void *host_src, const size_t num_bytes) const =0
size_t getGroupbyColCount() const
virtual void zeroDeviceMem(int8_t *device_ptr, const size_t num_bytes) const =0
bool lazyInitGroups(const ExecutorDeviceType) const
size_t get_heap_size(const size_t row_size, const size_t n, const size_t thread_count)
#define CHECK(condition)
Definition: Logger.h:211
virtual void setDeviceMem(int8_t *device_ptr, unsigned char uc, const size_t num_bytes) const =0

+ Here is the call graph for this function:

+ Here is the caller graph for this function:
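
The single device allocation must cover the same three regions described for initGroupByBuffer(), with the row payload included. A plausible composition of streaming_top_n::get_heap_size(), consistent with the offsets used above (an assumption, not the verified implementation), is sketched below. Note also that every entry of the pointer table copied to the device aliases the same base buffer: the kernel derives each thread's heap offset from its thread id, so the table is deliberately degenerate.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Assumed composition: per-thread node count + n heap slots + n rows of
// row_size bytes, all packed into one allocation in int64_t units.
size_t heap_size(size_t row_size, size_t n, size_t thread_count) {
  const size_t row_size_quad = row_size / sizeof(int64_t);
  return (1 + n + n * row_size_quad) * thread_count * sizeof(int64_t);
}

int main() {
  const unsigned block_size_x = 128, grid_size_x = 16;
  const size_t thread_count = block_size_x * grid_size_x;
  const size_t n = 100;        // offset + limit
  const size_t row_size = 40;  // bytes; a multiple of 8 in this sketch

  std::cout << "total_buff_size = " << heap_size(row_size, n, thread_count)
            << " bytes\n";

  // Every thread's pointer-table entry aliases the same base allocation.
  std::vector<int8_t*> dev_buffers(thread_count, /*dev_buffer=*/nullptr);
  (void)dev_buffers;
}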

void QueryMemoryInitializer::resetResultSet ( const size_t  index)
inline

Definition at line 91 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

91  {
92  CHECK_LT(index, result_sets_.size());
93  result_sets_[index].reset();
94  }
#define CHECK_LT(x, y)
Definition: Logger.h:221
std::vector< std::unique_ptr< ResultSet > > result_sets_
GpuGroupByBuffers QueryMemoryInitializer::setupTableFunctionGpuBuffers ( const QueryMemoryDescriptor query_mem_desc,
const int  device_id,
const unsigned  block_size_x,
const unsigned  grid_size_x 
)

Definition at line 947 of file QueryMemoryInitializer.cpp.

References align_to_int64(), Allocator::alloc(), CHECK, CHECK_GT, DeviceAllocator::copyToDevice(), device_allocator_, QueryMemoryDescriptor::getBufferColSlotCount(), QueryMemoryDescriptor::getColSlotContext(), ColSlotContext::getSlotInfo(), SlotSize::logical_size, and num_rows_.

951  {
952  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
953  CHECK_GT(num_columns, size_t(0));
954  size_t total_group_by_buffer_size{0};
955  const auto col_slot_context = query_mem_desc.getColSlotContext();
956 
957  std::vector<size_t> col_byte_offsets;
958  col_byte_offsets.reserve(num_columns);
959 
960  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
961  const size_t col_width = col_slot_context.getSlotInfo(col_idx).logical_size;
962  size_t group_buffer_size = num_rows_ * col_width;
963  col_byte_offsets.emplace_back(total_group_by_buffer_size);
964  total_group_by_buffer_size =
965  align_to_int64(total_group_by_buffer_size + group_buffer_size);
966  }
967 
968  int8_t* dev_buffers_allocation{nullptr};
969  dev_buffers_allocation = device_allocator_->alloc(total_group_by_buffer_size);
970  CHECK(dev_buffers_allocation);
971 
972  auto dev_buffers_mem = dev_buffers_allocation;
973  std::vector<int8_t*> dev_buffers(num_columns);
974  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
975  dev_buffers[col_idx] = dev_buffers_allocation + col_byte_offsets[col_idx];
976  }
977  auto dev_ptrs = device_allocator_->alloc(num_columns * sizeof(CUdeviceptr));
978  device_allocator_->copyToDevice(
979  dev_ptrs, dev_buffers.data(), num_columns * sizeof(CUdeviceptr));
980 
981  return {dev_ptrs, dev_buffers_mem, (size_t)num_rows_};
982 }
int8_t logical_size
DeviceAllocator * device_allocator_
unsigned long long CUdeviceptr
Definition: nocuda.h:27
virtual int8_t * alloc(const size_t num_bytes)=0
#define CHECK_GT(x, y)
Definition: Logger.h:223
const SlotSize & getSlotInfo(const size_t slot_idx) const
virtual void copyToDevice(void *device_dst, const void *host_src, const size_t num_bytes) const =0
const ColSlotContext & getColSlotContext() const
#define CHECK(condition)
Definition: Logger.h:211
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)

+ Here is the call graph for this function:
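
Table function output is columnar, so each column's buffer is carved out of one allocation at an 8-byte-aligned offset. The sketch below reproduces that offset computation on the host, with align_to_int64() reimplemented as the usual round-up and hypothetical per-slot logical sizes:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Round up to the next multiple of sizeof(int64_t), mirroring align_to_int64().
size_t align_to_int64(size_t addr) {
  return (addr + sizeof(int64_t) - 1) & ~(sizeof(int64_t) - 1);
}

int main() {
  const int64_t num_rows = 1000;
  const std::vector<size_t> logical_sizes{4, 8, 2};  // hypothetical slot widths

  size_t total_group_by_buffer_size = 0;
  std::vector<size_t> col_byte_offsets;
  col_byte_offsets.reserve(logical_sizes.size());
  for (const size_t col_width : logical_sizes) {
    // Record where this column starts, then advance past num_rows values,
    // padding the running total so the next column stays 8-byte aligned.
    col_byte_offsets.push_back(total_group_by_buffer_size);
    total_group_by_buffer_size =
        align_to_int64(total_group_by_buffer_size + num_rows * col_width);
  }

  // One base allocation; column i starts at base + col_byte_offsets[i].
  for (size_t i = 0; i < col_byte_offsets.size(); ++i) {
    std::cout << "col " << i << " offset " << col_byte_offsets[i] << "\n";
  }
  std::cout << "total " << total_group_by_buffer_size << " bytes\n";
}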

Friends And Related Function Documentation

friend class Executor
friend

Definition at line 243 of file QueryMemoryInitializer.h.

friend class QueryExecutionContext
friend

Definition at line 244 of file QueryMemoryInitializer.h.

Member Data Documentation

int8_t* QueryMemoryInitializer::count_distinct_bitmap_crt_ptr_
private
int8_t* QueryMemoryInitializer::count_distinct_bitmap_host_mem_
private
CUdeviceptr QueryMemoryInitializer::count_distinct_bitmap_mem_
private
size_t QueryMemoryInitializer::count_distinct_bitmap_mem_bytes_
private
DeviceAllocator* QueryMemoryInitializer::device_allocator_ {nullptr}
private
std::vector<int64_t> QueryMemoryInitializer::init_agg_vals_
private
size_t QueryMemoryInitializer::num_buffers_
private
const int64_t QueryMemoryInitializer::num_rows_
private
std::vector<std::unique_ptr<ResultSet> > QueryMemoryInitializer::result_sets_
private
std::vector<Data_Namespace::AbstractBuffer*> QueryMemoryInitializer::temporary_buffers_
private

Definition at line 239 of file QueryMemoryInitializer.h.

const size_t QueryMemoryInitializer::thread_idx_
private
CUdeviceptr QueryMemoryInitializer::varlen_output_buffer_
private
int8_t* QueryMemoryInitializer::varlen_output_buffer_host_ptr_
private
std::shared_ptr<VarlenOutputInfo> QueryMemoryInitializer::varlen_output_info_
private

The documentation for this class was generated from the following files: