OmniSciDB  c0231cc57d
QueryMemoryInitializer Class Reference

#include <QueryMemoryInitializer.h>


Public Member Functions

 QueryMemoryInitializer (const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int device_id, const ExecutorDeviceType device_type, const ExecutorDispatchMode dispatch_mode, const bool output_columnar, const bool sort_on_gpu, const int outer_table_id, const int64_t num_rows, const std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, RenderAllocatorMap *render_allocator_map, RenderInfo *render_info, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, DeviceAllocator *gpu_allocator, const size_t thread_idx, const Executor *executor)
 
 QueryMemoryInitializer (const TableFunctionExecutionUnit &exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int device_id, const ExecutorDeviceType device_type, const int64_t num_rows, const std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner, DeviceAllocator *device_allocator, const Executor *executor)
 
const auto getCountDistinctBitmapPtr () const
 
const auto getCountDistinctHostPtr () const
 
const auto getCountDistinctBitmapBytes () const
 
const auto getVarlenOutputHostPtr () const
 
const auto getVarlenOutputPtr () const
 
ResultSet * getResultSet (const size_t index) const
 
std::unique_ptr< ResultSet > getResultSetOwned (const size_t index)
 
void resetResultSet (const size_t index)
 
int64_t getAggInitValForIndex (const size_t index) const
 
const auto getGroupByBuffersPtr ()
 
const auto getGroupByBuffersSize () const
 
const auto getNumBuffers () const
 
GpuGroupByBuffers setupTableFunctionGpuBuffers (const QueryMemoryDescriptor &query_mem_desc, const int device_id, const unsigned block_size_x, const unsigned grid_size_x, const bool zero_initialize_buffers)
 
void copyFromTableFunctionGpuBuffers (Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const size_t entry_count, const GpuGroupByBuffers &gpu_group_by_buffers, const int device_id, const unsigned block_size_x, const unsigned grid_size_x)
 
void copyGroupByBuffersFromGpu (DeviceAllocator &device_allocator, const QueryMemoryDescriptor &query_mem_desc, const size_t entry_count, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit *ra_exe_unit, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer) const
 

Private Types

using QuantileParam = std::optional< double >
 

Private Member Functions

void initGroupByBuffer (int64_t *buffer, const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const bool output_columnar, const Executor *executor)
 
void initRowGroups (const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const int32_t groups_buffer_entry_count, const size_t warp_size, const Executor *executor)
 
void initColumnarGroups (const QueryMemoryDescriptor &query_mem_desc, int64_t *groups_buffer, const std::vector< int64_t > &init_vals, const Executor *executor)
 
void initColumnsPerRow (const QueryMemoryDescriptor &query_mem_desc, int8_t *row_ptr, const std::vector< int64_t > &init_vals, const std::vector< int64_t > &bitmap_sizes, const std::vector< QuantileParam > &quantile_params)
 
void allocateCountDistinctGpuMem (const QueryMemoryDescriptor &query_mem_desc)
 
std::vector< int64_t > allocateCountDistinctBuffers (const QueryMemoryDescriptor &query_mem_desc, const bool deferred, const Executor *executor)
 
int64_t allocateCountDistinctBitmap (const size_t bitmap_byte_sz)
 
int64_t allocateCountDistinctSet ()
 
std::vector< QuantileParam > allocateTDigests (const QueryMemoryDescriptor &query_mem_desc, const bool deferred, const Executor *executor)
 
GpuGroupByBuffers prepareTopNHeapsDevBuffer (const QueryMemoryDescriptor &query_mem_desc, const int8_t *init_agg_vals_dev_ptr, const size_t n, const int device_id, const unsigned block_size_x, const unsigned grid_size_x)
 
GpuGroupByBuffers createAndInitializeGroupByBufferGpu (const RelAlgExecutionUnit &ra_exe_unit, const QueryMemoryDescriptor &query_mem_desc, const int8_t *init_agg_vals_dev_ptr, const int device_id, const ExecutorDispatchMode dispatch_mode, const unsigned block_size_x, const unsigned grid_size_x, const int8_t warp_size, const bool can_sort_on_gpu, const bool output_columnar, RenderAllocator *render_allocator)
 
size_t computeNumberOfBuffers (const QueryMemoryDescriptor &query_mem_desc, const ExecutorDeviceType device_type, const Executor *executor) const
 
void compactProjectionBuffersCpu (const QueryMemoryDescriptor &query_mem_desc, const size_t projection_count)
 
void compactProjectionBuffersGpu (const QueryMemoryDescriptor &query_mem_desc, Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_group_by_buffers, const size_t projection_count, const int device_id)
 
void applyStreamingTopNOffsetCpu (const QueryMemoryDescriptor &query_mem_desc, const RelAlgExecutionUnit &ra_exe_unit)
 
void applyStreamingTopNOffsetGpu (Data_Namespace::DataMgr *data_mgr, const QueryMemoryDescriptor &query_mem_desc, const GpuGroupByBuffers &gpu_group_by_buffers, const RelAlgExecutionUnit &ra_exe_unit, const unsigned total_thread_count, const int device_id)
 
std::shared_ptr< VarlenOutputInfo > getVarlenOutputInfo ()
 

Private Attributes

const int64_t num_rows_
 
std::shared_ptr< RowSetMemoryOwner > row_set_mem_owner_
 
std::vector< std::unique_ptr< ResultSet > > result_sets_
 
std::vector< int64_t > init_agg_vals_
 
size_t num_buffers_
 
std::vector< int64_t * > group_by_buffers_
 
std::shared_ptr< VarlenOutputInfo > varlen_output_info_
 
CUdeviceptr varlen_output_buffer_
 
int8_t * varlen_output_buffer_host_ptr_
 
CUdeviceptr count_distinct_bitmap_mem_
 
size_t count_distinct_bitmap_mem_bytes_
 
int8_t * count_distinct_bitmap_crt_ptr_
 
int8_t * count_distinct_bitmap_host_mem_
 
DeviceAllocator * device_allocator_ {nullptr}
 
std::vector< Data_Namespace::AbstractBuffer * > temporary_buffers_
 
const size_t thread_idx_
 

Friends

class Executor
 
class QueryExecutionContext
 

Detailed Description

Definition at line 35 of file QueryMemoryInitializer.h.

Member Typedef Documentation

using QueryMemoryInitializer::QuantileParam = std::optional<double>
private

Definition at line 157 of file QueryMemoryInitializer.h.

Constructor & Destructor Documentation

QueryMemoryInitializer::QueryMemoryInitializer ( const RelAlgExecutionUnit &  ra_exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const int  device_id,
const ExecutorDeviceType  device_type,
const ExecutorDispatchMode  dispatch_mode,
const bool  output_columnar,
const bool  sort_on_gpu,
const int  outer_table_id,
const int64_t  num_rows,
const std::vector< std::vector< const int8_t * >> &  col_buffers,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
RenderAllocatorMap *  render_allocator_map,
RenderInfo *  render_info,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
DeviceAllocator *  gpu_allocator,
const size_t  thread_idx,
const Executor *  executor 
)

Definition at line 167 of file QueryMemoryInitializer.cpp.

References anonymous_namespace{QueryMemoryInitializer.cpp}::alloc_group_by_buffer(), allocateCountDistinctBuffers(), allocateCountDistinctGpuMem(), allocateTDigests(), CHECK, CHECK_GE, anonymous_namespace{QueryMemoryInitializer.cpp}::check_total_bitmap_memory(), RelAlgExecutionUnit::estimator, ResultSet::fixupQueryMemoryDescriptor(), g_max_memory_allocation_size, anonymous_namespace{QueryMemoryInitializer.cpp}::get_col_frag_offsets(), anonymous_namespace{QueryMemoryInitializer.cpp}::get_consistent_frags_sizes(), anonymous_namespace{QueryMemoryInitializer.cpp}::get_input_idx(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), getVarlenOutputInfo(), GPU, group_by_buffers_, QueryMemoryDescriptor::hasKeylessHash(), QueryMemoryDescriptor::hasVarlenOutput(), initGroupByBuffer(), QueryMemoryDescriptor::interleavedBins(), QueryMemoryDescriptor::isGroupBy(), KernelPerFragment, QueryMemoryDescriptor::lazyInitGroups(), num_buffers_, result_sets_, row_set_mem_owner_, RelAlgExecutionUnit::target_exprs, target_exprs_to_infos(), RelAlgExecutionUnit::target_exprs_union, thread_idx_, QueryMemoryDescriptor::threadsShareMemory(), RelAlgExecutionUnit::union_all, RelAlgExecutionUnit::use_bump_allocator, RenderInfo::useCudaBuffers(), and QueryMemoryDescriptor::varlenOutputBufferElemSize().

    : num_rows_(num_rows)
    , row_set_mem_owner_(row_set_mem_owner)
    , init_agg_vals_(executor->plan_state_->init_agg_vals_)
    , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
    , varlen_output_buffer_(0)
    , varlen_output_buffer_host_ptr_(nullptr)
    , count_distinct_bitmap_mem_(0)
    , count_distinct_bitmap_mem_bytes_(0)
    , count_distinct_bitmap_crt_ptr_(nullptr)
    , count_distinct_bitmap_host_mem_(nullptr)
    , device_allocator_(device_allocator)
    , thread_idx_(thread_idx) {
  CHECK(!sort_on_gpu || output_columnar);

  const auto& consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
  if (consistent_frag_sizes.empty()) {
    // No fragments in the input, no underlying buffers will be needed.
    return;
  }
  if (!ra_exe_unit.use_bump_allocator) {
    check_total_bitmap_memory(query_mem_desc);
  }
  if (device_type == ExecutorDeviceType::GPU) {
    allocateCountDistinctGpuMem(query_mem_desc);
  }

  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
    allocateCountDistinctBuffers(query_mem_desc, false, executor);
    allocateTDigests(query_mem_desc, false, executor);
    if (render_info && render_info->useCudaBuffers()) {
      return;
    }
  }

  if (ra_exe_unit.estimator) {
    return;
  }

  const auto thread_count = device_type == ExecutorDeviceType::GPU
                                ? executor->blockSize() * executor->gridSize()
                                : 1;

  size_t group_buffer_size{0};
  if (ra_exe_unit.use_bump_allocator) {
    // For kernel-per-fragment execution, just allocate a buffer equivalent to the
    // size of the fragment.
    if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
      group_buffer_size = num_rows * query_mem_desc.getRowSize();
    } else {
      // Otherwise, allocate a GPU buffer equivalent to the maximum GPU allocation size.
      group_buffer_size = g_max_memory_allocation_size / query_mem_desc.getRowSize();
    }
  } else {
    group_buffer_size =
        query_mem_desc.getBufferSizeBytes(ra_exe_unit, thread_count, device_type);
  }
  CHECK_GE(group_buffer_size, size_t(0));

  const auto group_buffers_count = !query_mem_desc.isGroupBy() ? 1 : num_buffers_;
  int64_t* group_by_buffer_template{nullptr};
  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
    group_by_buffer_template = reinterpret_cast<int64_t*>(
        row_set_mem_owner_->allocate(group_buffer_size, thread_idx_));
    initGroupByBuffer(group_by_buffer_template,
                      ra_exe_unit,
                      query_mem_desc,
                      device_type,
                      output_columnar,
                      executor);
  }

  if (query_mem_desc.interleavedBins(device_type)) {
    CHECK(query_mem_desc.hasKeylessHash());
  }

  const auto step = device_type == ExecutorDeviceType::GPU &&
                            query_mem_desc.threadsShareMemory() &&
                            query_mem_desc.isGroupBy()
                        ? executor->blockSize()
                        : size_t(1);
  const auto index_buffer_qw = device_type == ExecutorDeviceType::GPU && sort_on_gpu &&
                                       query_mem_desc.hasKeylessHash()
                                   ? query_mem_desc.getEntryCount()
                                   : size_t(0);
  const auto actual_group_buffer_size =
      group_buffer_size + index_buffer_qw * sizeof(int64_t);
  CHECK_GE(actual_group_buffer_size, group_buffer_size);

  if (query_mem_desc.hasVarlenOutput()) {
    const auto varlen_buffer_elem_size_opt = query_mem_desc.varlenOutputBufferElemSize();
    CHECK(varlen_buffer_elem_size_opt);  // TODO(adb): relax
    auto varlen_output_buffer = reinterpret_cast<int64_t*>(row_set_mem_owner_->allocate(
        query_mem_desc.getEntryCount() * varlen_buffer_elem_size_opt.value()));
    num_buffers_ += 1;
    group_by_buffers_.push_back(varlen_output_buffer);
  }

  for (size_t i = 0; i < group_buffers_count; i += step) {
    auto group_by_buffer = alloc_group_by_buffer(actual_group_buffer_size,
                                                 render_allocator_map,
                                                 thread_idx_,
                                                 row_set_mem_owner_.get());
    if (!query_mem_desc.lazyInitGroups(device_type)) {
      if (group_by_buffer_template) {
        memcpy(group_by_buffer + index_buffer_qw,
               group_by_buffer_template,
               group_buffer_size);
      } else {
        initGroupByBuffer(group_by_buffer + index_buffer_qw,
                          ra_exe_unit,
                          query_mem_desc,
                          device_type,
                          output_columnar,
                          executor);
      }
    }
    group_by_buffers_.push_back(group_by_buffer);
    for (size_t j = 1; j < step; ++j) {
      group_by_buffers_.push_back(nullptr);
    }
    const bool use_target_exprs_union =
        ra_exe_unit.union_all && get_input_idx(ra_exe_unit, outer_table_id);
    const auto& target_exprs = use_target_exprs_union ? ra_exe_unit.target_exprs_union
                                                      : ra_exe_unit.target_exprs;
    const auto column_frag_offsets = get_col_frag_offsets(target_exprs, frag_offsets);
    const auto column_frag_sizes =
        get_consistent_frags_sizes(target_exprs, consistent_frag_sizes);

    result_sets_.emplace_back(
        new ResultSet(target_exprs_to_infos(target_exprs, query_mem_desc),
                      executor->getColLazyFetchInfo(target_exprs),
                      col_buffers,
                      column_frag_offsets,
                      column_frag_sizes,
                      device_type,
                      device_id,
                      ResultSet::fixupQueryMemoryDescriptor(query_mem_desc),
                      row_set_mem_owner_,
                      executor->getCatalog(),
                      executor->blockSize(),
                      executor->gridSize()));
    result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
                                         executor->plan_state_->init_agg_vals_,
                                         getVarlenOutputInfo());
    for (size_t j = 1; j < step; ++j) {
      result_sets_.emplace_back(nullptr);
    }
  }
}
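The trickiest step in this constructor is sizing the group-by buffers. A minimal standalone sketch of that branch, with hypothetical constants standing in for query_mem_desc.getRowSize() and g_max_memory_allocation_size (none of the names below are part of OmniSciDB):

#include <cstddef>
#include <iostream>

enum class DispatchMode { KernelPerFragment, MultifragmentKernel };

// Illustrative stand-ins for values the real constructor reads from the
// QueryMemoryDescriptor and the global config.
constexpr size_t kRowSize = 32;              // query_mem_desc.getRowSize()
constexpr size_t kMaxAllocSize = 1UL << 31;  // g_max_memory_allocation_size

// Hypothetical helper mirroring the sizing branch above: with the bump
// allocator, kernel-per-fragment dispatch sizes the buffer to the fragment;
// otherwise the maximum-allocation fallback from the source is used.
size_t pick_group_buffer_size(const bool use_bump_allocator,
                              const DispatchMode mode,
                              const size_t num_rows) {
  if (use_bump_allocator) {
    return mode == DispatchMode::KernelPerFragment ? num_rows * kRowSize
                                                   : kMaxAllocSize / kRowSize;
  }
  return 0;  // the real code asks query_mem_desc.getBufferSizeBytes(...) here
}

int main() {
  std::cout << pick_group_buffer_size(true, DispatchMode::KernelPerFragment, 1000)
            << " bytes for a 1000-row fragment\n";  // 32000
}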


QueryMemoryInitializer::QueryMemoryInitializer ( const TableFunctionExecutionUnit &  exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const int  device_id,
const ExecutorDeviceType  device_type,
const int64_t  num_rows,
const std::vector< std::vector< const int8_t * >> &  col_buffers,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
std::shared_ptr< RowSetMemoryOwner >  row_set_mem_owner,
DeviceAllocator *  device_allocator,
const Executor *  executor 
)

Definition at line 336 of file QueryMemoryInitializer.cpp.

    : num_rows_(num_rows)
    , row_set_mem_owner_(row_set_mem_owner)
    ...

Member Function Documentation

int64_t QueryMemoryInitializer::allocateCountDistinctBitmap ( const size_t  bitmap_byte_sz)
private

Definition at line 734 of file QueryMemoryInitializer.cpp.

References CHECK, count_distinct_bitmap_crt_ptr_, count_distinct_bitmap_host_mem_, row_set_mem_owner_, and thread_idx_.

Referenced by allocateCountDistinctBuffers(), and initColumnsPerRow().

{
  if (count_distinct_bitmap_host_mem_) {
    CHECK(count_distinct_bitmap_crt_ptr_);
    auto ptr = count_distinct_bitmap_crt_ptr_;
    count_distinct_bitmap_crt_ptr_ += bitmap_byte_sz;
    row_set_mem_owner_->addCountDistinctBuffer(
        ptr, bitmap_byte_sz, /*physical_buffer=*/false);
    return reinterpret_cast<int64_t>(ptr);
  }
  return reinterpret_cast<int64_t>(
      row_set_mem_owner_->allocateCountDistinctBuffer(bitmap_byte_sz, thread_idx_));
}
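The returned int64_t is the bitmap buffer's address, stored in the aggregate init-value slot so it can be recovered as a pointer later. A self-contained sketch of that pointer round trip (the buffer size is made up):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int8_t> bitmap(512, 0);  // bitmapPaddedSizeBytes(), illustrative
  // Stuff the buffer address into a 64-bit aggregate slot...
  const int64_t slot = reinterpret_cast<int64_t>(bitmap.data());
  // ...and recover it where the aggregate is updated.
  auto* recovered = reinterpret_cast<int8_t*>(slot);
  recovered[3] = 1;                                  // touch one bitmap bucket
  std::cout << static_cast<int>(bitmap[3]) << "\n";  // prints 1
}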


std::vector< int64_t > QueryMemoryInitializer::allocateCountDistinctBuffers ( const QueryMemoryDescriptor &  query_mem_desc,
const bool  deferred,
const Executor *  executor 
)
private

Definition at line 688 of file QueryMemoryInitializer.cpp.

References allocateCountDistinctBitmap(), allocateCountDistinctSet(), Bitmap, CHECK, CHECK_EQ, CHECK_GE, CHECK_LT, g_bigint_count, get_target_info(), QueryMemoryDescriptor::getCountDistinctDescriptor(), QueryMemoryDescriptor::getLogicalSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), init_agg_vals_, Invalid, is_distinct_target(), kAPPROX_COUNT_DISTINCT, kCOUNT, and UnorderedSet.

Referenced by initRowGroups(), and QueryMemoryInitializer().

{
  const size_t agg_col_count{query_mem_desc.getSlotCount()};
  std::vector<int64_t> agg_bitmap_size(deferred ? agg_col_count : 0);

  CHECK_GE(agg_col_count, executor->plan_state_->target_exprs_.size());
  for (size_t target_idx = 0; target_idx < executor->plan_state_->target_exprs_.size();
       ++target_idx) {
    const auto target_expr = executor->plan_state_->target_exprs_[target_idx];
    const auto agg_info = get_target_info(target_expr, g_bigint_count);
    if (is_distinct_target(agg_info)) {
      CHECK(agg_info.is_agg &&
            (agg_info.agg_kind == kCOUNT || agg_info.agg_kind == kAPPROX_COUNT_DISTINCT));
      CHECK(!agg_info.sql_type.is_varlen());

      const size_t agg_col_idx = query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
      CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);

      CHECK_EQ(static_cast<size_t>(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx)),
               sizeof(int64_t));
      const auto& count_distinct_desc =
          query_mem_desc.getCountDistinctDescriptor(target_idx);
      CHECK(count_distinct_desc.impl_type_ != CountDistinctImplType::Invalid);
      if (count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap) {
        const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
        if (deferred) {
          agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
        } else {
          init_agg_vals_[agg_col_idx] = allocateCountDistinctBitmap(bitmap_byte_sz);
        }
      } else {
        CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::UnorderedSet);
        if (deferred) {
          agg_bitmap_size[agg_col_idx] = -1;
        } else {
          init_agg_vals_[agg_col_idx] = allocateCountDistinctSet();
        }
      }
    }
  }

  return agg_bitmap_size;
}
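When deferred is true the function only records per-slot sizes (a positive byte count for a Bitmap slot, -1 for an UnorderedSet slot, 0 otherwise) and leaves allocation to initColumnsPerRow(); when false it allocates immediately and writes the buffer address into init_agg_vals_. A small model of reading back that encoding (the slot layout below is illustrative):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Hypothetical deferred result: slot 1 is a 4096-byte bitmap, slot 2 an
  // unordered set, slots 0 and 3 carry no count-distinct aggregate.
  const std::vector<int64_t> deferred_sizes{0, 4096, -1, 0};
  for (size_t slot = 0; slot < deferred_sizes.size(); ++slot) {
    if (deferred_sizes[slot] > 0) {
      std::cout << "slot " << slot << ": bitmap of " << deferred_sizes[slot]
                << " bytes\n";
    } else if (deferred_sizes[slot] == -1) {
      std::cout << "slot " << slot << ": unordered set\n";
    }
  }
}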



void QueryMemoryInitializer::allocateCountDistinctGpuMem ( const QueryMemoryDescriptor &  query_mem_desc)
private

Definition at line 656 of file QueryMemoryInitializer.cpp.

References Allocator::alloc(), Bitmap, CHECK, count_distinct_bitmap_crt_ptr_, count_distinct_bitmap_host_mem_, count_distinct_bitmap_mem_, count_distinct_bitmap_mem_bytes_, QueryMemoryDescriptor::countDistinctDescriptorsLogicallyEmpty(), device_allocator_, QueryMemoryDescriptor::getCountDistinctDescriptor(), QueryMemoryDescriptor::getCountDistinctDescriptorsSize(), QueryMemoryDescriptor::getEntryCount(), Invalid, row_set_mem_owner_, thread_idx_, and DeviceAllocator::zeroDeviceMem().

Referenced by QueryMemoryInitializer().

{
  if (query_mem_desc.countDistinctDescriptorsLogicallyEmpty()) {
    return;
  }
  CHECK(device_allocator_);

  size_t total_bytes_per_entry{0};
  const size_t num_count_distinct_descs =
      query_mem_desc.getCountDistinctDescriptorsSize();
  for (size_t i = 0; i < num_count_distinct_descs; i++) {
    const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
    if (count_distinct_desc.impl_type_ == CountDistinctImplType::Invalid) {
      continue;
    }
    CHECK(count_distinct_desc.impl_type_ == CountDistinctImplType::Bitmap);
    total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
  }

  count_distinct_bitmap_mem_bytes_ =
      total_bytes_per_entry * query_mem_desc.getEntryCount();
  count_distinct_bitmap_mem_ = reinterpret_cast<CUdeviceptr>(
      device_allocator_->alloc(count_distinct_bitmap_mem_bytes_));
  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(count_distinct_bitmap_mem_),
                                   count_distinct_bitmap_mem_bytes_);

  count_distinct_bitmap_host_mem_ = row_set_mem_owner_->allocateCountDistinctBuffer(
      count_distinct_bitmap_mem_bytes_, thread_idx_);
  count_distinct_bitmap_crt_ptr_ = count_distinct_bitmap_host_mem_;
}
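The device allocation is sized as one region per result entry, each entry carrying every bitmap-based descriptor back to back. A worked example of that arithmetic, with assumed descriptor sizes and entry count:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // Two bitmap-based count-distinct descriptors (padded sizes assumed).
  const std::vector<size_t> bitmap_padded_sizes{1024, 512};
  const size_t entry_count = 8192;  // query_mem_desc.getEntryCount()

  size_t total_bytes_per_entry = 0;
  for (const auto sz : bitmap_padded_sizes) {
    total_bytes_per_entry += sz;  // 1536 bytes per entry
  }
  // 1536 * 8192 = 12582912 bytes of zero-initialized device memory
  std::cout << total_bytes_per_entry * entry_count << " bytes\n";
}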



int64_t QueryMemoryInitializer::allocateCountDistinctSet ( )
private

Definition at line 747 of file QueryMemoryInitializer.cpp.

References row_set_mem_owner_.

Referenced by allocateCountDistinctBuffers(), and initColumnsPerRow().

{
  auto count_distinct_set = new CountDistinctSet();
  row_set_mem_owner_->addCountDistinctSet(count_distinct_set);
  return reinterpret_cast<int64_t>(count_distinct_set);
}


std::vector< QueryMemoryInitializer::QuantileParam > QueryMemoryInitializer::allocateTDigests ( const QueryMemoryDescriptor &  query_mem_desc,
const bool  deferred,
const Executor *  executor 
)
private

Definition at line 754 of file QueryMemoryInitializer.cpp.

References CHECK_EQ, CHECK_GE, CHECK_LT, QueryMemoryDescriptor::getLogicalSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::getSlotIndexForSingleSlotCol(), init_agg_vals_, kAPPROX_QUANTILE, and row_set_mem_owner_.

Referenced by initRowGroups(), and QueryMemoryInitializer().

{
  size_t const slot_count = query_mem_desc.getSlotCount();
  size_t const ntargets = executor->plan_state_->target_exprs_.size();
  CHECK_GE(slot_count, ntargets);
  std::vector<QuantileParam> quantile_params(deferred ? slot_count : 0);

  for (size_t target_idx = 0; target_idx < ntargets; ++target_idx) {
    auto const target_expr = executor->plan_state_->target_exprs_[target_idx];
    if (auto const agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr)) {
      if (agg_expr->get_aggtype() == kAPPROX_QUANTILE) {
        size_t const agg_col_idx =
            query_mem_desc.getSlotIndexForSingleSlotCol(target_idx);
        CHECK_LT(agg_col_idx, slot_count);
        CHECK_EQ(query_mem_desc.getLogicalSlotWidthBytes(agg_col_idx),
                 static_cast<int8_t>(sizeof(int64_t)));
        auto const q = agg_expr->get_arg1()->get_constval().doubleval;
        if (deferred) {
          quantile_params[agg_col_idx] = q;
        } else {
          // allocate for APPROX_QUANTILE only when the slot is used
          init_agg_vals_[agg_col_idx] =
              reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest(q));
        }
      }
    }
  }
  return quantile_params;
}
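QuantileParam is std::optional<double>: an empty optional means no APPROX_QUANTILE on that slot, while a value records the quantile q for deferred initialization. A short sketch of that encoding (the slot assignment below is made up):

#include <cstddef>
#include <iostream>
#include <optional>
#include <vector>

int main() {
  using QuantileParam = std::optional<double>;
  std::vector<QuantileParam> quantile_params(4);  // one entry per slot
  quantile_params[2] = 0.5;  // e.g. APPROX_QUANTILE(x, 0.5) landed on slot 2
  for (size_t slot = 0; slot < quantile_params.size(); ++slot) {
    if (quantile_params[slot]) {
      std::cout << "slot " << slot << ": q = " << *quantile_params[slot] << "\n";
    }
  }
}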



void QueryMemoryInitializer::applyStreamingTopNOffsetCpu ( const QueryMemoryDescriptor &  query_mem_desc,
const RelAlgExecutionUnit &  ra_exe_unit 
)
private

Definition at line 1151 of file QueryMemoryInitializer.cpp.

References CHECK_EQ, CPU, streaming_top_n::get_rows_copy_from_heaps(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), SortInfo::limit, SortInfo::offset, and RelAlgExecutionUnit::sort_info.

{
  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
  CHECK_EQ(group_by_buffers_.size(), buffer_start_idx + 1);

  const auto rows_copy = streaming_top_n::get_rows_copy_from_heaps(
      group_by_buffers_[buffer_start_idx],
      query_mem_desc.getBufferSizeBytes(ra_exe_unit, 1, ExecutorDeviceType::CPU),
      ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit,
      1);
  CHECK_EQ(rows_copy.size(),
           query_mem_desc.getEntryCount() * query_mem_desc.getRowSize());
  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
}
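The heap holds offset + limit candidate rows for the single CPU thread, and the copy written back must cover exactly entry_count * row_size bytes. A sketch of the arithmetic, assuming the entry count equals the heap capacity and using made-up sort_info values:

#include <cstddef>
#include <iostream>

int main() {
  const size_t offset = 10, limit = 20;  // ra_exe_unit.sort_info (assumed)
  const size_t n = offset + limit;       // heap capacity for the one CPU thread
  const size_t row_size = 48;            // query_mem_desc.getRowSize() (assumed)
  std::cout << "heap rows: " << n << ", bytes copied back: " << n * row_size
            << "\n";  // 30 rows, 1440 bytes
}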


void QueryMemoryInitializer::applyStreamingTopNOffsetGpu ( Data_Namespace::DataMgr *  data_mgr,
const QueryMemoryDescriptor &  query_mem_desc,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const RelAlgExecutionUnit &  ra_exe_unit,
const unsigned  total_thread_count,
const int  device_id 
)
private

Definition at line 1167 of file QueryMemoryInitializer.cpp.

References CHECK_EQ, GpuGroupByBuffers::data, QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), num_buffers_, and UNREACHABLE.

{
#ifdef HAVE_CUDA
  CHECK_EQ(group_by_buffers_.size(), num_buffers_);
  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;

  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
      data_mgr,
      reinterpret_cast<int64_t*>(gpu_group_by_buffers.data),
      ra_exe_unit,
      query_mem_desc,
      total_thread_count,
      device_id);
  CHECK_EQ(
      rows_copy.size(),
      static_cast<size_t>(query_mem_desc.getEntryCount() * query_mem_desc.getRowSize()));
  memcpy(group_by_buffers_[buffer_start_idx], &rows_copy[0], rows_copy.size());
#else
  UNREACHABLE();
#endif
}


void QueryMemoryInitializer::compactProjectionBuffersCpu ( const QueryMemoryDescriptor &  query_mem_desc,
const size_t  projection_count 
)
private

Definition at line 1075 of file QueryMemoryInitializer.cpp.

References CHECK, anonymous_namespace{QueryMemoryInitializer.cpp}::compact_projection_buffer_for_cpu_columnar(), QueryMemoryDescriptor::getEntryCount(), group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), and result_sets_.

{
  const auto num_allocated_rows =
      std::min(projection_count, query_mem_desc.getEntryCount());
  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;

  // copy the results from the main buffer into projection_buffer
  compact_projection_buffer_for_cpu_columnar(
      query_mem_desc,
      reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
      num_allocated_rows);

  // update the entry count for the result set, and its underlying storage
  CHECK(!result_sets_.empty());
  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
}


void QueryMemoryInitializer::compactProjectionBuffersGpu ( const QueryMemoryDescriptor &  query_mem_desc,
Data_Namespace::DataMgr *  data_mgr,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const size_t  projection_count,
const int  device_id 
)
private

Definition at line 1093 of file QueryMemoryInitializer.cpp.

References CHECK, copy_projection_buffer_from_gpu_columnar(), QueryMemoryDescriptor::getEntryCount(), group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), and result_sets_.

{
  // store total number of allocated rows:
  const auto num_allocated_rows =
      std::min(projection_count, query_mem_desc.getEntryCount());

  // copy the results from the main buffer into projection_buffer
  const size_t buffer_start_idx = query_mem_desc.hasVarlenOutput() ? 1 : 0;
  copy_projection_buffer_from_gpu_columnar(
      data_mgr,
      gpu_group_by_buffers,
      query_mem_desc,
      reinterpret_cast<int8_t*>(group_by_buffers_[buffer_start_idx]),
      num_allocated_rows,
      device_id);

  // update the entry count for the result set, and its underlying storage
  CHECK(!result_sets_.empty());
  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
}


size_t QueryMemoryInitializer::computeNumberOfBuffers ( const QueryMemoryDescriptor &  query_mem_desc,
const ExecutorDeviceType  device_type,
const Executor *  executor 
) const
private

Definition at line 1031 of file QueryMemoryInitializer.cpp.

References QueryMemoryDescriptor::blocksShareMemory(), and CPU.

{
  return device_type == ExecutorDeviceType::CPU
             ? 1
             : executor->blockSize() *
                   (query_mem_desc.blocksShareMemory() ? 1 : executor->gridSize());
}
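Restated: CPU runs use one buffer, while GPU runs use one buffer per block when blocks share memory and one per thread across the whole grid otherwise. A standalone restatement of the rule, with assumed launch dimensions:

#include <cstddef>
#include <iostream>

// Mirrors the ternary above; all parameters are illustrative inputs.
size_t compute_number_of_buffers(const bool is_cpu,
                                 const bool blocks_share_memory,
                                 const size_t block_size,
                                 const size_t grid_size) {
  return is_cpu ? 1 : block_size * (blocks_share_memory ? 1 : grid_size);
}

int main() {
  // e.g. blockSize() = 1024, gridSize() = 16
  std::cout << compute_number_of_buffers(false, true, 1024, 16) << "\n";   // 1024
  std::cout << compute_number_of_buffers(false, false, 1024, 16) << "\n";  // 16384
}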


void QueryMemoryInitializer::copyFromTableFunctionGpuBuffers ( Data_Namespace::DataMgr *  data_mgr,
const QueryMemoryDescriptor &  query_mem_desc,
const size_t  entry_count,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const int  device_id,
const unsigned  block_size_x,
const unsigned  grid_size_x 
)

Definition at line 994 of file QueryMemoryInitializer.cpp.

References align_to_int64(), CHECK_LE, GpuGroupByBuffers::data, GpuGroupByBuffers::entry_count, QueryMemoryDescriptor::getBufferColSlotCount(), QueryMemoryDescriptor::getColSlotContext(), getQueryEngineCudaStreamForDevice(), ColSlotContext::getSlotInfo(), group_by_buffers_, and SlotSize::logical_size.

{
  const size_t num_columns = query_mem_desc.getBufferColSlotCount();

  int8_t* dev_buffer = gpu_group_by_buffers.data;
  int8_t* host_buffer = reinterpret_cast<int8_t*>(group_by_buffers_[0]);

  const size_t original_entry_count = gpu_group_by_buffers.entry_count;
  CHECK_LE(entry_count, original_entry_count);
  size_t output_device_col_offset{0};
  size_t output_host_col_offset{0};

  const auto col_slot_context = query_mem_desc.getColSlotContext();

  auto allocator = std::make_unique<CudaAllocator>(
      data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));

  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
    const size_t col_width = col_slot_context.getSlotInfo(col_idx).logical_size;
    const size_t output_device_col_size = original_entry_count * col_width;
    const size_t output_host_col_size = entry_count * col_width;
    allocator->copyFromDevice(host_buffer + output_host_col_offset,
                              dev_buffer + output_device_col_offset,
                              output_host_col_size);
    output_device_col_offset =
        align_to_int64(output_device_col_offset + output_device_col_size);
    output_host_col_offset =
        align_to_int64(output_host_col_offset + output_host_col_size);
  }
}
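Each column slab is copied separately, and both running offsets are padded to an 8-byte boundary before the next column begins. A self-contained sketch of that stepping (align_to_int64 is re-implemented here purely for illustration; the column widths and entry count are assumptions):

#include <cstddef>
#include <iostream>
#include <vector>

// Round an offset up to the next multiple of 8, as the copy loop does.
size_t align_to_int64(const size_t addr) {
  return (addr + 7) & ~size_t(7);
}

int main() {
  const std::vector<size_t> col_widths{4, 1, 8};  // logical slot sizes (assumed)
  const size_t entry_count = 1000;
  size_t offset = 0;
  for (const auto w : col_widths) {
    std::cout << "column starts at byte " << offset << "\n";  // 0, 4000, 5008
    offset = align_to_int64(offset + entry_count * w);
  }
}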


void QueryMemoryInitializer::copyGroupByBuffersFromGpu ( DeviceAllocator &  device_allocator,
const QueryMemoryDescriptor &  query_mem_desc,
const size_t  entry_count,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const RelAlgExecutionUnit *  ra_exe_unit,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const int  device_id,
const bool  prepend_index_buffer 
) const

Definition at line 1118 of file QueryMemoryInitializer.cpp.

References copy_group_by_buffers_from_gpu(), GpuGroupByBuffers::data, streaming_top_n::get_heap_size(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getRowSize(), GPU, group_by_buffers_, QueryMemoryDescriptor::hasVarlenOutput(), SortInfo::limit, anonymous_namespace{Utm.h}::n, SortInfo::offset, RelAlgExecutionUnit::sort_info, and QueryMemoryDescriptor::useStreamingTopN().

{
  const auto thread_count = block_size_x * grid_size_x;

  size_t total_buff_size{0};
  if (ra_exe_unit && query_mem_desc.useStreamingTopN()) {
    const size_t n = ra_exe_unit->sort_info.offset + ra_exe_unit->sort_info.limit;
    total_buff_size =
        streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
  } else {
    total_buff_size =
        query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
  }
  copy_group_by_buffers_from_gpu(device_allocator,
                                 group_by_buffers_,
                                 total_buff_size,
                                 gpu_group_by_buffers.data,
                                 query_mem_desc,
                                 block_size_x,
                                 grid_size_x,
                                 device_id,
                                 prepend_index_buffer,
                                 query_mem_desc.hasVarlenOutput());
}
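The byte count copied back depends on the query shape: streaming top-n queries copy per-thread heaps sized by offset + limit, while everything else copies the descriptor-reported buffer size. A sketch with a deliberately simplified get_heap_size (a stand-in for streaming_top_n::get_heap_size, which also accounts for per-heap header words):

#include <cstddef>
#include <iostream>

// Simplified model: payload rows only, no heap headers.
size_t get_heap_size(const size_t row_size, const size_t n, const size_t thread_count) {
  return thread_count * n * row_size;
}

int main() {
  const size_t thread_count = 64 * 128;  // block_size_x * grid_size_x (assumed)
  const size_t n = 10 + 20;              // sort_info.offset + sort_info.limit
  std::cout << get_heap_size(48, n, thread_count)
            << " bytes copied for a streaming top-n query\n";
}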


GpuGroupByBuffers QueryMemoryInitializer::createAndInitializeGroupByBufferGpu ( const RelAlgExecutionUnit &  ra_exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const int8_t *  init_agg_vals_dev_ptr,
const int  device_id,
const ExecutorDispatchMode  dispatch_mode,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const int8_t  warp_size,
const bool  can_sort_on_gpu,
const bool  output_columnar,
RenderAllocator *  render_allocator 
)
private

Definition at line 839 of file QueryMemoryInitializer.cpp.

References Allocator::alloc(), CHECK, CHECK_EQ, DeviceAllocator::copyToDevice(), create_dev_group_by_buffers(), device_allocator_, RenderAllocator::getAllocatedSize(), QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEffectiveKeyWidth(), QueryMemoryDescriptor::getEntryCount(), getGroupByBuffersSize(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getRowSize(), QueryMemoryDescriptor::getSlotCount(), GPU, group_by_buffers_, QueryMemoryDescriptor::hasKeylessHash(), QueryMemoryDescriptor::hasVarlenOutput(), init_columnar_group_by_buffer_on_device(), init_group_by_buffer_on_device(), QueryMemoryDescriptor::interleavedBins(), QueryMemoryDescriptor::lazyInitGroups(), SortInfo::limit, anonymous_namespace{Utm.h}::n, num_rows_, SortInfo::offset, prepareTopNHeapsDevBuffer(), row_set_mem_owner_, RelAlgExecutionUnit::sort_info, thread_idx_, QueryMemoryDescriptor::threadsShareMemory(), UNREACHABLE, RelAlgExecutionUnit::use_bump_allocator, QueryMemoryDescriptor::useStreamingTopN(), varlen_output_buffer_, varlen_output_buffer_host_ptr_, varlen_output_info_, and QueryMemoryDescriptor::varlenOutputBufferElemSize().

{
#ifdef HAVE_CUDA
  if (query_mem_desc.useStreamingTopN()) {
    if (render_allocator) {
      // ... (statement elided in the original listing)
    }
    const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
    CHECK(!output_columnar);

    return prepareTopNHeapsDevBuffer(
        query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
  }

  auto dev_group_by_buffers =
      create_dev_group_by_buffers(device_allocator_,
                                  group_by_buffers_,
                                  query_mem_desc,
                                  block_size_x,
                                  grid_size_x,
                                  device_id,
                                  dispatch_mode,
                                  num_rows_,
                                  can_sort_on_gpu,
                                  false,
                                  ra_exe_unit.use_bump_allocator,
                                  query_mem_desc.hasVarlenOutput(),
                                  render_allocator);
  if (query_mem_desc.hasVarlenOutput()) {
    CHECK(dev_group_by_buffers.varlen_output_buffer);
    varlen_output_buffer_ =
        reinterpret_cast<CUdeviceptr>(dev_group_by_buffers.varlen_output_buffer);
    CHECK(query_mem_desc.varlenOutputBufferElemSize());
    const size_t varlen_output_buf_bytes =
        query_mem_desc.getEntryCount() *
        query_mem_desc.varlenOutputBufferElemSize().value();
    varlen_output_buffer_host_ptr_ =
        row_set_mem_owner_->allocate(varlen_output_buf_bytes, thread_idx_);
    CHECK(getVarlenOutputInfo());
    varlen_output_info_->gpu_start_address = static_cast<int64_t>(varlen_output_buffer_);
    varlen_output_info_->cpu_buffer_ptr = varlen_output_buffer_host_ptr_;
  }
  if (render_allocator) {
    CHECK_EQ(size_t(0), render_allocator->getAllocatedSize() % 8);
  }
  if (query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU)) {
    CHECK(!render_allocator);

    const size_t step{query_mem_desc.threadsShareMemory() ? block_size_x : 1};
    size_t groups_buffer_size{query_mem_desc.getBufferSizeBytes(
        ExecutorDeviceType::GPU, dev_group_by_buffers.entry_count)};
    auto group_by_dev_buffer = dev_group_by_buffers.data;
    const size_t col_count = query_mem_desc.getSlotCount();
    int8_t* col_widths_dev_ptr{nullptr};
    if (output_columnar) {
      std::vector<int8_t> compact_col_widths(col_count);
      for (size_t idx = 0; idx < col_count; ++idx) {
        compact_col_widths[idx] = query_mem_desc.getPaddedSlotWidthBytes(idx);
      }
      col_widths_dev_ptr = device_allocator_->alloc(col_count * sizeof(int8_t));
      device_allocator_->copyToDevice(
          col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
    }
    const int8_t warp_count =
        query_mem_desc.interleavedBins(ExecutorDeviceType::GPU) ? warp_size : 1;
    const auto num_group_by_buffers =
        getGroupByBuffersSize() - (query_mem_desc.hasVarlenOutput() ? 1 : 0);
    for (size_t i = 0; i < num_group_by_buffers; i += step) {
      if (output_columnar) {
        init_columnar_group_by_buffer_on_device(
            reinterpret_cast<int64_t*>(group_by_dev_buffer),
            reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
            dev_group_by_buffers.entry_count,
            query_mem_desc.getGroupbyColCount(),
            col_count,
            col_widths_dev_ptr,
            /*need_padding=*/true,
            query_mem_desc.hasKeylessHash(),
            sizeof(int64_t),
            block_size_x,
            grid_size_x);
      } else {
        init_group_by_buffer_on_device(
            reinterpret_cast<int64_t*>(group_by_dev_buffer),
            reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
            dev_group_by_buffers.entry_count,
            query_mem_desc.getGroupbyColCount(),
            query_mem_desc.getEffectiveKeyWidth(),
            query_mem_desc.getRowSize() / sizeof(int64_t),
            query_mem_desc.hasKeylessHash(),
            warp_count,
            block_size_x,
            grid_size_x);
      }
      group_by_dev_buffer += groups_buffer_size;
    }
  }
  return dev_group_by_buffers;
#else
  UNREACHABLE();
  return {};
#endif
}


int64_t QueryMemoryInitializer::getAggInitValForIndex ( const size_t  index) const
inline

Definition at line 96 of file QueryMemoryInitializer.h.

References CHECK_LT, and init_agg_vals_.

{
  CHECK_LT(index, init_agg_vals_.size());
  return init_agg_vals_[index];
}
const auto QueryMemoryInitializer::getCountDistinctBitmapBytes ( ) const
inline

Definition at line 72 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_mem_bytes_.

{
  return count_distinct_bitmap_mem_bytes_;
}
const auto QueryMemoryInitializer::getCountDistinctBitmapPtr ( ) const
inline

Definition at line 68 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_mem_.

const auto QueryMemoryInitializer::getCountDistinctHostPtr ( ) const
inline

Definition at line 70 of file QueryMemoryInitializer.h.

References count_distinct_bitmap_host_mem_.

const auto QueryMemoryInitializer::getGroupByBuffersPtr ( )
inline

Definition at line 101 of file QueryMemoryInitializer.h.

References group_by_buffers_.

{
  return reinterpret_cast<int64_t**>(group_by_buffers_.data());
}
const auto QueryMemoryInitializer::getGroupByBuffersSize ( ) const
inline

Definition at line 105 of file QueryMemoryInitializer.h.

References group_by_buffers_.

Referenced by createAndInitializeGroupByBufferGpu().

{ return group_by_buffers_.size(); }


const auto QueryMemoryInitializer::getNumBuffers ( ) const
inline

Definition at line 107 of file QueryMemoryInitializer.h.

References CHECK_EQ, group_by_buffers_, and num_buffers_.

{
  CHECK_EQ(num_buffers_, group_by_buffers_.size());
  return num_buffers_;
}
ResultSet* QueryMemoryInitializer::getResultSet ( const size_t  index) const
inline

Definition at line 81 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

{
  CHECK_LT(index, result_sets_.size());
  return result_sets_[index].get();
}
std::unique_ptr<ResultSet> QueryMemoryInitializer::getResultSetOwned ( const size_t  index)
inline

Definition at line 86 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

{
  CHECK_LT(index, result_sets_.size());
  return std::move(result_sets_[index]);
}
const auto QueryMemoryInitializer::getVarlenOutputHostPtr ( ) const
inline

Definition at line 77 of file QueryMemoryInitializer.h.

References varlen_output_buffer_host_ptr_.

std::shared_ptr< VarlenOutputInfo > QueryMemoryInitializer::getVarlenOutputInfo ( )
private

Definition at line 1194 of file QueryMemoryInitializer.cpp.

References varlen_output_buffer_, varlen_output_buffer_host_ptr_, and varlen_output_info_.

Referenced by QueryMemoryInitializer().

{
  if (varlen_output_info_) {
    return varlen_output_info_;
  }

  // shared_ptr so that both the ResultSet and QMI can hold on to the varlen info
  // object and update it as needed
  varlen_output_info_ = std::make_shared<VarlenOutputInfo>(VarlenOutputInfo{
      static_cast<int64_t>(varlen_output_buffer_), varlen_output_buffer_host_ptr_});
  return varlen_output_info_;
}


const auto QueryMemoryInitializer::getVarlenOutputPtr ( ) const
inline

Definition at line 79 of file QueryMemoryInitializer.h.

References varlen_output_buffer_.

{ return varlen_output_buffer_; }
void QueryMemoryInitializer::initColumnarGroups ( const QueryMemoryDescriptor &  query_mem_desc,
int64_t *  groups_buffer,
const std::vector< int64_t > &  init_vals,
const Executor *  executor 
)
private

Definition at line 540 of file QueryMemoryInitializer.cpp.

References align_to_int64(), CHECK, CHECK_LT, EMPTY_KEY_64, g_bigint_count, get_target_info(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getQueryDescriptionType(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::hasKeylessHash(), is_distinct_target(), and Projection.

Referenced by initGroupByBuffer().

{
  CHECK(groups_buffer);

  for (const auto target_expr : executor->plan_state_->target_exprs_) {
    const auto agg_info = get_target_info(target_expr, g_bigint_count);
    CHECK(!is_distinct_target(agg_info));
  }
  const int32_t agg_col_count = query_mem_desc.getSlotCount();
  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);

  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
  if (!query_mem_desc.hasKeylessHash()) {
    const size_t key_count{query_mem_desc.getGroupbyColCount()};
    for (size_t i = 0; i < key_count; ++i) {
      buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                               EMPTY_KEY_64,
                                               groups_buffer_entry_count);
    }
  }

  if (query_mem_desc.getQueryDescriptionType() != QueryDescriptionType::Projection) {
    // initializing all aggregate columns:
    int32_t init_val_idx = 0;
    for (int32_t i = 0; i < agg_col_count; ++i) {
      if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
        CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
        switch (query_mem_desc.getPaddedSlotWidthBytes(i)) {
          case 1:
            buffer_ptr = initColumnarBuffer<int8_t>(
                buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
            break;
          case 2:
            buffer_ptr =
                initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
                                            init_vals[init_val_idx++],
                                            groups_buffer_entry_count);
            break;
          case 4:
            buffer_ptr =
                initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
                                            init_vals[init_val_idx++],
                                            groups_buffer_entry_count);
            break;
          case 8:
            buffer_ptr =
                initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                            init_vals[init_val_idx++],
                                            groups_buffer_entry_count);
            break;
          case 0:
            break;
          default:
            CHECK(false);
        }

        buffer_ptr = align_to_int64(buffer_ptr);
      }
    }
  }
}
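The resulting layout is column-major: entry_count sentinel keys per group-by column, then entry_count init values per aggregate slot. A simplified, 8-byte-only sketch of that shape (the EMPTY_KEY_64 value used below is an assumption, and the real code also handles narrow slots and re-aligns to int64 between columns):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

int main() {
  const size_t entry_count = 4;                 // getEntryCount()
  const size_t key_count = 1;                   // getGroupbyColCount()
  const std::vector<int64_t> init_vals{0, -1};  // one init value per aggregate slot
  // Sentinel for an empty 64-bit key; the concrete EMPTY_KEY_64 value is assumed.
  const int64_t empty_key = std::numeric_limits<int64_t>::max();

  std::vector<int64_t> buffer;
  for (size_t k = 0; k < key_count; ++k) {
    buffer.insert(buffer.end(), entry_count, empty_key);  // one key column
  }
  for (const auto v : init_vals) {
    buffer.insert(buffer.end(), entry_count, v);  // one column per aggregate
  }
  std::cout << "buffer holds " << buffer.size() << " 64-bit slots\n";  // 12
}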



void QueryMemoryInitializer::initColumnsPerRow ( const QueryMemoryDescriptor &  query_mem_desc,
int8_t *  row_ptr,
const std::vector< int64_t > &  init_vals,
const std::vector< int64_t > &  bitmap_sizes,
const std::vector< QuantileParam > &  quantile_params 
)
private

Definition at line 605 of file QueryMemoryInitializer.cpp.

References allocateCountDistinctBitmap(), allocateCountDistinctSet(), CHECK, CHECK_EQ, CHECK_LT, QueryMemoryDescriptor::getNextColOffInBytesRowOnly(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getSlotCount(), QueryMemoryDescriptor::isGroupBy(), and row_set_mem_owner_.

Referenced by initRowGroups().

{
  int8_t* col_ptr = row_ptr;
  size_t init_vec_idx = 0;
  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
       col_ptr += query_mem_desc.getNextColOffInBytesRowOnly(col_ptr, col_idx++)) {
    const int64_t bm_sz{bitmap_sizes[col_idx]};
    int64_t init_val{0};
    if (bm_sz && query_mem_desc.isGroupBy()) {
      // COUNT DISTINCT / APPROX_COUNT_DISTINCT
      CHECK_EQ(static_cast<size_t>(query_mem_desc.getPaddedSlotWidthBytes(col_idx)),
               sizeof(int64_t));
      init_val =
          bm_sz > 0 ? allocateCountDistinctBitmap(bm_sz) : allocateCountDistinctSet();
      ++init_vec_idx;
    } else if (query_mem_desc.isGroupBy() && quantile_params[col_idx]) {
      auto const q = *quantile_params[col_idx];
      // allocate for APPROX_QUANTILE only when the slot is used
      init_val = reinterpret_cast<int64_t>(row_set_mem_owner_->nullTDigest(q));
      ++init_vec_idx;
    } else {
      if (query_mem_desc.getPaddedSlotWidthBytes(col_idx) > 0) {
        CHECK_LT(init_vec_idx, init_vals.size());
        init_val = init_vals[init_vec_idx++];
      }
    }
    switch (query_mem_desc.getPaddedSlotWidthBytes(col_idx)) {
      case 1:
        *col_ptr = static_cast<int8_t>(init_val);
        break;
      case 2:
        *reinterpret_cast<int16_t*>(col_ptr) = static_cast<int16_t>(init_val);
        break;
      case 4:
        *reinterpret_cast<int32_t*>(col_ptr) = static_cast<int32_t>(init_val);
        break;
      case 8:
        *reinterpret_cast<int64_t*>(col_ptr) = init_val;
        break;
      case 0:
        continue;
      default:
        CHECK(false);
    }
  }
}



void QueryMemoryInitializer::initGroupByBuffer ( int64_t *  buffer,
const RelAlgExecutionUnit &  ra_exe_unit,
const QueryMemoryDescriptor &  query_mem_desc,
const ExecutorDeviceType  device_type,
const bool  output_columnar,
const Executor *  executor 
)
private

Definition at line 412 of file QueryMemoryInitializer.cpp.

References streaming_top_n::get_rows_offset_of_heaps(), QueryMemoryDescriptor::getEntryCount(), GPU, init_agg_vals_, initColumnarGroups(), initRowGroups(), QueryMemoryDescriptor::interleavedBins(), SortInfo::limit, anonymous_namespace{Utm.h}::n, SortInfo::offset, RelAlgExecutionUnit::sort_info, and QueryMemoryDescriptor::useStreamingTopN().

Referenced by QueryMemoryInitializer().

 {
  if (output_columnar) {
    initColumnarGroups(query_mem_desc, buffer, init_agg_vals_, executor);
  } else {
    auto rows_ptr = buffer;
    auto actual_entry_count = query_mem_desc.getEntryCount();
    const auto thread_count = device_type == ExecutorDeviceType::GPU
                                  ? executor->blockSize() * executor->gridSize()
                                  : 1;
    auto warp_size =
        query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
    if (query_mem_desc.useStreamingTopN()) {
      const auto node_count_size = thread_count * sizeof(int64_t);
      memset(rows_ptr, 0, node_count_size);
      const auto n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
      const auto rows_offset = streaming_top_n::get_rows_offset_of_heaps(n, thread_count);
      memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
      rows_ptr += rows_offset / sizeof(int64_t);
      actual_entry_count = n * thread_count;
      warp_size = 1;
    }
    initRowGroups(query_mem_desc,
                  rows_ptr,
                  init_agg_vals_,
                  actual_entry_count,
                  warp_size,
                  executor);
  }
}
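
The streaming top-N branch above carves the buffer into per-thread node counters followed by heap index space, with the group-by rows after that. A host-side sketch of the arithmetic; the offset formula here is an assumption standing in for streaming_top_n::get_rows_offset_of_heaps(), which is defined elsewhere:

#include <cstdint>
#include <cstring>
#include <vector>

// Stand-in for streaming_top_n::get_rows_offset_of_heaps(); this formula
// (per-thread node counters followed by per-thread heap index slots) is an
// assumption for illustration, not the real definition.
size_t rows_offset_of_heaps(size_t n, size_t thread_count) {
  return thread_count * sizeof(int64_t)         // one node counter per thread
         + thread_count * n * sizeof(int64_t);  // n heap slots per thread
}

int main() {
  const size_t n = 10, thread_count = 4;
  std::vector<int64_t> buf(1024, 42);
  int64_t* rows_ptr = buf.data();
  const auto node_count_size = thread_count * sizeof(int64_t);
  const auto rows_offset = rows_offset_of_heaps(n, thread_count);
  // Zero the node counters, then poison the heap slots with -1 (0xFF bytes),
  // mirroring the two memsets in initGroupByBuffer().
  std::memset(rows_ptr, 0, node_count_size);
  std::memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
  rows_ptr += rows_offset / sizeof(int64_t);  // group-by rows begin here
  (void)rows_ptr;
  return 0;
}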

void QueryMemoryInitializer::initRowGroups ( const QueryMemoryDescriptor &  query_mem_desc,
int64_t *  groups_buffer,
const std::vector< int64_t > &  init_vals,
const int32_t  groups_buffer_entry_count,
const size_t  warp_size,
const Executor *  executor 
)
private

Definition at line 448 of file QueryMemoryInitializer.cpp.

References allocateCountDistinctBuffers(), allocateTDigests(), CHECK, result_set::fill_empty_key(), ResultSet::fixupQueryMemoryDescriptor(), g_optimize_row_initialization, QueryMemoryDescriptor::getColOffInBytes(), QueryMemoryDescriptor::getEffectiveKeyWidth(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getRowSize(), QueryMemoryDescriptor::hasKeylessHash(), and initColumnsPerRow().

Referenced by initGroupByBuffer().

 {
  const size_t key_count{query_mem_desc.getGroupbyColCount()};
  const size_t row_size{query_mem_desc.getRowSize()};
  const size_t col_base_off{query_mem_desc.getColOffInBytes(0)};

  auto agg_bitmap_size = allocateCountDistinctBuffers(query_mem_desc, true, executor);
  auto quantile_params = allocateTDigests(query_mem_desc, true, executor);
  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);

  const auto query_mem_desc_fixedup =
      ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);

  auto const is_true = [](auto const& x) { return static_cast<bool>(x); };
  // no COUNT DISTINCT / APPROX_COUNT_DISTINCT / APPROX_QUANTILE state in any
  // slot; we fall back to the default implementation in those cases
  if (!std::any_of(agg_bitmap_size.begin(), agg_bitmap_size.end(), is_true) &&
      !std::any_of(quantile_params.begin(), quantile_params.end(), is_true) &&
      g_optimize_row_initialization) {
    std::vector<int8_t> sample_row(row_size - col_base_off);

    initColumnsPerRow(query_mem_desc_fixedup,
                      sample_row.data(),
                      init_vals,
                      agg_bitmap_size,
                      quantile_params);

    if (query_mem_desc.hasKeylessHash()) {
      CHECK(warp_size >= 1);
      CHECK(key_count == 1 || warp_size == 1);
      for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
        for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
             ++bin, buffer_ptr += row_size) {
          memcpy(buffer_ptr + col_base_off, sample_row.data(), sample_row.size());
        }
      }
      return;
    }

    for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
         ++bin, buffer_ptr += row_size) {
      memcpy(buffer_ptr + col_base_off, sample_row.data(), sample_row.size());
      result_set::fill_empty_key(
          buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
    }
  } else {
    if (query_mem_desc.hasKeylessHash()) {
      CHECK(warp_size >= 1);
      CHECK(key_count == 1 || warp_size == 1);
      for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
        for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
             ++bin, buffer_ptr += row_size) {
          initColumnsPerRow(query_mem_desc_fixedup,
                            &buffer_ptr[col_base_off],
                            init_vals,
                            agg_bitmap_size,
                            quantile_params);
        }
      }
      return;
    }

    for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
         ++bin, buffer_ptr += row_size) {
      result_set::fill_empty_key(
          buffer_ptr, key_count, query_mem_desc.getEffectiveKeyWidth());
      initColumnsPerRow(query_mem_desc_fixedup,
                        &buffer_ptr[col_base_off],
                        init_vals,
                        agg_bitmap_size,
                        quantile_params);
    }
  }
}
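
A self-contained sketch of the g_optimize_row_initialization fast path taken above: when no slot owns per-row allocator state (count-distinct bitmaps, t-digests), a single initialized sample row is simply replicated into every bin. The sizes below are hypothetical:

#include <cstdint>
#include <cstring>
#include <vector>

// Sketch of the fast path: initialize one sample row once, then memcpy it
// into every bin instead of re-running the per-column initializer.
int main() {
  const size_t row_size = 32, col_base_off = 8, entry_count = 1000;
  std::vector<int8_t> sample_row(row_size - col_base_off, 0x7f);  // stand-in init
  std::vector<int8_t> groups_buffer(entry_count * row_size);
  int8_t* buffer_ptr = groups_buffer.data();
  for (size_t bin = 0; bin < entry_count; ++bin, buffer_ptr += row_size) {
    std::memcpy(buffer_ptr + col_base_off, sample_row.data(), sample_row.size());
  }
  return 0;
}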

GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer ( const QueryMemoryDescriptor &  query_mem_desc,
const int8_t *  init_agg_vals_dev_ptr,
const size_t  n,
const int  device_id,
const unsigned  block_size_x,
const unsigned  grid_size_x 
)
private

Definition at line 785 of file QueryMemoryInitializer.cpp.

References Allocator::alloc(), CHECK, DeviceAllocator::copyToDevice(), device_allocator_, streaming_top_n::get_heap_size(), streaming_top_n::get_rows_offset_of_heaps(), QueryMemoryDescriptor::getEffectiveKeyWidth(), QueryMemoryDescriptor::getGroupbyColCount(), QueryMemoryDescriptor::getRowSize(), GPU, QueryMemoryDescriptor::hasKeylessHash(), init_group_by_buffer_on_device(), QueryMemoryDescriptor::lazyInitGroups(), DeviceAllocator::setDeviceMem(), UNREACHABLE, and DeviceAllocator::zeroDeviceMem().

Referenced by createAndInitializeGroupByBufferGpu().

 {
#ifdef HAVE_CUDA
  CHECK(device_allocator_);
  const auto thread_count = block_size_x * grid_size_x;
  const auto total_buff_size =
      streaming_top_n::get_heap_size(query_mem_desc.getRowSize(), n, thread_count);
  int8_t* dev_buffer = device_allocator_->alloc(total_buff_size);

  std::vector<int8_t*> dev_buffers(thread_count);

  for (size_t i = 0; i < thread_count; ++i) {
    dev_buffers[i] = dev_buffer;
  }

  auto dev_ptr = device_allocator_->alloc(thread_count * sizeof(int8_t*));
  device_allocator_->copyToDevice(
      dev_ptr, dev_buffers.data(), thread_count * sizeof(int8_t*));

  CHECK(query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU));

  device_allocator_->zeroDeviceMem(reinterpret_cast<int8_t*>(dev_buffer),
                                   thread_count * sizeof(int64_t));

  device_allocator_->setDeviceMem(
      reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
      (unsigned char)-1,
      thread_count * n * sizeof(int64_t));

  init_group_by_buffer_on_device(
      reinterpret_cast<int64_t*>(
          dev_buffer + streaming_top_n::get_rows_offset_of_heaps(n, thread_count)),
      reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
      n * thread_count,
      query_mem_desc.getGroupbyColCount(),
      query_mem_desc.getEffectiveKeyWidth(),
      query_mem_desc.getRowSize() / sizeof(int64_t),
      query_mem_desc.hasKeylessHash(),
      1,
      block_size_x,
      grid_size_x);

  return {dev_ptr, dev_buffer};
#else
  UNREACHABLE();
  return {};
#endif
}
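
For orientation, a sketch of the device buffer regions initialized above. The three size formulas are illustrative assumptions; the authoritative layout comes from streaming_top_n::get_heap_size() and get_rows_offset_of_heaps():

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Orientation sketch of the device buffer carved out by
// prepareTopNHeapsDevBuffer(); the sizes below are assumptions.
int main() {
  const size_t row_size = 32, n = 100, thread_count = 64 * 128;    // block * grid
  const size_t node_counters = thread_count * sizeof(int64_t);     // zeroDeviceMem region
  const size_t heap_indices = thread_count * n * sizeof(int64_t);  // setDeviceMem(-1) region
  const size_t heap_rows = thread_count * n * row_size;            // rows seeded from init_agg_vals
  std::printf("counters @ 0, indices @ %zu, rows @ %zu, total %zu bytes\n",
              node_counters,
              node_counters + heap_indices,
              node_counters + heap_indices + heap_rows);
  return 0;
}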

void QueryMemoryInitializer::resetResultSet ( const size_t  index)
inline

Definition at line 91 of file QueryMemoryInitializer.h.

References CHECK_LT, and result_sets_.

 {
  CHECK_LT(index, result_sets_.size());
  result_sets_[index].reset();
}
GpuGroupByBuffers QueryMemoryInitializer::setupTableFunctionGpuBuffers ( const QueryMemoryDescriptor &  query_mem_desc,
const int  device_id,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const bool  zero_initialize_buffers 
)

Definition at line 953 of file QueryMemoryInitializer.cpp.

References align_to_int64(), Allocator::alloc(), CHECK, CHECK_GT, DeviceAllocator::copyToDevice(), device_allocator_, QueryMemoryDescriptor::getBufferColSlotCount(), QueryMemoryDescriptor::getColSlotContext(), ColSlotContext::getSlotInfo(), SlotSize::logical_size, num_rows_, and DeviceAllocator::zeroDeviceMem().

 {
  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
  CHECK_GT(num_columns, size_t(0));
  size_t total_group_by_buffer_size{0};
  const auto col_slot_context = query_mem_desc.getColSlotContext();

  std::vector<size_t> col_byte_offsets;
  col_byte_offsets.reserve(num_columns);

  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
    const size_t col_width = col_slot_context.getSlotInfo(col_idx).logical_size;
    size_t group_buffer_size = num_rows_ * col_width;
    col_byte_offsets.emplace_back(total_group_by_buffer_size);
    total_group_by_buffer_size =
        align_to_int64(total_group_by_buffer_size + group_buffer_size);
  }

  int8_t* dev_buffers_allocation{nullptr};
  dev_buffers_allocation = device_allocator_->alloc(total_group_by_buffer_size);
  CHECK(dev_buffers_allocation);
  if (zero_initialize_buffers) {
    device_allocator_->zeroDeviceMem(dev_buffers_allocation, total_group_by_buffer_size);
  }

  auto dev_buffers_mem = dev_buffers_allocation;
  std::vector<int8_t*> dev_buffers(num_columns);
  for (size_t col_idx = 0; col_idx < num_columns; ++col_idx) {
    dev_buffers[col_idx] = dev_buffers_allocation + col_byte_offsets[col_idx];
  }
  auto dev_ptrs = device_allocator_->alloc(num_columns * sizeof(CUdeviceptr));
  device_allocator_->copyToDevice(
      dev_ptrs, dev_buffers.data(), num_columns * sizeof(CUdeviceptr));

  return {dev_ptrs, dev_buffers_mem, (size_t)num_rows_};
}
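
The offset loop above packs every column buffer into one flat device allocation, padding each to an int64 boundary. A host-side sketch of the same arithmetic, with hypothetical slot widths and a local stand-in for align_to_int64():

#include <cstddef>
#include <cstdint>
#include <vector>

// Local stand-in for align_to_int64(): round a byte offset up to 8 bytes.
size_t align_up_8(size_t addr) {
  return (addr + 7) & ~size_t{7};
}

int main() {
  const size_t num_rows = 1000;                      // plays the role of num_rows_
  const std::vector<size_t> logical_sizes{4, 8, 2};  // hypothetical slot widths
  std::vector<size_t> col_byte_offsets;
  size_t total = 0;
  for (auto w : logical_sizes) {
    col_byte_offsets.push_back(total);         // this column starts here
    total = align_up_8(total + num_rows * w);  // pad to int64 boundary
  }
  // total is the size of the single device allocation backing all columns;
  // col_byte_offsets holds each column's start offset within it.
  return 0;
}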

Friends And Related Function Documentation

friend class Executor
friend

Definition at line 245 of file QueryMemoryInitializer.h.

friend class QueryExecutionContext
friend

Definition at line 246 of file QueryMemoryInitializer.h.

Member Data Documentation

int8_t* QueryMemoryInitializer::count_distinct_bitmap_crt_ptr_
private
int8_t* QueryMemoryInitializer::count_distinct_bitmap_host_mem_
private
CUdeviceptr QueryMemoryInitializer::count_distinct_bitmap_mem_
private
size_t QueryMemoryInitializer::count_distinct_bitmap_mem_bytes_
private
DeviceAllocator* QueryMemoryInitializer::device_allocator_ {nullptr}
private
std::vector<int64_t> QueryMemoryInitializer::init_agg_vals_
private
size_t QueryMemoryInitializer::num_buffers_
private
const int64_t QueryMemoryInitializer::num_rows_
private
std::vector<std::unique_ptr<ResultSet> > QueryMemoryInitializer::result_sets_
private
std::vector<Data_Namespace::AbstractBuffer*> QueryMemoryInitializer::temporary_buffers_
private

Definition at line 241 of file QueryMemoryInitializer.h.

const size_t QueryMemoryInitializer::thread_idx_
private
CUdeviceptr QueryMemoryInitializer::varlen_output_buffer_
private
int8_t* QueryMemoryInitializer::varlen_output_buffer_host_ptr_
private
std::shared_ptr<VarlenOutputInfo> QueryMemoryInitializer::varlen_output_info_
private

The documentation for this class was generated from the following files: