OmniSciDB  7bf56492aa
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
QueryMemoryDescriptor Class Reference

#include <QueryMemoryDescriptor.h>

+ Collaboration diagram for QueryMemoryDescriptor:

Public Member Functions

 QueryMemoryDescriptor ()
 
 QueryMemoryDescriptor (const Executor *executor, const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const bool allow_multifrag, const bool keyless_hash, const bool interleaved_bins_on_gpu, const int32_t idx_target_as_key, const ColRangeInfo &col_range_info, const ColSlotContext &col_slot_context, const std::vector< int8_t > &group_col_widths, const int8_t group_col_compact_width, const std::vector< ssize_t > &target_groupby_indices, const size_t entry_count, const GroupByMemSharing sharing, const bool shared_mem_for_group_by, const CountDistinctDescriptors count_distinct_descriptors, const bool sort_on_gpu_hint, const bool output_columnar, const bool render_output, const bool must_use_baseline_sort, const bool use_streaming_top_n)
 
 QueryMemoryDescriptor (const Executor *executor, const size_t entry_count, const QueryDescriptionType query_desc_type, const bool is_table_function)
 
 QueryMemoryDescriptor (const QueryDescriptionType query_desc_type, const int64_t min_val, const int64_t max_val, const bool has_nulls, const std::vector< int8_t > &group_col_widths)
 
 QueryMemoryDescriptor (const TResultSetBufferDescriptor &thrift_query_memory_descriptor)
 
bool operator== (const QueryMemoryDescriptor &other) const
 
std::unique_ptr
< QueryExecutionContext >
getQueryExecutionContext (const RelAlgExecutionUnit &, const Executor *executor, const ExecutorDeviceType device_type, const ExecutorDispatchMode dispatch_mode, const int device_id, const int64_t num_rows, const std::vector< std::vector< const int8_t * >> &col_buffers, const std::vector< std::vector< uint64_t >> &frag_offsets, std::shared_ptr< RowSetMemoryOwner >, const bool output_columnar, const bool sort_on_gpu, RenderInfo *) const
 
bool countDistinctDescriptorsLogicallyEmpty () const
 
const Executor * getExecutor () const
 
QueryDescriptionType getQueryDescriptionType () const
 
void setQueryDescriptionType (const QueryDescriptionType val)
 
bool isSingleColumnGroupByWithPerfectHash () const
 
bool hasKeylessHash () const
 
void setHasKeylessHash (const bool val)
 
bool hasInterleavedBinsOnGpu () const
 
void setHasInterleavedBinsOnGpu (const bool val)
 
int32_t getTargetIdxForKey () const
 
void setTargetIdxForKey (const int32_t val)
 
int8_t groupColWidth (const size_t key_idx) const
 
size_t getPrependedGroupColOffInBytes (const size_t group_idx) const
 
size_t getPrependedGroupBufferSizeInBytes () const
 
const auto groupColWidthsBegin () const
 
const auto groupColWidthsEnd () const
 
void clearGroupColWidths ()
 
bool isGroupBy () const
 
void setGroupColCompactWidth (const int8_t val)
 
size_t getColCount () const
 
size_t getSlotCount () const
 
const int8_t getPaddedSlotWidthBytes (const size_t slot_idx) const
 
const int8_t getLogicalSlotWidthBytes (const size_t slot_idx) const
 
const int8_t getSlotIndexForSingleSlotCol (const size_t col_idx) const
 
size_t getPaddedColWidthForRange (const size_t offset, const size_t range) const
 
void useConsistentSlotWidthSize (const int8_t slot_width_size)
 
size_t getRowWidth () const
 
int8_t updateActualMinByteWidth (const int8_t actual_min_byte_width) const
 
void addColSlotInfo (const std::vector< std::tuple< int8_t, int8_t >> &slots_for_col)
 
void clearSlotInfo ()
 
void alignPaddedSlots ()
 
ssize_t getTargetGroupbyIndex (const size_t target_idx) const
 
void setAllTargetGroupbyIndices (std::vector< ssize_t > group_by_indices)
 
size_t targetGroupbyIndicesSize () const
 
size_t targetGroupbyNegativeIndicesSize () const
 
void clearTargetGroupbyIndices ()
 
size_t getEntryCount () const
 
void setEntryCount (const size_t val)
 
int64_t getMinVal () const
 
int64_t getMaxVal () const
 
int64_t getBucket () const
 
bool hasNulls () const
 
GroupByMemSharing getGpuMemSharing () const
 
const CountDistinctDescriptor & getCountDistinctDescriptor (const size_t idx) const
 
size_t getCountDistinctDescriptorsSize () const
 
bool sortOnGpu () const
 
bool canOutputColumnar () const
 
bool didOutputColumnar () const
 
void setOutputColumnar (const bool val)
 
bool useStreamingTopN () const
 
bool isLogicalSizedColumnsAllowed () const
 
bool mustUseBaselineSort () const
 
bool forceFourByteFloat () const
 
void setForceFourByteFloat (const bool val)
 
size_t getGroupbyColCount () const
 
size_t getKeyCount () const
 
size_t getBufferColSlotCount () const
 
size_t getBufferSizeBytes (const RelAlgExecutionUnit &ra_exe_unit, const unsigned thread_count, const ExecutorDeviceType device_type) const
 
size_t getBufferSizeBytes (const ExecutorDeviceType device_type) const
 
size_t getBufferSizeBytes (const ExecutorDeviceType device_type, const size_t override_entry_count) const
 
const ColSlotContext & getColSlotContext () const
 
bool usesGetGroupValueFast () const
 
bool blocksShareMemory () const
 
bool threadsShareMemory () const
 
bool lazyInitGroups (const ExecutorDeviceType) const
 
bool interleavedBins (const ExecutorDeviceType) const
 
size_t sharedMemBytes (const ExecutorDeviceType) const
 
size_t getColOffInBytes (const size_t col_idx) const
 
size_t getColOffInBytesInNextBin (const size_t col_idx) const
 
size_t getNextColOffInBytes (const int8_t *col_ptr, const size_t bin, const size_t col_idx) const
 
size_t getColOnlyOffInBytes (const size_t col_idx) const
 
size_t getRowSize () const
 
size_t getColsSize () const
 
size_t getWarpCount () const
 
size_t getCompactByteWidth () const
 
size_t getEffectiveKeyWidth () const
 
bool isWarpSyncRequired (const ExecutorDeviceType) const
 
std::string toString () const
 
std::string reductionKey () const
 

Static Public Member Functions

static TResultSetBufferDescriptor toThrift (const QueryMemoryDescriptor &)
 
static std::unique_ptr
< QueryMemoryDescriptor >
init (const Executor *executor, const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const ColRangeInfo &col_range_info, const KeylessInfo &keyless_info, const bool allow_multifrag, const ExecutorDeviceType device_type, const int8_t crt_min_byte_width, const bool sort_on_gpu_hint, const size_t shard_count, const size_t max_groups_buffer_entry_count, RenderInfo *render_info, const CountDistinctDescriptors count_distinct_descriptors, const bool must_use_baseline_sort, const bool output_columnar_hint, const bool streaming_top_n_hint)
 
static bool many_entries (const int64_t max_val, const int64_t min_val, const int64_t bucket)
 
static bool countDescriptorsLogicallyEmpty (const CountDistinctDescriptors &count_distinct_descriptors)
 
static int8_t pick_target_compact_width (const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const int8_t crt_min_byte_width)
 

Protected Member Functions

void resetGroupColWidths (const std::vector< int8_t > &new_group_col_widths)
 

Private Member Functions

size_t getTotalBytesOfColumnarBuffers () const
 
size_t getTotalBytesOfColumnarBuffers (const size_t num_entries_per_column) const
 
size_t getTotalBytesOfColumnarProjections (const size_t projection_count) const
 

Private Attributes

const Executor * executor_
 
bool allow_multifrag_
 
QueryDescriptionType query_desc_type_
 
bool keyless_hash_
 
bool interleaved_bins_on_gpu_
 
int32_t idx_target_as_key_
 
std::vector< int8_t > group_col_widths_
 
int8_t group_col_compact_width_
 
std::vector< ssize_t > target_groupby_indices_
 
size_t entry_count_
 
int64_t min_val_
 
int64_t max_val_
 
int64_t bucket_
 
bool has_nulls_
 
GroupByMemSharing sharing_
 
CountDistinctDescriptors count_distinct_descriptors_
 
bool sort_on_gpu_
 
bool output_columnar_
 
bool render_output_
 
bool must_use_baseline_sort_
 
bool is_table_function_
 
bool use_streaming_top_n_
 
bool force_4byte_float_
 
ColSlotContext col_slot_context_
 

Friends

class ResultSet
 
class QueryExecutionContext
 

Detailed Description

Definition at line 74 of file QueryMemoryDescriptor.h.

Constructor & Destructor Documentation

QueryMemoryDescriptor::QueryMemoryDescriptor ( )

Definition at line 526 of file QueryMemoryDescriptor.cpp.

References Projection, and Shared.

527  : executor_(nullptr)
528  , allow_multifrag_(false)
530  , keyless_hash_(false)
531  , interleaved_bins_on_gpu_(false)
532  , idx_target_as_key_(0)
534  , entry_count_(0)
535  , min_val_(0)
536  , max_val_(0)
537  , bucket_(0)
538  , has_nulls_(false)
540  , sort_on_gpu_(false)
541  , output_columnar_(false)
542  , render_output_(false)
543  , must_use_baseline_sort_(false)
544  , is_table_function_(false)
545  , use_streaming_top_n_(false)
546  , force_4byte_float_(false) {}
QueryDescriptionType query_desc_type_
QueryMemoryDescriptor::QueryMemoryDescriptor ( const Executor *  executor,
const RelAlgExecutionUnit &  ra_exe_unit,
const std::vector< InputTableInfo > &  query_infos,
const bool  allow_multifrag,
const bool  keyless_hash,
const bool  interleaved_bins_on_gpu,
const int32_t  idx_target_as_key,
const ColRangeInfo &  col_range_info,
const ColSlotContext &  col_slot_context,
const std::vector< int8_t > &  group_col_widths,
const int8_t  group_col_compact_width,
const std::vector< ssize_t > &  target_groupby_indices,
const size_t  entry_count,
const GroupByMemSharing  sharing,
const bool  shared_mem_for_group_by,
const CountDistinctDescriptors  count_distinct_descriptors,
const bool  sort_on_gpu_hint,
const bool  output_columnar,
const bool  render_output,
const bool  must_use_baseline_sort,
const bool  use_streaming_top_n 
)

Definition at line 413 of file QueryMemoryDescriptor.cpp.

References canOutputColumnar(), CHECK(), col_slot_context_, count_distinct_descriptors_, countDescriptorsLogicallyEmpty(), streaming_top_n::get_heap_size(), getEntryCount(), getRowSize(), GroupByBaselineHash, GroupByPerfectHash, interleaved_bins_on_gpu_, isLogicalSizedColumnsAllowed(), keyless_hash_, NonGroupedAggregate, output_columnar_, Projection, query_desc_type_, ColSlotContext::setAllSlotsPaddedSizeToLogicalSize(), ColSlotContext::setAllUnsetSlotsPaddedSize(), SharedForKeylessOneColumnKnownRange, sharing_, sort_on_gpu_, RelAlgExecutionUnit::use_bump_allocator, use_streaming_top_n_, and ColSlotContext::validate().

435  : executor_(executor)
436  , allow_multifrag_(allow_multifrag)
437  , query_desc_type_(col_range_info.hash_type_)
438  , keyless_hash_(keyless_hash)
439  , interleaved_bins_on_gpu_(interleaved_bins_on_gpu)
440  , idx_target_as_key_(idx_target_as_key)
441  , group_col_widths_(group_col_widths)
442  , group_col_compact_width_(group_col_compact_width)
443  , target_groupby_indices_(target_groupby_indices)
444  , entry_count_(entry_count)
445  , min_val_(col_range_info.min)
446  , max_val_(col_range_info.max)
447  , bucket_(col_range_info.bucket)
448  , has_nulls_(col_range_info.has_nulls)
449  , sharing_(sharing)
450  , count_distinct_descriptors_(count_distinct_descriptors)
451  , output_columnar_(false)
452  , render_output_(render_output)
453  , must_use_baseline_sort_(must_use_baseline_sort)
454  , is_table_function_(false)
456  , force_4byte_float_(false)
457  , col_slot_context_(col_slot_context) {
460 
461  // TODO(Saman): should remove this after implementing shared memory path
462  // completely through codegen We should not use the current shared memory path if
463  // more than 8 bytes per group is required
465  shared_mem_for_group_by && (getRowSize() <= sizeof(int64_t))) {
466  // TODO(adb / saman): Move this into a different enum so we can remove
467  // GroupByMemSharing
469  interleaved_bins_on_gpu_ = false;
470  }
471 
472  // Note that output_columnar_ currently defaults to false to avoid issues with
473  // getRowSize above. If output columnar is enable then shared_mem_for_group_by is not,
474  // and the above condition would never be true.
475 
476  sort_on_gpu_ = sort_on_gpu_hint && canOutputColumnar() && !keyless_hash_;
477 
478  if (sort_on_gpu_) {
479  CHECK(!ra_exe_unit.use_bump_allocator);
480  output_columnar_ = true;
481  } else {
482  switch (query_desc_type_) {
484  output_columnar_ = output_columnar_hint;
485  break;
490  break;
492  output_columnar_ = output_columnar_hint;
493  break;
498  break;
499  default:
500  output_columnar_ = false;
501  break;
502  }
503  }
504 
506  // TODO(adb): Ensure fixed size buffer allocations are correct with all logical column
507  // sizes
508  CHECK(!ra_exe_unit.use_bump_allocator);
511  }
512 
513 #ifdef HAVE_CUDA
514  // Check Streaming Top N heap usage, bail if > 2GB (max slab size, CUDA only)
515  if (use_streaming_top_n_) {
516  const auto thread_count = executor->blockSize() * executor->gridSize();
517  const auto total_buff_size =
519  if (total_buff_size > static_cast<size_t>(1L << 31)) {
520  throw StreamingTopNOOM(total_buff_size);
521  }
522  }
523 #endif
524 }
bool isLogicalSizedColumnsAllowed() const
QueryDescriptionType hash_type_
bool use_streaming_top_n(const RelAlgExecutionUnit &ra_exe_unit, const bool output_columnar)
CHECK(cgen_state)
CountDistinctDescriptors count_distinct_descriptors_
void validate() const
QueryDescriptionType query_desc_type_
size_t get_heap_size(const size_t row_size, const size_t n, const size_t thread_count)
void setAllSlotsPaddedSizeToLogicalSize()
std::vector< int8_t > group_col_widths_
static bool countDescriptorsLogicallyEmpty(const CountDistinctDescriptors &count_distinct_descriptors)
void setAllUnsetSlotsPaddedSize(const int8_t padded_size)
std::vector< ssize_t > target_groupby_indices_

+ Here is the call graph for this function:

QueryMemoryDescriptor::QueryMemoryDescriptor ( const Executor *  executor,
const size_t  entry_count,
const QueryDescriptionType  query_desc_type,
const bool  is_table_function 
)

Definition at line 548 of file QueryMemoryDescriptor.cpp.

References Shared.

552  : executor_(executor)
553  , allow_multifrag_(false)
554  , query_desc_type_(query_desc_type)
555  , keyless_hash_(false)
556  , interleaved_bins_on_gpu_(false)
557  , idx_target_as_key_(0)
559  , entry_count_(entry_count)
560  , min_val_(0)
561  , max_val_(0)
562  , bucket_(0)
563  , has_nulls_(false)
565  , sort_on_gpu_(false)
566  , output_columnar_(false)
567  , render_output_(false)
568  , must_use_baseline_sort_(false)
569  , is_table_function_(is_table_function)
570  , use_streaming_top_n_(false)
571  , force_4byte_float_(false) {}
QueryDescriptionType query_desc_type_
QueryMemoryDescriptor::QueryMemoryDescriptor ( const QueryDescriptionType  query_desc_type,
const int64_t  min_val,
const int64_t  max_val,
const bool  has_nulls,
const std::vector< int8_t > &  group_col_widths 
)

Definition at line 573 of file QueryMemoryDescriptor.cpp.

References Shared.

578  : executor_(nullptr)
579  , allow_multifrag_(false)
580  , query_desc_type_(query_desc_type)
581  , keyless_hash_(false)
582  , interleaved_bins_on_gpu_(false)
583  , idx_target_as_key_(0)
584  , group_col_widths_(group_col_widths)
586  , entry_count_(0)
587  , min_val_(min_val)
588  , max_val_(max_val)
589  , bucket_(0)
590  , has_nulls_(false)
592  , sort_on_gpu_(false)
593  , output_columnar_(false)
594  , render_output_(false)
595  , must_use_baseline_sort_(false)
596  , is_table_function_(false)
597  , use_streaming_top_n_(false)
598  , force_4byte_float_(false) {}
QueryDescriptionType query_desc_type_
std::vector< int8_t > group_col_widths_
QueryMemoryDescriptor::QueryMemoryDescriptor ( const TResultSetBufferDescriptor &  thrift_query_memory_descriptor)

Member Function Documentation

void QueryMemoryDescriptor::addColSlotInfo ( const std::vector< std::tuple< int8_t, int8_t >> &  slots_for_col)

Definition at line 1149 of file QueryMemoryDescriptor.cpp.

References ColSlotContext::addColumn(), and col_slot_context_.

Referenced by TableFunctionExecutionContext::launchCpuCode(), and TableFunctionExecutionContext::launchGpuCode().

1150  {
1151  col_slot_context_.addColumn(slots_for_col);
1152 }
void addColumn(const std::vector< std::tuple< int8_t, int8_t >> &slots_for_col)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void QueryMemoryDescriptor::alignPaddedSlots ( )

Definition at line 1158 of file QueryMemoryDescriptor.cpp.

References ColSlotContext::alignPaddedSlots(), col_slot_context_, and sortOnGpu().

1158  {
1160 }
void alignPaddedSlots(const bool sort_on_gpu)

+ Here is the call graph for this function:

bool QueryMemoryDescriptor::blocksShareMemory ( ) const

Definition at line 1054 of file QueryMemoryDescriptor.cpp.

References bucket_, count_distinct_descriptors_, countDescriptorsLogicallyEmpty(), executor_, g_cluster, getGroupbyColCount(), GPU, GroupByBaselineHash, GroupByPerfectHash, is_table_function_, many_entries(), max_val_, min_val_, Projection, query_desc_type_, render_output_, and sharedMemBytes().

Referenced by canOutputColumnar(), ResultSetReductionJIT::codegen(), QueryMemoryInitializer::computeNumberOfBuffers(), copy_group_by_buffers_from_gpu(), create_dev_group_by_buffers(), and toString().

1054  {
1055  if (g_cluster || is_table_function_) {
1056  return true;
1057  }
1059  return true;
1060  }
1061  if (executor_->isCPUOnly() || render_output_ ||
1065  getGroupbyColCount() > 1)) {
1066  return true;
1067  }
1071 }
static bool many_entries(const int64_t max_val, const int64_t min_val, const int64_t bucket)
size_t getGroupbyColCount() const
CountDistinctDescriptors count_distinct_descriptors_
size_t sharedMemBytes(const ExecutorDeviceType) const
QueryDescriptionType query_desc_type_
bool g_cluster
static bool countDescriptorsLogicallyEmpty(const CountDistinctDescriptors &count_distinct_descriptors)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool QueryMemoryDescriptor::canOutputColumnar ( ) const

Definition at line 1162 of file QueryMemoryDescriptor.cpp.

References blocksShareMemory(), count_distinct_descriptors_, countDescriptorsLogicallyEmpty(), GPU, interleavedBins(), threadsShareMemory(), and usesGetGroupValueFast().

Referenced by QueryMemoryDescriptor().

1162  {
1166 }
CountDistinctDescriptors count_distinct_descriptors_
bool interleavedBins(const ExecutorDeviceType) const
static bool countDescriptorsLogicallyEmpty(const CountDistinctDescriptors &count_distinct_descriptors)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void QueryMemoryDescriptor::clearGroupColWidths ( )
inline

Definition at line 201 of file QueryMemoryDescriptor.h.

References group_col_widths_.

201 { group_col_widths_.clear(); }
std::vector< int8_t > group_col_widths_
void QueryMemoryDescriptor::clearSlotInfo ( )

Definition at line 1154 of file QueryMemoryDescriptor.cpp.

References ColSlotContext::clear(), and col_slot_context_.

1154  {
1156 }

+ Here is the call graph for this function:

void QueryMemoryDescriptor::clearTargetGroupbyIndices ( )
inline

Definition at line 250 of file QueryMemoryDescriptor.h.

References target_groupby_indices_.

250 { target_groupby_indices_.clear(); }
std::vector< ssize_t > target_groupby_indices_
static bool QueryMemoryDescriptor::countDescriptorsLogicallyEmpty ( const CountDistinctDescriptors &  count_distinct_descriptors)
inline static

Definition at line 156 of file QueryMemoryDescriptor.h.

References Invalid.

Referenced by blocksShareMemory(), canOutputColumnar(), countDistinctDescriptorsLogicallyEmpty(), lazyInitGroups(), and QueryMemoryDescriptor().

157  {
158  return std::all_of(count_distinct_descriptors.begin(),
159  count_distinct_descriptors.end(),
160  [](const CountDistinctDescriptor& desc) {
161  return desc.impl_type_ == CountDistinctImplType::Invalid;
162  });
163  }

+ Here is the caller graph for this function:

bool QueryMemoryDescriptor::countDistinctDescriptorsLogicallyEmpty ( ) const
inline

Definition at line 165 of file QueryMemoryDescriptor.h.

References count_distinct_descriptors_, and countDescriptorsLogicallyEmpty().

Referenced by QueryMemoryInitializer::allocateCountDistinctGpuMem().

165  {
167  }
CountDistinctDescriptors count_distinct_descriptors_
static bool countDescriptorsLogicallyEmpty(const CountDistinctDescriptors &count_distinct_descriptors)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool QueryMemoryDescriptor::didOutputColumnar ( ) const
inline

Definition at line 273 of file QueryMemoryDescriptor.h.

References output_columnar_.

Referenced by TargetExprCodegen::codegen(), ResultSetReductionJIT::codegen(), GroupByAndAggregate::codegen(), GroupByAndAggregate::codegenAggCalls(), GroupByAndAggregate::codegenAggColumnPtr(), GroupByAndAggregate::codegenGroupBy(), GroupByAndAggregate::codegenMultiColumnBaselineHash(), GroupByAndAggregate::codegenMultiColumnPerfectHash(), GroupByAndAggregate::codegenOutputSlot(), GroupByAndAggregate::codegenSingleColumnPerfectHash(), GroupByAndAggregate::codegenWindowRowPointer(), copy_projection_buffer_from_gpu_columnar(), ResultSetStorage::copyKeyColWise(), ResultSet::createComparator(), ResultSet::didOutputColumnar(), anonymous_namespace{ResultSetReduction.cpp}::fill_slots(), ResultSetStorage::fillOneEntryColWise(), ResultSetStorage::fillOneEntryRowWise(), ResultSet::fixupQueryMemoryDescriptor(), get_cols_ptr(), ResultSet::getTargetValueFromBufferColwise(), ResultSetStorage::initializeBaselineValueSlots(), anonymous_namespace{TargetExprBuilder.cpp}::is_columnar_projection(), ResultSetReductionJIT::isEmpty(), ResultSetStorage::isEmptyEntry(), ResultSetStorage::isEmptyEntryColumnar(), QueryExecutionContext::launchCpuCode(), QueryExecutionContext::launchGpuCode(), ResultSet::makeGeoTargetValue(), ResultSetStorage::moveOneEntryToBuffer(), ResultSetStorage::reduce(), ResultSetStorage::reduceOneEntryBaseline(), ResultSetReductionJIT::reduceOneEntryBaselineIdx(), ResultSetStorage::reduceOneEntrySlotsBaseline(), ResultSetStorage::reduceOneSlotBaseline(), ResultSetStorage::reduceSingleRow(), and ResultSetStorage::rewriteAggregateBufferOffsets().

+ Here is the caller graph for this function:

bool QueryMemoryDescriptor::forceFourByteFloat ( ) const
inline

Definition at line 284 of file QueryMemoryDescriptor.h.

References force_4byte_float_.

Referenced by ResultSet::makeTargetValue().

+ Here is the caller graph for this function:

int64_t QueryMemoryDescriptor::getBucket ( ) const
inline

Definition at line 257 of file QueryMemoryDescriptor.h.

References bucket_.

Referenced by GroupByAndAggregate::codegenGroupBy(), and GroupByAndAggregate::codegenSingleColumnPerfectHash().

257 { return bucket_; }

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::getBufferColSlotCount ( ) const

Definition at line 1034 of file QueryMemoryDescriptor.cpp.

References col_slot_context_, ColSlotContext::getSlotCount(), and target_groupby_indices_.

Referenced by anonymous_namespace{ResultSetIteration.cpp}::advance_col_buff_to_slot(), anonymous_namespace{ResultSetReduction.cpp}::fill_slots(), ResultSetStorage::fillOneEntryColWise(), and ResultSetStorage::fillOneEntryRowWise().

1034  {
1035  size_t total_slot_count = col_slot_context_.getSlotCount();
1036 
1037  if (target_groupby_indices_.empty()) {
1038  return total_slot_count;
1039  }
1040  return total_slot_count - std::count_if(target_groupby_indices_.begin(),
1042  [](const ssize_t i) { return i >= 0; });
1043 }
size_t getSlotCount() const
std::vector< ssize_t > target_groupby_indices_

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::getBufferSizeBytes ( const RelAlgExecutionUnit &  ra_exe_unit,
const unsigned  thread_count,
const ExecutorDeviceType  device_type 
) const

Definition at line 965 of file QueryMemoryDescriptor.cpp.

References entry_count_, streaming_top_n::get_heap_size(), getRowSize(), SortInfo::limit, SortInfo::offset, RelAlgExecutionUnit::sort_info, and use_streaming_top_n_.

Referenced by QueryMemoryInitializer::applyStreamingTopNOffsetCpu(), QueryMemoryInitializer::copyGroupByBuffersFromGpu(), create_dev_group_by_buffers(), getBufferSizeBytes(), and QueryMemoryInitializer::QueryMemoryInitializer().

968  {
969  if (use_streaming_top_n_) {
970  const size_t n = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
971  return streaming_top_n::get_heap_size(getRowSize(), n, thread_count);
972  }
973  return getBufferSizeBytes(device_type, entry_count_);
974 }
size_t getBufferSizeBytes(const RelAlgExecutionUnit &ra_exe_unit, const unsigned thread_count, const ExecutorDeviceType device_type) const
const size_t limit
const SortInfo sort_info
size_t get_heap_size(const size_t row_size, const size_t n, const size_t thread_count)
const size_t offset

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::getBufferSizeBytes ( const ExecutorDeviceType  device_type) const

Definition at line 1011 of file QueryMemoryDescriptor.cpp.

References entry_count_, and getBufferSizeBytes().

1012  {
1013  return getBufferSizeBytes(device_type, entry_count_);
1014 }
size_t getBufferSizeBytes(const RelAlgExecutionUnit &ra_exe_unit, const unsigned thread_count, const ExecutorDeviceType device_type) const

+ Here is the call graph for this function:

size_t QueryMemoryDescriptor::getBufferSizeBytes ( const ExecutorDeviceType  device_type,
const size_t  entry_count 
) const

Returns total amount of output buffer memory for each device (CPU/GPU)

Columnar: for a projection, it returns the index buffer + the columnar buffer (all non-lazy columns); for a group by, it returns the amount required for each group column (assumes 64-bit per group) + the columnar buffer (all involved agg columns).

Row-wise: returns required memory per row multiplied by number of entries

Definition at line 987 of file QueryMemoryDescriptor.cpp.

References align_to_int64(), CHECK_GE, executor_, getColsSize(), getRowSize(), getTotalBytesOfColumnarBuffers(), group_col_widths_, interleavedBins(), keyless_hash_, output_columnar_, Projection, and query_desc_type_.

988  {
990  CHECK_GE(group_col_widths_.size(), size_t(1));
991  auto row_bytes = align_to_int64(getColsSize());
992 
993  return (interleavedBins(device_type) ? executor_->warpSize() : 1) * entry_count *
994  row_bytes;
995  }
996 
997  constexpr size_t row_index_width = sizeof(int64_t);
998  size_t total_bytes{0};
999  if (output_columnar_) {
1001  ? row_index_width * entry_count
1002  : sizeof(int64_t) * group_col_widths_.size() * entry_count) +
1004  } else {
1005  total_bytes = getRowSize() * entry_count;
1006  }
1007 
1008  return total_bytes;
1009 }
#define CHECK_GE(x, y)
Definition: Logger.h:210
size_t getTotalBytesOfColumnarBuffers() const
QueryDescriptionType query_desc_type_
bool interleavedBins(const ExecutorDeviceType) const
std::vector< int8_t > group_col_widths_
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)

+ Here is the call graph for this function:

size_t QueryMemoryDescriptor::getColCount ( ) const

Definition at line 1111 of file QueryMemoryDescriptor.cpp.

References col_slot_context_, and ColSlotContext::getColCount().

1111  {
1112  return col_slot_context_.getColCount();
1113 }
size_t getColCount() const

+ Here is the call graph for this function:

size_t QueryMemoryDescriptor::getColOffInBytes ( const size_t  col_idx) const

Definition at line 861 of file QueryMemoryDescriptor.cpp.

References align_to_int64(), CHECK(), CHECK_EQ, entry_count_, getColOnlyOffInBytes(), getEffectiveKeyWidth(), getPaddedSlotWidthBytes(), getPrependedGroupBufferSizeInBytes(), getWarpCount(), group_col_widths_, GroupByPerfectHash, keyless_hash_, output_columnar_, and query_desc_type_.

Referenced by TargetExprCodegen::codegen(), GroupByAndAggregate::codegenAggColumnPtr(), GroupByAndAggregate::codegenOutputSlot(), anonymous_namespace{QueryMemoryInitializer.cpp}::compact_projection_buffer_for_cpu_columnar(), copy_projection_buffer_from_gpu_columnar(), get_cols_ptr(), QueryExecutionContext::groupBufferToDeinterleavedResults(), QueryMemoryInitializer::initGroups(), inplace_sort_gpu(), and anonymous_namespace{Execute.cpp}::permute_storage_columnar().

861  {
862  const auto warp_count = getWarpCount();
863  if (output_columnar_) {
864  CHECK_EQ(size_t(1), warp_count);
865  size_t offset{0};
866  if (!keyless_hash_) {
868  }
869  for (size_t index = 0; index < col_idx; ++index) {
871  }
872  return offset;
873  }
874 
875  size_t offset{0};
876  if (keyless_hash_) {
877  // ignore, there's no group column in the output buffer
879  } else {
880  offset += group_col_widths_.size() * getEffectiveKeyWidth();
881  offset = align_to_int64(offset);
882  }
883  offset += getColOnlyOffInBytes(col_idx);
884  return offset;
885 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
size_t getEffectiveKeyWidth() const
size_t getColOnlyOffInBytes(const size_t col_idx) const
CHECK(cgen_state)
size_t getPrependedGroupBufferSizeInBytes() const
const int8_t getPaddedSlotWidthBytes(const size_t slot_idx) const
QueryDescriptionType query_desc_type_
std::vector< int8_t > group_col_widths_
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::getColOffInBytesInNextBin ( const size_t  col_idx) const

Definition at line 920 of file QueryMemoryDescriptor.cpp.

References CHECK_EQ, getPaddedSlotWidthBytes(), getRowSize(), getWarpCount(), group_col_widths_, and output_columnar_.

Referenced by QueryExecutionContext::groupBufferToDeinterleavedResults().

920  {
921  auto warp_count = getWarpCount();
922  if (output_columnar_) {
923  CHECK_EQ(size_t(1), group_col_widths_.size());
924  CHECK_EQ(size_t(1), warp_count);
925  return getPaddedSlotWidthBytes(col_idx);
926  }
927 
928  return warp_count * getRowSize();
929 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
const int8_t getPaddedSlotWidthBytes(const size_t slot_idx) const
std::vector< int8_t > group_col_widths_

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::getColOnlyOffInBytes ( const size_t  col_idx) const

Definition at line 848 of file QueryMemoryDescriptor.cpp.

References col_slot_context_, and ColSlotContext::getColOnlyOffInBytes().

Referenced by TargetExprCodegen::codegen(), GroupByAndAggregate::codegenAggColumnPtr(), getColOffInBytes(), and ResultSetStorage::reduceSingleRow().

848  {
849  return col_slot_context_.getColOnlyOffInBytes(col_idx);
850 }
size_t getColOnlyOffInBytes(const size_t slot_idx) const

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

const ColSlotContext& QueryMemoryDescriptor::getColSlotContext ( ) const
inline

Definition at line 299 of file QueryMemoryDescriptor.h.

References col_slot_context_.

Referenced by ResultSetStorage::reduceEntriesNoCollisionsColWise(), and ResultSetReductionJIT::reduceOneEntryTargetsNoCollisions().

299 { return col_slot_context_; }

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::getColsSize ( ) const

Definition at line 787 of file QueryMemoryDescriptor.cpp.

References col_slot_context_, and ColSlotContext::getAllSlotsAlignedPaddedSize().

Referenced by getBufferSizeBytes(), getRowSize(), and QueryExecutionContext::launchCpuCode().

787  {
789 }
size_t getAllSlotsAlignedPaddedSize() const

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::getCompactByteWidth ( ) const

Definition at line 809 of file QueryMemoryDescriptor.cpp.

References col_slot_context_, and ColSlotContext::getCompactByteWidth().

Referenced by anonymous_namespace{TargetExprBuilder.cpp}::get_initial_agg_val(), and anonymous_namespace{OutputBufferInitialization.cpp}::init_agg_val_vec().

809  {
811 }
size_t getCompactByteWidth() const

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

const CountDistinctDescriptor& QueryMemoryDescriptor::getCountDistinctDescriptor ( const size_t  idx) const
inline
size_t QueryMemoryDescriptor::getCountDistinctDescriptorsSize ( ) const
inline

Definition at line 266 of file QueryMemoryDescriptor.h.

References count_distinct_descriptors_.

Referenced by QueryMemoryInitializer::allocateCountDistinctGpuMem(), anonymous_namespace{QueryMemoryInitializer.cpp}::check_total_bitmap_memory(), ResultSetReductionJIT::reduceOneCountDistinctSlot(), and ResultSetStorage::reduceOneCountDistinctSlot().

266  {
267  return count_distinct_descriptors_.size();
268  }
CountDistinctDescriptors count_distinct_descriptors_

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::getEntryCount ( ) const
inline

Definition at line 252 of file QueryMemoryDescriptor.h.

References entry_count_.

Referenced by advance_to_next_columnar_target_buff(), QueryMemoryInitializer::allocateCountDistinctGpuMem(), QueryMemoryInitializer::applyStreamingTopNOffsetCpu(), QueryMemoryInitializer::applyStreamingTopNOffsetGpu(), anonymous_namespace{QueryMemoryInitializer.cpp}::check_total_bitmap_memory(), ResultSetReductionJIT::codegen(), GroupByAndAggregate::codegenMultiColumnBaselineHash(), GroupByAndAggregate::codegenMultiColumnPerfectHash(), GroupByAndAggregate::codegenOutputSlot(), GroupByAndAggregate::codegenWindowRowPointer(), anonymous_namespace{QueryMemoryInitializer.cpp}::compact_projection_buffer_for_cpu_columnar(), QueryMemoryInitializer::compactProjectionBuffersCpu(), QueryMemoryInitializer::compactProjectionBuffersGpu(), copy_group_by_buffers_from_gpu(), create_dev_group_by_buffers(), Executor::dispatchFragments(), ResultSet::entryCount(), ResultSetStorage::fillOneEntryColWise(), ResultSetStorage::fillOneEntryRowWise(), anonymous_namespace{ResultSetReduction.cpp}::get_matching_group_value_reduction(), getPrependedGroupBufferSizeInBytes(), getPrependedGroupColOffInBytes(), ResultSet::getTargetValueFromBufferColwise(), QueryMemoryInitializer::initColumnarGroups(), ResultSetStorage::initializeBaselineValueSlots(), ResultSetStorage::initializeColWise(), ResultSetStorage::initializeRowWise(), inplace_sort_gpu(), QueryExecutionContext::launchGpuCode(), ResultSetStorage::moveEntriesToBuffer(), ResultSetStorage::moveOneEntryToBuffer(), query_group_by_template_impl(), QueryMemoryDescriptor(), QueryMemoryInitializer::QueryMemoryInitializer(), ResultSetStorage::reduce(), ResultSetStorage::reduceOneEntryBaseline(), ResultSetStorage::reduceOneEntrySlotsBaseline(), ResultSetStorage::reduceOneSlotBaseline(), and ResultSetStorage::rewriteAggregateBufferOffsets().

252 { return entry_count_; }

+ Here is the caller graph for this function:

const Executor* QueryMemoryDescriptor::getExecutor ( ) const
inline

Definition at line 174 of file QueryMemoryDescriptor.h.

References executor_.

Referenced by anonymous_namespace{Execute.cpp}::build_row_for_empty_input(), ResultSetReductionJIT::codegen(), anonymous_namespace{Execute.cpp}::fill_entries_for_empty_input(), ResultSet::getVarlenOrderEntry(), ResultSet::makeGeoTargetValue(), and ResultSet::makeVarlenTargetValue().

174 { return executor_; }

+ Here is the caller graph for this function:

GroupByMemSharing QueryMemoryDescriptor::getGpuMemSharing ( ) const
inline

Definition at line 260 of file QueryMemoryDescriptor.h.

References sharing_.

Referenced by TargetExprCodegen::codegen(), and query_group_by_template_impl().

260 { return sharing_; }

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::getKeyCount ( ) const
inline

Definition at line 289 of file QueryMemoryDescriptor.h.

References getGroupbyColCount(), and keyless_hash_.

Referenced by anonymous_namespace{Execute.cpp}::permute_storage_columnar().

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

const int8_t QueryMemoryDescriptor::getLogicalSlotWidthBytes ( const size_t  slot_idx) const

Definition at line 1123 of file QueryMemoryDescriptor.cpp.

References col_slot_context_, ColSlotContext::getSlotInfo(), and SlotSize::logical_size.

Referenced by QueryMemoryInitializer::allocateCountDistinctBuffers(), TargetExprCodegen::codegen(), and ResultSet::getTargetValueFromBufferRowwise().

1124  {
1125  return col_slot_context_.getSlotInfo(slot_idx).logical_size;
1126 }
int8_t logical_size
const SlotSize & getSlotInfo(const size_t slot_idx) const

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int64_t QueryMemoryDescriptor::getMaxVal ( ) const
inline

Definition at line 256 of file QueryMemoryDescriptor.h.

References max_val_.

Referenced by GroupByAndAggregate::codegenGroupBy().

256 { return max_val_; }

+ Here is the caller graph for this function:

int64_t QueryMemoryDescriptor::getMinVal ( ) const
inline

Definition at line 255 of file QueryMemoryDescriptor.h.

References min_val_.

Referenced by GroupByAndAggregate::codegenSingleColumnPerfectHash().

255 { return min_val_; }

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::getNextColOffInBytes ( const int8_t *  col_ptr,
const size_t  bin,
const size_t  col_idx 
) const

Definition at line 931 of file QueryMemoryDescriptor.cpp.

References align_to_int64(), CHECK(), CHECK_EQ, entry_count_, getPaddedSlotWidthBytes(), getSlotCount(), getWarpCount(), group_col_widths_, and output_columnar_.

Referenced by QueryMemoryInitializer::initColumnPerRow().

933  {
935  size_t offset{0};
936  auto warp_count = getWarpCount();
937  const auto chosen_bytes = getPaddedSlotWidthBytes(col_idx);
938  const auto total_slot_count = getSlotCount();
939  if (col_idx + 1 == total_slot_count) {
940  if (output_columnar_) {
941  return (entry_count_ - bin) * chosen_bytes;
942  } else {
943  return static_cast<size_t>(align_to_int64(col_ptr + chosen_bytes) - col_ptr);
944  }
945  }
946 
947  const auto next_chosen_bytes = getPaddedSlotWidthBytes(col_idx + 1);
948  if (output_columnar_) {
949  CHECK_EQ(size_t(1), group_col_widths_.size());
950  CHECK_EQ(size_t(1), warp_count);
951 
952  offset = align_to_int64(entry_count_ * chosen_bytes);
953 
954  offset += bin * (next_chosen_bytes - chosen_bytes);
955  return offset;
956  }
957 
958  if (next_chosen_bytes == sizeof(int64_t)) {
959  return static_cast<size_t>(align_to_int64(col_ptr + chosen_bytes) - col_ptr);
960  } else {
961  return chosen_bytes;
962  }
963 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
CHECK(cgen_state)
const int8_t getPaddedSlotWidthBytes(const size_t slot_idx) const
std::vector< int8_t > group_col_widths_
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::getPaddedColWidthForRange ( const size_t  offset,
const size_t  range 
) const
inline

Definition at line 215 of file QueryMemoryDescriptor.h.

References getPaddedSlotWidthBytes().

Referenced by get_byteoff_of_slot(), and ResultSet::makeGeoTargetValue().

215  {
216  size_t ret = 0;
217  for (size_t i = offset; i < offset + range; i++) {
218  ret += static_cast<size_t>(getPaddedSlotWidthBytes(i));
219  }
220  return ret;
221  }
const int8_t getPaddedSlotWidthBytes(const size_t slot_idx) const

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

const int8_t QueryMemoryDescriptor::getPaddedSlotWidthBytes ( const size_t  slot_idx) const

Definition at line 1119 of file QueryMemoryDescriptor.cpp.

References col_slot_context_, ColSlotContext::getSlotInfo(), and SlotSize::padded_size.

Referenced by advance_target_ptr_row_wise(), advance_to_next_columnar_target_buff(), TargetExprCodegen::codegen(), GroupByAndAggregate::codegenOutputSlot(), compact_init_vals(), anonymous_namespace{QueryMemoryInitializer.cpp}::compact_projection_buffer_for_cpu_columnar(), copy_projection_buffer_from_gpu_columnar(), ResultSet::copyColumnIntoBuffer(), Executor::executePlanWithoutGroupBy(), get_width_for_slot(), getColOffInBytes(), getColOffInBytesInNextBin(), getNextColOffInBytes(), getPaddedColWidthForRange(), ResultSet::getPaddedSlotWidthBytes(), ResultSet::getTargetValueFromBufferColwise(), ResultSet::getTargetValueFromBufferRowwise(), anonymous_namespace{OutputBufferInitialization.cpp}::init_agg_val_vec(), QueryMemoryInitializer::initColumnarGroups(), QueryMemoryInitializer::initColumnPerRow(), inplace_sort_gpu(), ResultSetReductionJIT::isEmpty(), ResultSetStorage::isEmptyEntry(), ResultSetStorage::isEmptyEntryColumnar(), ResultSet::makeGeoTargetValue(), TargetExprCodegenBuilder::operator()(), anonymous_namespace{Execute.cpp}::permute_storage_columnar(), ResultSetStorage::reduceEntriesNoCollisionsColWise(), ResultSetReductionJIT::reduceOneAggregateSlot(), ResultSetReductionJIT::reduceOneEntryTargetsNoCollisions(), ResultSetStorage::reduceOneSlot(), ResultSetStorage::reduceSingleRow(), and ResultSetStorage::rewriteAggregateBufferOffsets().

1119  {
1120  return col_slot_context_.getSlotInfo(slot_idx).padded_size;
1121 }
const SlotSize & getSlotInfo(const size_t slot_idx) const
int8_t padded_size

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::getPrependedGroupBufferSizeInBytes ( ) const

Definition at line 909 of file QueryMemoryDescriptor.cpp.

References align_to_int64(), CHECK(), getEntryCount(), getGroupbyColCount(), groupColWidth(), and output_columnar_.

Referenced by getColOffInBytes().

909  {
911  size_t buffer_size{0};
912  for (size_t group_idx = 0; group_idx < getGroupbyColCount(); group_idx++) {
913  buffer_size += align_to_int64(
914  std::max(groupColWidth(group_idx), static_cast<int8_t>(sizeof(int64_t))) *
915  getEntryCount());
916  }
917  return buffer_size;
918 }
int8_t groupColWidth(const size_t key_idx) const
CHECK(cgen_state)
size_t getGroupbyColCount() const
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::getPrependedGroupColOffInBytes ( const size_t  group_idx) const

Definition at line 891 of file QueryMemoryDescriptor.cpp.

References align_to_int64(), CHECK(), getEntryCount(), getGroupbyColCount(), groupColWidth(), and output_columnar_.

Referenced by ResultSetStorage::copyKeyColWise(), ResultSetStorage::isEmptyEntryColumnar(), and anonymous_namespace{Execute.cpp}::permute_storage_columnar().

892  {
894  CHECK(group_idx < getGroupbyColCount());
895  size_t offset{0};
896  for (size_t col_idx = 0; col_idx < group_idx; col_idx++) {
897  // TODO(Saman): relax that int64_bit part immediately
898  offset += align_to_int64(
899  std::max(groupColWidth(col_idx), static_cast<int8_t>(sizeof(int64_t))) *
900  getEntryCount());
901  }
902  return offset;
903 }
int8_t groupColWidth(const size_t key_idx) const
CHECK(cgen_state)
size_t getGroupbyColCount() const
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

QueryDescriptionType QueryMemoryDescriptor::getQueryDescriptionType ( ) const
inline

Definition at line 176 of file QueryMemoryDescriptor.h.

References query_desc_type_.

Referenced by ResultSetReductionJIT::codegen(), GroupByAndAggregate::codegen(), GroupByAndAggregate::codegenAggCalls(), GroupByAndAggregate::codegenAggColumnPtr(), GroupByAndAggregate::codegenGroupBy(), GroupByAndAggregate::codegenMultiColumnPerfectHash(), GroupByAndAggregate::codegenOutputSlot(), Executor::collectAllDeviceResults(), copy_projection_buffer_from_gpu_columnar(), Executor::dispatchFragments(), ResultSet::getQueryDescriptionType(), init_agg_val_vec(), anonymous_namespace{TargetExprBuilder.cpp}::is_columnar_projection(), ResultSetReductionJIT::isEmpty(), ResultSetStorage::isEmptyEntry(), ResultSetStorage::isEmptyEntryColumnar(), isSingleColumnGroupByWithPerfectHash(), QueryExecutionContext::launchCpuCode(), QueryExecutionContext::launchGpuCode(), ResultSetStorage::moveEntriesToBuffer(), TargetExprCodegenBuilder::operator()(), ResultSetStorage::reduce(), Executor::reduceMultiDeviceResultSets(), ResultSetStorage::reduceOneEntryBaseline(), ResultSetReductionJIT::reduceOneEntryBaselineIdx(), ResultSetReductionJIT::reduceOneEntryNoCollisionsIdx(), Executor::ExecutionDispatch::run(), Executor::ExecutionDispatch::runImpl(), target_exprs_to_infos(), and ResultSet::updateStorageEntryCount().

176 { return query_desc_type_; }
QueryDescriptionType query_desc_type_

+ Here is the caller graph for this function:

std::unique_ptr< QueryExecutionContext > QueryMemoryDescriptor::getQueryExecutionContext ( const RelAlgExecutionUnit ra_exe_unit,
const Executor executor,
const ExecutorDeviceType  device_type,
const ExecutorDispatchMode  dispatch_mode,
const int  device_id,
const int64_t  num_rows,
const std::vector< std::vector< const int8_t * >> &  col_buffers,
const std::vector< std::vector< uint64_t >> &  frag_offsets,
std::shared_ptr< RowSetMemoryOwner row_set_mem_owner,
const bool  output_columnar,
const bool  sort_on_gpu,
RenderInfo render_info 
) const

Definition at line 667 of file QueryMemoryDescriptor.cpp.

References DEBUG_TIMER, and QueryExecutionContext.

Referenced by Executor::ExecutionDispatch::runImpl().

679  {
680  auto timer = DEBUG_TIMER(__func__);
681  if (frag_offsets.empty()) {
682  return nullptr;
683  }
684  return std::unique_ptr<QueryExecutionContext>(
685  new QueryExecutionContext(ra_exe_unit,
686  *this,
687  executor,
688  device_type,
689  dispatch_mode,
690  device_id,
691  num_rows,
692  col_buffers,
693  frag_offsets,
694  row_set_mem_owner,
695  output_columnar,
696  sort_on_gpu,
697  render_info));
698 }
const int8_t const int64_t * num_rows
#define DEBUG_TIMER(name)
Definition: Logger.h:313
void sort_on_gpu(int64_t *val_buff, int32_t *idx_buff, const uint64_t entry_count, const bool desc, const uint32_t chosen_bytes, ThrustAllocator &alloc)

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::getRowSize ( ) const

Definition at line 791 of file QueryMemoryDescriptor.cpp.

References align_to_int64(), CHECK(), getColsSize(), getEffectiveKeyWidth(), group_col_widths_, GroupByPerfectHash, keyless_hash_, output_columnar_, and query_desc_type_.

Referenced by QueryMemoryInitializer::applyStreamingTopNOffsetCpu(), QueryMemoryInitializer::applyStreamingTopNOffsetGpu(), GroupByAndAggregate::codegenGroupBy(), GroupByAndAggregate::codegenOutputSlot(), GroupByAndAggregate::codegenWindowRowPointer(), QueryMemoryInitializer::copyGroupByBuffersFromGpu(), create_dev_group_by_buffers(), getBufferSizeBytes(), getColOffInBytesInNextBin(), QueryMemoryInitializer::initGroups(), anonymous_namespace{Execute.cpp}::permute_storage_row_wise(), QueryMemoryDescriptor(), QueryMemoryInitializer::QueryMemoryInitializer(), ResultSetStorage::reduceSingleRow(), and sharedMemBytes().

791  {
793  size_t total_bytes{0};
794  if (keyless_hash_) {
795  // ignore, there's no group column in the output buffer
797  } else {
798  total_bytes += group_col_widths_.size() * getEffectiveKeyWidth();
799  total_bytes = align_to_int64(total_bytes);
800  }
801  total_bytes += getColsSize();
802  return align_to_int64(total_bytes);
803 }
size_t getEffectiveKeyWidth() const
CHECK(cgen_state)
QueryDescriptionType query_desc_type_
std::vector< int8_t > group_col_widths_
FORCE_INLINE HOST DEVICE T align_to_int64(T addr)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::getRowWidth ( ) const

Definition at line 1139 of file QueryMemoryDescriptor.cpp.

References col_slot_context_, and ColSlotContext::getAllSlotsPaddedSize().

Referenced by get_row_bytes().

1139  {
1140  // Note: Actual row size may include padding (see ResultSetBufferAccessors.h)
1142 }
size_t getAllSlotsPaddedSize() const

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::getSlotCount ( ) const
const int8_t QueryMemoryDescriptor::getSlotIndexForSingleSlotCol ( const size_t  col_idx) const

Definition at line 1128 of file QueryMemoryDescriptor.cpp.

References CHECK_EQ, col_slot_context_, and ColSlotContext::getSlotsForCol().

Referenced by QueryMemoryInitializer::allocateCountDistinctBuffers().

1129  {
1130  const auto& col_slots = col_slot_context_.getSlotsForCol(col_idx);
1131  CHECK_EQ(col_slots.size(), size_t(1));
1132  return col_slots.front();
1133 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
const std::vector< size_t > & getSlotsForCol(const size_t col_idx) const

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

ssize_t QueryMemoryDescriptor::getTargetGroupbyIndex ( const size_t  target_idx) const
inline

Definition at line 234 of file QueryMemoryDescriptor.h.

References CHECK_LT, and target_groupby_indices_.

Referenced by ResultSet::getTargetValueFromBufferColwise(), ResultSet::getTargetValueFromBufferRowwise(), ResultSetReductionJIT::reduceOneEntryBaseline(), ResultSetStorage::reduceOneEntrySlotsBaseline(), ResultSetReductionJIT::reduceOneEntryTargetsNoCollisions(), ResultSetReductionJIT::reduceOneSlot(), ResultSetStorage::reduceOneSlot(), and reductionKey().

234  {
235  CHECK_LT(target_idx, target_groupby_indices_.size());
236  return target_groupby_indices_[target_idx];
237  }
#define CHECK_LT(x, y)
Definition: Logger.h:207
std::vector< ssize_t > target_groupby_indices_

+ Here is the caller graph for this function:

int32_t QueryMemoryDescriptor::getTargetIdxForKey ( ) const
inline

Definition at line 189 of file QueryMemoryDescriptor.h.

References idx_target_as_key_.

Referenced by ResultSetReductionJIT::isEmpty(), ResultSetStorage::isEmptyEntry(), ResultSetStorage::isEmptyEntryColumnar(), query_group_by_template_impl(), ResultSetStorage::reduceSingleRow(), and reductionKey().

189 { return idx_target_as_key_; }

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::getTotalBytesOfColumnarBuffers ( ) const
private

Returns the maximum total number of bytes (including any required padding) needed to store the results of all non-lazy columns in the columnar case.

Definition at line 818 of file QueryMemoryDescriptor.cpp.

References CHECK(), col_slot_context_, entry_count_, ColSlotContext::getTotalBytesOfColumnarBuffers(), and output_columnar_.

Referenced by getBufferSizeBytes(), and getTotalBytesOfColumnarProjections().

818  {
821 }
CHECK(cgen_state)
size_t getTotalBytesOfColumnarBuffers(const size_t entry_count) const

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::getTotalBytesOfColumnarBuffers ( const size_t  num_entries_per_column) const
private

This is a helper function that returns the total number of bytes (including any required padding) needed to store the results of all non-lazy columns in the columnar case.

Definition at line 827 of file QueryMemoryDescriptor.cpp.

References col_slot_context_, and ColSlotContext::getTotalBytesOfColumnarBuffers().

828  {
829  return col_slot_context_.getTotalBytesOfColumnarBuffers(num_entries_per_column);
830 }
size_t getTotalBytesOfColumnarBuffers(const size_t entry_count) const

+ Here is the call graph for this function:

size_t QueryMemoryDescriptor::getTotalBytesOfColumnarProjections ( const size_t  projection_count) const
private

Returns the effective total number of bytes from columnar projections, which includes: 1) the total number of bytes used to store all non-lazy columns, and 2) the total number of bytes used to store row indices (for lazy fetches, etc.).

NOTE: this function does not represent the buffer sizes dedicated to the results, but rather the memory required to copy all valid results into a new compact buffer (with no holes in it).

Definition at line 841 of file QueryMemoryDescriptor.cpp.

References getTotalBytesOfColumnarBuffers().

842  {
843  constexpr size_t row_index_width = sizeof(int64_t);
844  return getTotalBytesOfColumnarBuffers(projection_count) +
845  row_index_width * projection_count;
846 }
size_t getTotalBytesOfColumnarBuffers() const

+ Here is the call graph for this function:

size_t QueryMemoryDescriptor::getWarpCount ( ) const

Definition at line 805 of file QueryMemoryDescriptor.cpp.

References executor_, and interleaved_bins_on_gpu_.

Referenced by getColOffInBytes(), getColOffInBytesInNextBin(), and getNextColOffInBytes().

805  {
806  return (interleaved_bins_on_gpu_ ? executor_->warpSize() : 1);
807 }

+ Here is the caller graph for this function:

int8_t QueryMemoryDescriptor::groupColWidth ( const size_t  key_idx) const
inline

Definition at line 192 of file QueryMemoryDescriptor.h.

References CHECK_LT, and group_col_widths_.

Referenced by ResultSetStorage::copyKeyColWise(), getPrependedGroupBufferSizeInBytes(), getPrependedGroupColOffInBytes(), ResultSetStorage::isEmptyEntryColumnar(), and anonymous_namespace{Execute.cpp}::permute_storage_columnar().

192  {
193  CHECK_LT(key_idx, group_col_widths_.size());
194  return group_col_widths_[key_idx];
195  }
#define CHECK_LT(x, y)
Definition: Logger.h:207
std::vector< int8_t > group_col_widths_

+ Here is the caller graph for this function:

const auto QueryMemoryDescriptor::groupColWidthsBegin ( ) const
inline

Definition at line 199 of file QueryMemoryDescriptor.h.

References group_col_widths_.

199 { return group_col_widths_.begin(); }
std::vector< int8_t > group_col_widths_
const auto QueryMemoryDescriptor::groupColWidthsEnd ( ) const
inline

Definition at line 200 of file QueryMemoryDescriptor.h.

References group_col_widths_.

200 { return group_col_widths_.end(); }
std::vector< int8_t > group_col_widths_
bool QueryMemoryDescriptor::hasInterleavedBinsOnGpu ( ) const
inline

Definition at line 186 of file QueryMemoryDescriptor.h.

References interleaved_bins_on_gpu_.

bool QueryMemoryDescriptor::hasNulls ( ) const
inline

Definition at line 259 of file QueryMemoryDescriptor.h.

References has_nulls_.

Referenced by GroupByAndAggregate::codegenGroupBy().

259 { return has_nulls_; }

+ Here is the caller graph for this function:

std::unique_ptr< QueryMemoryDescriptor > QueryMemoryDescriptor::init ( const Executor executor,
const RelAlgExecutionUnit ra_exe_unit,
const std::vector< InputTableInfo > &  query_infos,
const ColRangeInfo col_range_info,
const KeylessInfo keyless_info,
const bool  allow_multifrag,
const ExecutorDeviceType  device_type,
const int8_t  crt_min_byte_width,
const bool  sort_on_gpu_hint,
const size_t  shard_count,
const size_t  max_groups_buffer_entry_count,
RenderInfo render_info,
const CountDistinctDescriptors  count_distinct_descriptors,
const bool  must_use_baseline_sort,
const bool  output_columnar_hint,
const bool  streaming_top_n_hint 
)
static

Definition at line 180 of file QueryMemoryDescriptor.cpp.

References get_col_byte_widths(), and RelAlgExecutionUnit::groupby_exprs.

196  {
197  auto group_col_widths = get_col_byte_widths(ra_exe_unit.groupby_exprs, {});
198  const bool is_group_by{!group_col_widths.empty()};
199 
200  auto col_slot_context = ColSlotContext(ra_exe_unit.target_exprs, {});
201 
202  const auto min_slot_size = QueryMemoryDescriptor::pick_target_compact_width(
203  ra_exe_unit, query_infos, crt_min_byte_width);
204 
205  col_slot_context.setAllSlotsPaddedSize(min_slot_size);
206  col_slot_context.validate();
207 
208  if (!is_group_by) {
209  CHECK(!must_use_baseline_sort);
210 
211  return std::make_unique<QueryMemoryDescriptor>(
212  executor,
213  ra_exe_unit,
214  query_infos,
215  allow_multifrag,
216  false,
217  false,
218  -1,
219  ColRangeInfo{ra_exe_unit.estimator ? QueryDescriptionType::Estimator
221  0,
222  0,
223  0,
224  false},
225  col_slot_context,
226  std::vector<int8_t>{},
227  /*group_col_compact_width=*/0,
228  std::vector<ssize_t>{},
229  /*entry_count=*/1,
231  false,
232  count_distinct_descriptors,
233  false,
234  output_columnar_hint,
235  render_info && render_info->isPotentialInSituRender(),
236  must_use_baseline_sort,
237  /*use_streaming_top_n=*/false);
238  }
239 
240  size_t entry_count = 1;
241  auto actual_col_range_info = col_range_info;
242  auto sharing = GroupByMemSharing::Shared;
243  bool interleaved_bins_on_gpu = false;
244  bool keyless_hash = false;
245  bool shared_mem_for_group_by = false;
246  bool streaming_top_n = false;
247  int8_t group_col_compact_width = 0;
248  int32_t idx_target_as_key = -1;
249  auto output_columnar = output_columnar_hint;
250  std::vector<ssize_t> target_groupby_indices;
251 
252  switch (col_range_info.hash_type_) {
254  if (render_info) {
255  render_info->setInSituDataIfUnset(false);
256  }
257  // keyless hash: whether or not group columns are stored at the beginning of the
258  // output buffer
259  keyless_hash =
260  (!sort_on_gpu_hint ||
262  col_range_info.max, col_range_info.min, col_range_info.bucket)) &&
263  !col_range_info.bucket && !must_use_baseline_sort && keyless_info.keyless;
264 
265  // if keyless, then this target index indicates whether an entry is empty or not
266  // (acts as a key)
267  idx_target_as_key = keyless_info.target_index;
268 
269  if (group_col_widths.size() > 1) {
270  // col range info max contains the expected cardinality of the output
271  entry_count = static_cast<size_t>(actual_col_range_info.max);
272  actual_col_range_info.bucket = 0;
273  } else {
274  // single column perfect hash
275  entry_count = std::max(
276  GroupByAndAggregate::getBucketedCardinality(col_range_info), int64_t(1));
277  const size_t interleaved_max_threshold{512};
278 
279  size_t gpu_smem_max_threshold{0};
280  if (device_type == ExecutorDeviceType::GPU) {
281  const auto cuda_mgr = executor->getCatalog()->getDataMgr().getCudaMgr();
282  CHECK(cuda_mgr);
283  /*
284  * We only use shared memory strategy if GPU hardware provides native shared
285  *memory atomics support. From CUDA Toolkit documentation:
286  *https://docs.nvidia.com/cuda/pascal-tuning-guide/index.html#atomic-ops "Like
287  *Maxwell, Pascal [and Volta] provides native shared memory atomic operations
288  *for 32-bit integer arithmetic, along with native 32 or 64-bit compare-and-swap
289  *(CAS)."
290  *
291  **/
292  if (cuda_mgr->isArchMaxwellOrLaterForAll()) {
293  // TODO(Saman): threshold should be eventually set as an optimized policy per
294  // architecture.
295  gpu_smem_max_threshold =
296  std::min((cuda_mgr->isArchVoltaForAll()) ? 4095LU : 2047LU,
297  (cuda_mgr->getMaxSharedMemoryForAll() / sizeof(int64_t) - 1));
298  }
299  }
300 
301  if (must_use_baseline_sort) {
302  target_groupby_indices = target_expr_group_by_indices(ra_exe_unit.groupby_exprs,
303  ra_exe_unit.target_exprs);
304  col_slot_context =
305  ColSlotContext(ra_exe_unit.target_exprs, target_groupby_indices);
306  }
307 
308  const auto group_expr = ra_exe_unit.groupby_exprs.front().get();
309  shared_mem_for_group_by =
310  g_enable_smem_group_by && keyless_hash && keyless_info.shared_mem_support &&
311  (entry_count <= gpu_smem_max_threshold) &&
314  count_distinct_descriptors) &&
315  !output_columnar; // TODO(Saman): add columnar support with the new smem
316  // support.
317 
318  bool has_varlen_sample_agg = false;
319  for (const auto& target_expr : ra_exe_unit.target_exprs) {
320  if (target_expr->get_contains_agg()) {
321  const auto agg_expr = dynamic_cast<Analyzer::AggExpr*>(target_expr);
322  CHECK(agg_expr);
323  if (agg_expr->get_aggtype() == kSAMPLE &&
324  agg_expr->get_type_info().is_varlen()) {
325  has_varlen_sample_agg = true;
326  break;
327  }
328  }
329  }
330 
331  interleaved_bins_on_gpu = keyless_hash && !has_varlen_sample_agg &&
332  (entry_count <= interleaved_max_threshold) &&
333  (device_type == ExecutorDeviceType::GPU) &&
335  count_distinct_descriptors) &&
336  !output_columnar;
337  }
338  break;
339  }
341  if (render_info) {
342  render_info->setInSituDataIfUnset(false);
343  }
344  entry_count = shard_count
345  ? (max_groups_buffer_entry_count + shard_count - 1) / shard_count
346  : max_groups_buffer_entry_count;
347  target_groupby_indices = target_expr_group_by_indices(ra_exe_unit.groupby_exprs,
348  ra_exe_unit.target_exprs);
349  col_slot_context = ColSlotContext(ra_exe_unit.target_exprs, target_groupby_indices);
350 
351  group_col_compact_width =
352  output_columnar ? 8
353  : pick_baseline_key_width(ra_exe_unit, query_infos, executor);
354 
355  actual_col_range_info =
357  break;
358  }
360  CHECK(!must_use_baseline_sort);
361 
362  if (streaming_top_n_hint && use_streaming_top_n(ra_exe_unit, output_columnar)) {
363  streaming_top_n = true;
364  entry_count = ra_exe_unit.sort_info.offset + ra_exe_unit.sort_info.limit;
365  } else {
366  if (ra_exe_unit.use_bump_allocator) {
367  output_columnar = false;
368  entry_count = 0;
369  } else {
370  entry_count = ra_exe_unit.scan_limit
371  ? static_cast<size_t>(ra_exe_unit.scan_limit)
372  : max_groups_buffer_entry_count;
373  }
374  }
375 
376  const auto catalog = executor->getCatalog();
377  CHECK(catalog);
378  target_groupby_indices = executor->plan_state_->allow_lazy_fetch_
379  ? target_expr_proj_indices(ra_exe_unit, *catalog)
380  : std::vector<ssize_t>{};
381 
382  col_slot_context = ColSlotContext(ra_exe_unit.target_exprs, target_groupby_indices);
383  break;
384  }
385  default:
386  UNREACHABLE() << "Unknown query type";
387  }
388 
389  return std::make_unique<QueryMemoryDescriptor>(
390  executor,
391  ra_exe_unit,
392  query_infos,
393  allow_multifrag,
394  keyless_hash,
395  interleaved_bins_on_gpu,
396  idx_target_as_key,
397  actual_col_range_info,
398  col_slot_context,
399  group_col_widths,
400  group_col_compact_width,
401  target_groupby_indices,
402  entry_count,
403  sharing,
404  shared_mem_for_group_by,
405  count_distinct_descriptors,
406  sort_on_gpu_hint,
407  output_columnar,
408  render_info && render_info->isPotentialInSituRender(),
409  must_use_baseline_sort,
410  streaming_top_n);
411 }
std::vector< Analyzer::Expr * > target_exprs
static bool many_entries(const int64_t max_val, const int64_t min_val, const int64_t bucket)
bool g_enable_smem_group_by
static int64_t getBucketedCardinality(const ColRangeInfo &col_range_info)
const bool shared_mem_support
const bool keyless
std::vector< int8_t > get_col_byte_widths(const T &col_expr_list, const std::vector< ssize_t > &col_exprs_to_not_project)
bool setInSituDataIfUnset(const bool is_in_situ_data)
Definition: RenderInfo.cpp:95
QueryDescriptionType hash_type_
#define UNREACHABLE()
Definition: Logger.h:241
static bool supportedExprForGpuSharedMemUsage(Analyzer::Expr *expr)
bool use_streaming_top_n(const RelAlgExecutionUnit &ra_exe_unit, const bool output_columnar)
const std::list< std::shared_ptr< Analyzer::Expr > > groupby_exprs
CHECK(cgen_state)
std::vector< ssize_t > target_expr_group_by_indices(const std::list< std::shared_ptr< Analyzer::Expr >> &groupby_exprs, const std::vector< Analyzer::Expr * > &target_exprs)
static int8_t pick_target_compact_width(const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const int8_t crt_min_byte_width)
const int32_t target_index
bool isPotentialInSituRender() const
Definition: RenderInfo.cpp:61
std::vector< ssize_t > target_expr_proj_indices(const RelAlgExecutionUnit &ra_exe_unit, const Catalog_Namespace::Catalog &cat)
static bool countDescriptorsLogicallyEmpty(const CountDistinctDescriptors &count_distinct_descriptors)
int8_t pick_baseline_key_width(const RelAlgExecutionUnit &ra_exe_unit, const std::vector< InputTableInfo > &query_infos, const Executor *executor)

+ Here is the call graph for this function:

bool QueryMemoryDescriptor::interleavedBins ( const ExecutorDeviceType  device_type) const

Definition at line 1078 of file QueryMemoryDescriptor.cpp.

References GPU, and interleaved_bins_on_gpu_.

Referenced by canOutputColumnar(), GroupByAndAggregate::codegenSingleColumnPerfectHash(), getBufferSizeBytes(), QueryExecutionContext::groupBufferToResults(), and QueryMemoryInitializer::QueryMemoryInitializer().

1078  {
1079  return interleaved_bins_on_gpu_ && device_type == ExecutorDeviceType::GPU;
1080 }

+ Here is the caller graph for this function:

bool QueryMemoryDescriptor::isGroupBy ( ) const
inline

Definition at line 203 of file QueryMemoryDescriptor.h.

References group_col_widths_.

Referenced by anonymous_namespace{TargetExprBuilder.cpp}::get_initial_agg_val(), anonymous_namespace{OutputBufferInitialization.cpp}::init_agg_val_vec(), QueryMemoryInitializer::initColumnPerRow(), QueryExecutionContext::launchCpuCode(), QueryExecutionContext::launchGpuCode(), and QueryMemoryInitializer::QueryMemoryInitializer().

203 { return !group_col_widths_.empty(); }
std::vector< int8_t > group_col_widths_

+ Here is the caller graph for this function:

bool QueryMemoryDescriptor::isLogicalSizedColumnsAllowed ( ) const

Definition at line 1027 of file QueryMemoryDescriptor.cpp.

References g_cluster, output_columnar_, Projection, and query_desc_type_.

Referenced by TargetExprCodegen::codegen(), TargetExprCodegenBuilder::codegenMultiSlotSampleExpressions(), TargetExprCodegenBuilder::codegenSlotEmptyKey(), anonymous_namespace{OutputBufferInitialization.cpp}::init_agg_val_vec(), ResultSet::makeTargetValue(), QueryMemoryDescriptor(), ResultSetStorage::reduceOneSlot(), ResultSetStorage::reduceOneSlotSingleValue(), and setOutputColumnar().

1027  {
1028  // In distributed mode, result sets are serialized using rowwise iterators, so we use
1029  // consistent slot widths for now
1030  return output_columnar_ && !g_cluster &&
1031  query_desc_type_ == QueryDescriptionType::Projection;
1032 }
QueryDescriptionType query_desc_type_
bool g_cluster

+ Here is the caller graph for this function:

bool QueryMemoryDescriptor::isSingleColumnGroupByWithPerfectHash ( ) const
inline

Definition at line 178 of file QueryMemoryDescriptor.h.

References getGroupbyColCount(), getQueryDescriptionType(), and GroupByPerfectHash.

Referenced by GroupByAndAggregate::codegenGroupBy(), and ResultSet::getTargetValueFromBufferRowwise().

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool QueryMemoryDescriptor::isWarpSyncRequired ( const ExecutorDeviceType  device_type) const

Definition at line 1100 of file QueryMemoryDescriptor.cpp.

References CHECK(), executor_, and GPU.

Referenced by query_group_by_template_impl().

1101  {
1102  if (device_type != ExecutorDeviceType::GPU) {
1103  return false;
1104  } else {
1105  auto cuda_mgr = executor_->getCatalog()->getDataMgr().getCudaMgr();
1106  CHECK(cuda_mgr);
1107  return cuda_mgr->isArchVoltaForAll();
1108  }
1109 }
CHECK(cgen_state)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool QueryMemoryDescriptor::lazyInitGroups ( const ExecutorDeviceType  device_type) const

Definition at line 1073 of file QueryMemoryDescriptor.cpp.

References count_distinct_descriptors_, countDescriptorsLogicallyEmpty(), GPU, and render_output_.

Referenced by create_dev_group_by_buffers(), QueryMemoryInitializer::QueryMemoryInitializer(), and toString().

1073  {
1074  return device_type == ExecutorDeviceType::GPU && !render_output_ &&
1075  countDescriptorsLogicallyEmpty(count_distinct_descriptors_);
1076 }
CountDistinctDescriptors count_distinct_descriptors_
static bool countDescriptorsLogicallyEmpty(const CountDistinctDescriptors &count_distinct_descriptors)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

static bool QueryMemoryDescriptor::many_entries ( const int64_t  max_val,
const int64_t  min_val,
const int64_t  bucket 
)
inlinestatic

Definition at line 150 of file QueryMemoryDescriptor.h.

Referenced by blocksShareMemory().

152  {
153  return max_val - min_val > 10000 * std::max(bucket, int64_t(1));
154  }

+ Here is the caller graph for this function:

bool QueryMemoryDescriptor::mustUseBaselineSort ( ) const
inline

Definition at line 280 of file QueryMemoryDescriptor.h.

References must_use_baseline_sort_.

Referenced by GroupByAndAggregate::codegenSingleColumnPerfectHash().

+ Here is the caller graph for this function:

bool QueryMemoryDescriptor::operator== ( const QueryMemoryDescriptor &  other) const

Definition at line 600 of file QueryMemoryDescriptor.cpp.

References bucket_, col_slot_context_, count_distinct_descriptors_, force_4byte_float_, group_col_compact_width_, group_col_widths_, has_nulls_, idx_target_as_key_, interleaved_bins_on_gpu_, keyless_hash_, max_val_, min_val_, output_columnar_, query_desc_type_, sharing_, sort_on_gpu_, and target_groupby_indices_.

600  {
601  // Note that this method does not check ptr reference members (e.g. executor_) or
602  // entry_count_
603  if (query_desc_type_ != other.query_desc_type_) {
604  return false;
605  }
606  if (keyless_hash_ != other.keyless_hash_) {
607  return false;
608  }
609  if (interleaved_bins_on_gpu_ != other.interleaved_bins_on_gpu_) {
610  return false;
611  }
612  if (idx_target_as_key_ != other.idx_target_as_key_) {
613  return false;
614  }
615  if (force_4byte_float_ != other.force_4byte_float_) {
616  return false;
617  }
618  if (group_col_widths_ != other.group_col_widths_) {
619  return false;
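620  }
621  if (group_col_compact_width_ != other.group_col_compact_width_) {
622  return false;
623  }
624  if (target_groupby_indices_ != other.target_groupby_indices_) {
625  return false;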
626  }
627  if (min_val_ != other.min_val_) {
628  return false;
629  }
630  if (max_val_ != other.max_val_) {
631  return false;
632  }
633  if (bucket_ != other.bucket_) {
634  return false;
635  }
636  if (has_nulls_ != other.has_nulls_) {
637  return false;
638  }
639  if (sharing_ != other.sharing_) {
640  return false;
641  }
642  if (count_distinct_descriptors_.size() != other.count_distinct_descriptors_.size()) {
643  return false;
644  } else {
645  // Count distinct descriptors can legitimately differ in device only.
646  for (size_t i = 0; i < count_distinct_descriptors_.size(); ++i) {
647  auto ref_count_distinct_desc = other.count_distinct_descriptors_[i];
648  auto count_distinct_desc = count_distinct_descriptors_[i];
649  count_distinct_desc.device_type = ref_count_distinct_desc.device_type;
650  if (ref_count_distinct_desc != count_distinct_desc) {
651  return false;
652  }
653  }
654  }
655  if (sort_on_gpu_ != other.sort_on_gpu_) {
656  return false;
657  }
658  if (output_columnar_ != other.output_columnar_) {
659  return false;
660  }
661  if (col_slot_context_ != other.col_slot_context_) {
662  return false;
663  }
664  return true;
665 }
CountDistinctDescriptors count_distinct_descriptors_
QueryDescriptionType query_desc_type_
std::vector< int8_t > group_col_widths_
std::vector< ssize_t > target_groupby_indices_
int8_t QueryMemoryDescriptor::pick_target_compact_width ( const RelAlgExecutionUnit &  ra_exe_unit,
const std::vector< InputTableInfo > &  query_infos,
const int8_t  crt_min_byte_width 
)
static

Definition at line 700 of file QueryMemoryDescriptor.cpp.

References CHECK(), CHECK_EQ, g_bigint_count, get_col_byte_widths(), Analyzer::UOper::get_operand(), Analyzer::Expr::get_type_info(), RelAlgExecutionUnit::groupby_exprs, RelAlgExecutionUnit::input_col_descs, anonymous_namespace{QueryMemoryDescriptor.cpp}::is_int_and_no_bigger_than(), kCOUNT, kENCODING_DICT, kUNNEST, and RelAlgExecutionUnit::target_exprs.

703  {
704  if (g_bigint_count) {
705  return sizeof(int64_t);
706  }
707  int8_t compact_width{0};
708  auto col_it = ra_exe_unit.input_col_descs.begin();
709  int unnest_array_col_id{std::numeric_limits<int>::min()};
710  for (const auto groupby_expr : ra_exe_unit.groupby_exprs) {
711  const auto uoper = dynamic_cast<Analyzer::UOper*>(groupby_expr.get());
712  if (uoper && uoper->get_optype() == kUNNEST) {
713  const auto& arg_ti = uoper->get_operand()->get_type_info();
714  CHECK(arg_ti.is_array());
715  const auto& elem_ti = arg_ti.get_elem_type();
716  if (elem_ti.is_string() && elem_ti.get_compression() == kENCODING_DICT) {
717  unnest_array_col_id = (*col_it)->getColId();
718  } else {
719  compact_width = crt_min_byte_width;
720  break;
721  }
722  }
723  ++col_it;
724  }
725  if (!compact_width &&
726  (ra_exe_unit.groupby_exprs.size() != 1 || !ra_exe_unit.groupby_exprs.front())) {
727  compact_width = crt_min_byte_width;
728  }
729  if (!compact_width) {
730  col_it = ra_exe_unit.input_col_descs.begin();
731  std::advance(col_it, ra_exe_unit.groupby_exprs.size());
732  for (const auto target : ra_exe_unit.target_exprs) {
733  const auto& ti = target->get_type_info();
734  const auto agg = dynamic_cast<const Analyzer::AggExpr*>(target);
735  if (agg && agg->get_arg()) {
736  compact_width = crt_min_byte_width;
737  break;
738  }
739 
740  if (agg) {
741  CHECK_EQ(kCOUNT, agg->get_aggtype());
742  CHECK(!agg->get_is_distinct());
743  ++col_it;
744  continue;
745  }
746 
747  if (is_int_and_no_bigger_than(ti, 4) ||
748  (ti.is_string() && ti.get_compression() == kENCODING_DICT)) {
749  ++col_it;
750  continue;
751  }
752 
753  const auto uoper = dynamic_cast<Analyzer::UOper*>(target);
754  if (uoper && uoper->get_optype() == kUNNEST &&
755  (*col_it)->getColId() == unnest_array_col_id) {
756  const auto arg_ti = uoper->get_operand()->get_type_info();
757  CHECK(arg_ti.is_array());
758  const auto& elem_ti = arg_ti.get_elem_type();
759  if (elem_ti.is_string() && elem_ti.get_compression() == kENCODING_DICT) {
760  ++col_it;
761  continue;
762  }
763  }
764 
765  compact_width = crt_min_byte_width;
766  break;
767  }
768  }
769  if (!compact_width) {
770  size_t total_tuples{0};
771  for (const auto& qi : query_infos) {
772  total_tuples += qi.info.getNumTuples();
773  }
774  return total_tuples <= static_cast<size_t>(std::numeric_limits<uint32_t>::max()) ||
775  unnest_array_col_id != std::numeric_limits<int>::min()
776  ? 4
777  : crt_min_byte_width;
778  } else {
779  // TODO(miyu): relax this condition to allow more cases just w/o padding
780  for (auto wid : get_col_byte_widths(ra_exe_unit.target_exprs, {})) {
781  compact_width = std::max(compact_width, wid);
782  }
783  return compact_width;
784  }
785 }
std::vector< Analyzer::Expr * > target_exprs
#define CHECK_EQ(x, y)
Definition: Logger.h:205
std::vector< int8_t > get_col_byte_widths(const T &col_expr_list, const std::vector< ssize_t > &col_exprs_to_not_project)
const std::list< std::shared_ptr< Analyzer::Expr > > groupby_exprs
CHECK(cgen_state)
bool g_bigint_count
const SQLTypeInfo & get_type_info() const
Definition: Analyzer.h:78
const Expr * get_operand() const
Definition: Analyzer.h:365
Definition: sqldefs.h:76
bool is_int_and_no_bigger_than(const SQLTypeInfo &ti, const size_t byte_width)
std::list< std::shared_ptr< const InputColDescriptor > > input_col_descs

+ Here is the call graph for this function:

std::string QueryMemoryDescriptor::reductionKey ( ) const

Definition at line 1215 of file QueryMemoryDescriptor.cpp.

References anonymous_namespace{QueryMemoryDescriptor.cpp}::boolToString(), col_slot_context_, getEffectiveKeyWidth(), getGroupbyColCount(), getTargetGroupbyIndex(), getTargetIdxForKey(), join(), keyless_hash_, query_desc_type_, anonymous_namespace{QueryMemoryDescriptor.cpp}::queryDescTypeToString(), targetGroupbyIndicesSize(), to_string(), and ColSlotContext::toString().

Referenced by ResultSetReductionJIT::cacheKey(), and toString().

1215  {
1216  std::string str;
1217  str += "Query Memory Descriptor State\n";
1218  str += "\tQuery Type: " + queryDescTypeToString(query_desc_type_) + "\n";
1219  str +=
1220  "\tKeyless Hash: " + boolToString(keyless_hash_) +
1221  (keyless_hash_ ? ", target index for key: " + std::to_string(getTargetIdxForKey())
1222  : "") +
1223  "\n";
1224  str += "\tEffective key width: " + std::to_string(getEffectiveKeyWidth()) + "\n";
1225  str += "\tNumber of group columns: " + std::to_string(getGroupbyColCount()) + "\n";
1226  const auto group_indices_size = targetGroupbyIndicesSize();
1227  if (group_indices_size) {
1228  std::vector<std::string> group_indices_strings;
1229  for (size_t target_idx = 0; target_idx < group_indices_size; ++target_idx) {
1230  group_indices_strings.push_back(std::to_string(getTargetGroupbyIndex(target_idx)));
1231  }
1232  str += "\tTarget group by indices: " +
1233  boost::algorithm::join(group_indices_strings, ",") + "\n";
1234  }
1235  str += "\t" + col_slot_context_.toString();
1236  return str;
1237 }
std::string join(T const &container, std::string const &delim)
size_t getEffectiveKeyWidth() const
std::string to_string(char const *&&v)
std::string queryDescTypeToString(const QueryDescriptionType val)
size_t getGroupbyColCount() const
size_t targetGroupbyIndicesSize() const
ssize_t getTargetGroupbyIndex(const size_t target_idx) const
QueryDescriptionType query_desc_type_
std::string toString() const
int32_t getTargetIdxForKey() const

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void QueryMemoryDescriptor::resetGroupColWidths ( const std::vector< int8_t > &  new_group_col_widths)
inlineprotected

Definition at line 336 of file QueryMemoryDescriptor.h.

References group_col_widths_.

336  {
337  group_col_widths_ = new_group_col_widths;
338  }
std::vector< int8_t > group_col_widths_
void QueryMemoryDescriptor::setAllTargetGroupbyIndices ( std::vector< ssize_t >  group_by_indices)
inline

Definition at line 239 of file QueryMemoryDescriptor.h.

References target_groupby_indices_.

239  {
240  target_groupby_indices_ = group_by_indices;
241  }
std::vector< ssize_t > target_groupby_indices_
void QueryMemoryDescriptor::setEntryCount ( const size_t  val)
inline

Definition at line 253 of file QueryMemoryDescriptor.h.

References entry_count_.

Referenced by Executor::reduceMultiDeviceResultSets(), ResultSetStorage::updateEntryCount(), and ResultSet::updateStorageEntryCount().

253 { entry_count_ = val; }

+ Here is the caller graph for this function:

void QueryMemoryDescriptor::setForceFourByteFloat ( const bool  val)
inline

Definition at line 285 of file QueryMemoryDescriptor.h.

References force_4byte_float_.

void QueryMemoryDescriptor::setGroupColCompactWidth ( const int8_t  val)
inline

Definition at line 205 of file QueryMemoryDescriptor.h.

References group_col_compact_width_.

void QueryMemoryDescriptor::setHasInterleavedBinsOnGpu ( const bool  val)
inline

Definition at line 187 of file QueryMemoryDescriptor.h.

References interleaved_bins_on_gpu_.

void QueryMemoryDescriptor::setHasKeylessHash ( const bool  val)
inline

Definition at line 184 of file QueryMemoryDescriptor.h.

References keyless_hash_.

void QueryMemoryDescriptor::setOutputColumnar ( const bool  val)

Definition at line 1016 of file QueryMemoryDescriptor.cpp.

References col_slot_context_, isLogicalSizedColumnsAllowed(), output_columnar_, and ColSlotContext::setAllSlotsPaddedSizeToLogicalSize().

Referenced by TableFunctionExecutionContext::launchCpuCode(), and TableFunctionExecutionContext::launchGpuCode().

1016  {
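1017  output_columnar_ = val;
1018  if (isLogicalSizedColumnsAllowed()) {
1019  col_slot_context_.setAllSlotsPaddedSizeToLogicalSize();
1020  }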
1021 }
bool isLogicalSizedColumnsAllowed() const
void setAllSlotsPaddedSizeToLogicalSize()

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void QueryMemoryDescriptor::setQueryDescriptionType ( const QueryDescriptionType  val)
inline

Definition at line 177 of file QueryMemoryDescriptor.h.

References query_desc_type_.

177 { query_desc_type_ = val; }
QueryDescriptionType query_desc_type_
void QueryMemoryDescriptor::setTargetIdxForKey ( const int32_t  val)
inline

Definition at line 190 of file QueryMemoryDescriptor.h.

References idx_target_as_key_.

size_t QueryMemoryDescriptor::sharedMemBytes ( const ExecutorDeviceType  device_type) const

Definition at line 1082 of file QueryMemoryDescriptor.cpp.

References CHECK(), CHECK_EQ, CPU, entry_count_, executor_, getRowSize(), GPU, SharedForKeylessOneColumnKnownRange, and sharing_.

Referenced by blocksShareMemory(), QueryExecutionContext::launchGpuCode(), and query_group_by_template_impl().

1082  {
1083  CHECK(device_type == ExecutorDeviceType::CPU || device_type == ExecutorDeviceType::GPU);
1084  if (device_type == ExecutorDeviceType::CPU) {
1085  return 0;
1086  }
1087  // if performing keyless aggregate query with a single column group-by:
1088  if (sharing_ == GroupByMemSharing::SharedForKeylessOneColumnKnownRange) {
1089  CHECK_EQ(getRowSize(),
1090  sizeof(int64_t)); // Currently just designed for this scenario
1091  size_t shared_mem_size =
1092  (/*bin_count=*/entry_count_ + 1) * sizeof(int64_t); // one extra for NULL values
1093  CHECK(shared_mem_size <=
1094  executor_->getCatalog()->getDataMgr().getCudaMgr()->getMaxSharedMemoryForAll());
1095  return shared_mem_size;
1096  }
1097  return 0;
1098 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
CHECK(cgen_state)

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool QueryMemoryDescriptor::sortOnGpu ( ) const
inline

Definition at line 270 of file QueryMemoryDescriptor.h.

References sort_on_gpu_.

Referenced by alignPaddedSlots(), QueryExecutionContext::launchGpuCode(), Executor::ExecutionDispatch::runImpl(), and use_speculative_top_n().

+ Here is the caller graph for this function:

size_t QueryMemoryDescriptor::targetGroupbyIndicesSize ( ) const
inline
size_t QueryMemoryDescriptor::targetGroupbyNegativeIndicesSize ( ) const
inline

Definition at line 244 of file QueryMemoryDescriptor.h.

References target_groupby_indices_.

244  {
245  return std::count_if(
246  target_groupby_indices_.begin(),
247  target_groupby_indices_.end(),
248  [](const ssize_t& target_group_by_index) { return target_group_by_index < 0; });
249  }
std::vector< ssize_t > target_groupby_indices_
bool QueryMemoryDescriptor::threadsShareMemory ( ) const
std::string QueryMemoryDescriptor::toString ( ) const

Definition at line 1194 of file QueryMemoryDescriptor.cpp.

References allow_multifrag_, blocksShareMemory(), anonymous_namespace{QueryMemoryDescriptor.cpp}::boolToString(), bucket_, entry_count_, GPU, interleaved_bins_on_gpu_, lazyInitGroups(), max_val_, min_val_, must_use_baseline_sort_, output_columnar_, reductionKey(), render_output_, sort_on_gpu_, threadsShareMemory(), to_string(), use_streaming_top_n_, and usesGetGroupValueFast().

Referenced by Executor::dispatchFragments().

1194  {
1195  auto str = reductionKey();
1196  str += "\tAllow Multifrag: " + boolToString(allow_multifrag_) + "\n";
1197  str += "\tInterleaved Bins on GPU: " + boolToString(interleaved_bins_on_gpu_) + "\n";
1198  str += "\tBlocks Share Memory: " + boolToString(blocksShareMemory()) + "\n";
1199  str += "\tThreads Share Memory: " + boolToString(threadsShareMemory()) + "\n";
1200  str += "\tUses Fast Group Values: " + boolToString(usesGetGroupValueFast()) + "\n";
1201  str += "\tLazy Init Groups (GPU): " +
1202  boolToString(lazyInitGroups(ExecutorDeviceType::GPU)) + "\n";
1203  str += "\tEntry Count: " + std::to_string(entry_count_) + "\n";
1204  str += "\tMin Val (perfect hash only): " + std::to_string(min_val_) + "\n";
1205  str += "\tMax Val (perfect hash only): " + std::to_string(max_val_) + "\n";
1206  str += "\tBucket Val (perfect hash only): " + std::to_string(bucket_) + "\n";
1207  str += "\tSort on GPU: " + boolToString(sort_on_gpu_) + "\n";
1208  str += "\tUse Streaming Top N: " + boolToString(use_streaming_top_n_) + "\n";
1209  str += "\tOutput Columnar: " + boolToString(output_columnar_) + "\n";
1210  str += "\tRender Output: " + boolToString(render_output_) + "\n";
1211  str += "\tUse Baseline Sort: " + boolToString(must_use_baseline_sort_) + "\n";
1212  return str;
1213 }
std::string to_string(char const *&&v)
bool lazyInitGroups(const ExecutorDeviceType) const
std::string reductionKey() const

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

static TResultSetBufferDescriptor QueryMemoryDescriptor::toThrift ( const QueryMemoryDescriptor & )
static
int8_t QueryMemoryDescriptor::updateActualMinByteWidth ( const int8_t  actual_min_byte_width) const

Definition at line 1144 of file QueryMemoryDescriptor.cpp.

References col_slot_context_, and ColSlotContext::getMinPaddedByteSize().

1145  {
1146  return col_slot_context_.getMinPaddedByteSize(actual_min_byte_width);
1147 }
int8_t getMinPaddedByteSize(const int8_t actual_min_byte_width) const

+ Here is the call graph for this function:

void QueryMemoryDescriptor::useConsistentSlotWidthSize ( const int8_t  slot_width_size)

Definition at line 1135 of file QueryMemoryDescriptor.cpp.

References col_slot_context_, and ColSlotContext::setAllSlotsSize().

1135  {
1136  col_slot_context_.setAllSlotsSize(slot_width_size);
1137 }
void setAllSlotsSize(const int8_t slot_width_size)

+ Here is the call graph for this function:

bool QueryMemoryDescriptor::usesGetGroupValueFast ( ) const

Definition at line 1045 of file QueryMemoryDescriptor.cpp.

References getGroupbyColCount(), GroupByPerfectHash, and query_desc_type_.

Referenced by canOutputColumnar(), GroupByAndAggregate::codegen(), GroupByAndAggregate::codegenSingleColumnPerfectHash(), and toString().

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool QueryMemoryDescriptor::useStreamingTopN ( ) const
inline

Definition at line 276 of file QueryMemoryDescriptor.h.

References use_streaming_top_n_.

Referenced by GroupByAndAggregate::codegen(), GroupByAndAggregate::codegenOutputSlot(), QueryMemoryInitializer::copyGroupByBuffersFromGpu(), QueryExecutionContext::launchCpuCode(), QueryExecutionContext::launchGpuCode(), and QueryMemoryInitializer::QueryMemoryInitializer().

+ Here is the caller graph for this function:

Friends And Related Function Documentation

friend class QueryExecutionContext
friend

Definition at line 379 of file QueryMemoryDescriptor.h.

Referenced by getQueryExecutionContext().

friend class ResultSet
friend

Definition at line 378 of file QueryMemoryDescriptor.h.

Member Data Documentation

bool QueryMemoryDescriptor::allow_multifrag_
private

Definition at line 342 of file QueryMemoryDescriptor.h.

Referenced by toString().

int64_t QueryMemoryDescriptor::bucket_
private

Definition at line 359 of file QueryMemoryDescriptor.h.

Referenced by blocksShareMemory(), getBucket(), operator==(), and toString().

size_t QueryMemoryDescriptor::entry_count_
private
const Executor* QueryMemoryDescriptor::executor_
private
bool QueryMemoryDescriptor::force_4byte_float_
private

Definition at line 370 of file QueryMemoryDescriptor.h.

Referenced by forceFourByteFloat(), operator==(), and setForceFourByteFloat().

int8_t QueryMemoryDescriptor::group_col_compact_width_
private
bool QueryMemoryDescriptor::has_nulls_
private

Definition at line 360 of file QueryMemoryDescriptor.h.

Referenced by hasNulls(), and operator==().

int32_t QueryMemoryDescriptor::idx_target_as_key_
private

Definition at line 346 of file QueryMemoryDescriptor.h.

Referenced by getTargetIdxForKey(), operator==(), and setTargetIdxForKey().

bool QueryMemoryDescriptor::interleaved_bins_on_gpu_
private
bool QueryMemoryDescriptor::is_table_function_
private

Definition at line 367 of file QueryMemoryDescriptor.h.

Referenced by blocksShareMemory().

bool QueryMemoryDescriptor::keyless_hash_
private
int64_t QueryMemoryDescriptor::max_val_
private

Definition at line 358 of file QueryMemoryDescriptor.h.

Referenced by blocksShareMemory(), getMaxVal(), operator==(), and toString().

int64_t QueryMemoryDescriptor::min_val_
private

Definition at line 356 of file QueryMemoryDescriptor.h.

Referenced by blocksShareMemory(), getMinVal(), operator==(), and toString().

bool QueryMemoryDescriptor::must_use_baseline_sort_
private

Definition at line 366 of file QueryMemoryDescriptor.h.

Referenced by mustUseBaselineSort(), and toString().

bool QueryMemoryDescriptor::render_output_
private

Definition at line 365 of file QueryMemoryDescriptor.h.

Referenced by blocksShareMemory(), lazyInitGroups(), and toString().

GroupByMemSharing QueryMemoryDescriptor::sharing_
private
bool QueryMemoryDescriptor::sort_on_gpu_
private

Definition at line 363 of file QueryMemoryDescriptor.h.

Referenced by operator==(), QueryMemoryDescriptor(), sortOnGpu(), and toString().

std::vector<ssize_t> QueryMemoryDescriptor::target_groupby_indices_
private
bool QueryMemoryDescriptor::use_streaming_top_n_
private

The documentation for this class was generated from the following files: