void check_total_bitmap_memory(const QueryMemoryDescriptor& query_mem_desc) {
  const int32_t groups_buffer_entry_count = query_mem_desc.getEntryCount();
  // ...
  const size_t num_count_distinct_descs =
      query_mem_desc.getCountDistinctDescriptorsSize();
  for (size_t i = 0; i < num_count_distinct_descs; i++) {
    // ...
  }
  int64_t total_bytes{0};
  // ...
  total_bytes =
      static_cast<int64_t>(total_bytes_per_group * groups_buffer_entry_count);
  // ...
}

int64_t* alloc_group_by_buffer(const size_t numBytes,
                               RenderAllocatorMap* render_allocator_map,
                               RowSetMemoryOwner* mem_owner) {
  if (render_allocator_map) {
    // ...
    const auto gpu_idx = 0;
    const auto render_allocator_ptr = render_allocator_map->getRenderAllocator(gpu_idx);
    return reinterpret_cast<int64_t*>(render_allocator_ptr->alloc(numBytes));
  }
  return reinterpret_cast<int64_t*>(mem_owner->allocate(numBytes));
}
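// Illustrative sketch (not part of this file): the allocation choice made by
// alloc_group_by_buffer above -- take the group-by buffer from the render
// allocator when a render allocator map is present, otherwise from the row-set
// memory owner. The two structs below are reduced, hypothetical stand-ins for
// the real RenderAllocator and RowSetMemoryOwner interfaces.
#include <cstddef>
#include <cstdint>

struct RenderAllocatorSketch {
  int8_t* alloc(const size_t num_bytes) { return new int8_t[num_bytes]; }  // stand-in
};

struct MemOwnerSketch {
  int8_t* allocate(const size_t num_bytes) { return new int8_t[num_bytes]; }  // stand-in
};

int64_t* alloc_group_by_buffer_sketch(const size_t num_bytes,
                                      RenderAllocatorSketch* render_allocator,
                                      MemOwnerSketch* mem_owner) {
  if (render_allocator) {
    // Rendering path: the buffer lives in render-allocator managed memory.
    return reinterpret_cast<int64_t*>(render_allocator->alloc(num_bytes));
  }
  // Default path: the buffer is owned by the row-set memory owner.
  return reinterpret_cast<int64_t*>(mem_owner->allocate(num_bytes));
}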
int64_t get_consistent_frag_size(const std::vector<uint64_t>& frag_offsets) {
  if (frag_offsets.size() < 2) {
    return int64_t(-1);
  }
  const auto frag_size = frag_offsets[1] - frag_offsets[0];
  for (size_t i = 2; i < frag_offsets.size(); ++i) {
    const auto curr_size = frag_offsets[i] - frag_offsets[i - 1];
    if (curr_size != frag_size) {
      return int64_t(-1);
    }
  }
  return !frag_size ? std::numeric_limits<int64_t>::max()
                    : static_cast<int64_t>(frag_size);
}
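// Illustrative sketch: a standalone version of the consistent-fragment-size
// check above, returning -1 when the fragment strides differ and treating a
// zero stride as "unbounded" via the int64_t maximum. The function name is
// hypothetical.
#include <cstddef>
#include <cstdint>
#include <limits>
#include <vector>

int64_t consistent_frag_size_sketch(const std::vector<uint64_t>& frag_offsets) {
  if (frag_offsets.size() < 2) {
    return int64_t(-1);  // fewer than two fragments: no stride to speak of
  }
  const auto frag_size = frag_offsets[1] - frag_offsets[0];
  for (size_t i = 2; i < frag_offsets.size(); ++i) {
    if (frag_offsets[i] - frag_offsets[i - 1] != frag_size) {
      return int64_t(-1);  // strides differ between fragments
    }
  }
  return !frag_size ? std::numeric_limits<int64_t>::max()
                    : static_cast<int64_t>(frag_size);
}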
std::vector<int64_t> get_consistent_frags_sizes(
    const std::vector<std::vector<uint64_t>>& frag_offsets) {
  if (frag_offsets.empty()) {
    return {};
  }
  std::vector<int64_t> frag_sizes;
  for (size_t tab_idx = 0; tab_idx < frag_offsets[0].size(); ++tab_idx) {
    std::vector<uint64_t> tab_offs;
    for (auto& offsets : frag_offsets) {
      tab_offs.push_back(offsets[tab_idx]);
    }
    frag_sizes.push_back(get_consistent_frag_size(tab_offs));
  }
  return frag_sizes;
}
std::vector<int64_t> get_consistent_frags_sizes(
    const std::vector<Analyzer::Expr*>& target_exprs,
    const std::vector<int64_t>& table_frag_sizes) {
  std::vector<int64_t> col_frag_sizes;
  for (auto expr : target_exprs) {
    if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
      if (col_var->get_rte_idx() < 0) {
        CHECK_EQ(-1, col_var->get_rte_idx());
        col_frag_sizes.push_back(int64_t(-1));
      } else {
        col_frag_sizes.push_back(table_frag_sizes[col_var->get_rte_idx()]);
      }
    } else {
      col_frag_sizes.push_back(int64_t(-1));
    }
  }
  return col_frag_sizes;
}
std::vector<std::vector<int64_t>> get_col_frag_offsets(
    const std::vector<Analyzer::Expr*>& target_exprs,
    const std::vector<std::vector<uint64_t>>& table_frag_offsets) {
  std::vector<std::vector<int64_t>> col_frag_offsets;
  for (auto& table_offsets : table_frag_offsets) {
    std::vector<int64_t> col_offsets;
    for (auto expr : target_exprs) {
      if (const auto col_var = dynamic_cast<Analyzer::ColumnVar*>(expr)) {
        if (col_var->get_rte_idx() < 0) {
          CHECK_EQ(-1, col_var->get_rte_idx());
          col_offsets.push_back(int64_t(-1));
        } else {
          CHECK_LT(static_cast<size_t>(col_var->get_rte_idx()), table_offsets.size());
          col_offsets.push_back(
              static_cast<int64_t>(table_offsets[col_var->get_rte_idx()]));
        }
      } else {
        col_offsets.push_back(int64_t(-1));
      }
    }
    col_frag_offsets.push_back(col_offsets);
  }
  return col_frag_offsets;
}
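// Illustrative sketch of the convention used by the two helpers above: a
// target that is a plain column reference is mapped through its range-table
// index (rte_idx) into the per-table vector, while anything else gets the -1
// sentinel. All names here are hypothetical.
#include <cstddef>
#include <cstdint>
#include <vector>

int64_t per_table_value_for_target_sketch(const int rte_idx,
                                          const std::vector<int64_t>& per_table_values) {
  if (rte_idx < 0) {
    return int64_t(-1);  // not a physical table column: no fragment info
  }
  if (static_cast<size_t>(rte_idx) >= per_table_values.size()) {
    return int64_t(-1);  // defensive bound check for this sketch
  }
  return per_table_values[rte_idx];
}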
QueryMemoryInitializer::QueryMemoryInitializer(
    const RelAlgExecutionUnit& ra_exe_unit,
    const QueryMemoryDescriptor& query_mem_desc,
    const int device_id,
    const ExecutorDeviceType device_type,
    const ExecutorDispatchMode dispatch_mode,
    const bool output_columnar,
    const bool sort_on_gpu,
    const int64_t num_rows,
    const std::vector<std::vector<const int8_t*>>& col_buffers,
    const std::vector<std::vector<uint64_t>>& frag_offsets,
    RenderAllocatorMap* render_allocator_map,
    RenderInfo* render_info,
    std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
    DeviceAllocator* gpu_allocator,
    const Executor* executor)
    : num_rows_(num_rows)
    , row_set_mem_owner_(row_set_mem_owner)
    , init_agg_vals_(executor->plan_state_->init_agg_vals_)
    , num_buffers_(computeNumberOfBuffers(query_mem_desc, device_type, executor))
    // ... (remaining member initializers elided)
{
  CHECK(!sort_on_gpu || output_columnar);

  const auto consistent_frag_sizes = get_consistent_frags_sizes(frag_offsets);
  if (consistent_frag_sizes.empty()) {
    // ...
  }
  // ...
  if (render_allocator_map || !query_mem_desc.isGroupBy()) {
    // ...
  }
  // ...
          ? executor->blockSize() * executor->gridSize()
  // ...
  size_t group_buffer_size{0};
  // ...
    group_buffer_size = num_rows * query_mem_desc.getRowSize();
  // ...
  CHECK_GE(group_buffer_size, size_t(0));
  // ...
  int64_t* group_by_buffer_template{nullptr};
  if (!query_mem_desc.lazyInitGroups(device_type) && group_buffers_count > 1) {
    group_by_buffer_template = /* ... */;
    // ...
  }
  // ...
          ? executor->blockSize()
  // ...
  const auto actual_group_buffer_size =
      group_buffer_size + index_buffer_qw * sizeof(int64_t);
  CHECK_GE(actual_group_buffer_size, group_buffer_size);
  // ...
  for (size_t i = 0; i < group_buffers_count; i += step) {
    // ...
    if (group_by_buffer_template) {
      memcpy(group_by_buffer + index_buffer_qw,
             group_by_buffer_template,
             /* ... */);
    }
    // ...
    for (size_t j = 1; j < step; ++j) {
      // ...
    }
    // ...
    const auto column_frag_offsets = /* ... */;
    // ...
    const auto column_frag_sizes = /* ... */;
    // ...
    result_sets_.emplace_back(
        // ...
        executor->getColLazyFetchInfo(ra_exe_unit.target_exprs),
        // ...
        executor->getCatalog(),
        executor->blockSize(),
        executor->gridSize()));
    result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
                                         executor->plan_state_->init_agg_vals_);
    for (size_t j = 1; j < step; ++j) {
      // ...
    }
  }
}
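// Illustrative sketch of the sizing arithmetic in the constructor above: the
// payload is rows times the padded row width, and the allocation grows by one
// int64_t per index-buffer entry when an index prefix is requested. The helper
// name and the worked numbers are hypothetical.
#include <cstddef>
#include <cstdint>

size_t group_buffer_bytes_sketch(const size_t num_rows,
                                 const size_t row_size_bytes,
                                 const size_t index_buffer_qw) {
  const size_t payload_bytes = num_rows * row_size_bytes;
  return payload_bytes + index_buffer_qw * sizeof(int64_t);
}

// Example: 1024 rows of 32-byte entries with a 1024-slot index prefix:
// 1024 * 32 + 1024 * 8 = 40960 bytes.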
QueryMemoryInitializer::QueryMemoryInitializer(
    // ... (leading parameters elided)
    const int64_t num_rows,
    const std::vector<std::vector<const int8_t*>>& col_buffers,
    const std::vector<std::vector<uint64_t>>& frag_offsets,
    std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
    // ...
    const Executor* executor)
    : num_rows_(num_rows)
    , row_set_mem_owner_(row_set_mem_owner)
    // ... (remaining member initializers elided)
{
  // ...
  if (consistent_frag_sizes.empty()) {
    // ...
  }
  // ...
  size_t group_buffer_size{0};
  const size_t num_columns = query_mem_desc.getBufferColSlotCount();
  group_buffer_size = num_rows_ * num_columns * sizeof(int64_t);
  CHECK_GE(group_buffer_size, size_t(0));
  // ...
  const auto index_buffer_qw = /* ... */;
  // ...
  const auto actual_group_buffer_size =
      group_buffer_size + index_buffer_qw * sizeof(int64_t);
  CHECK_GE(actual_group_buffer_size, group_buffer_size);
  // ...
  auto group_by_buffer = /* ... */;
  // ...
  initColumnarGroups(
      query_mem_desc, group_by_buffer + index_buffer_qw, init_agg_vals_, executor);
  // ...
  group_by_buffers_.push_back(group_by_buffer);
  // ...
  const auto column_frag_offsets = /* ... */;
  // ...
  const auto column_frag_sizes = /* ... */;
  // ...
  result_sets_.emplace_back(
      // ...
      executor->getCatalog(),
      executor->blockSize(),
      executor->gridSize()));
  result_sets_.back()->allocateStorage(reinterpret_cast<int8_t*>(group_by_buffer),
                                       /* ... */);
}
void QueryMemoryInitializer::initGroupByBuffer(
    int64_t* buffer,
    const RelAlgExecutionUnit& ra_exe_unit,
    const QueryMemoryDescriptor& query_mem_desc,
    const ExecutorDeviceType device_type,
    const bool output_columnar,
    const Executor* executor) {
  if (output_columnar) {
    // ...
  }
  // ...
    auto rows_ptr = buffer;
    // ...
            ? executor->blockSize() * executor->gridSize()
    // ...
        query_mem_desc.interleavedBins(device_type) ? executor->warpSize() : 1;
    // ...
    const auto node_count_size = thread_count * sizeof(int64_t);
    memset(rows_ptr, 0, node_count_size);
    // ...
    memset(rows_ptr + thread_count, -1, rows_offset - node_count_size);
    rows_ptr += rows_offset / sizeof(int64_t);
    actual_entry_count = n * thread_count;
  // ...
}
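// Illustrative sketch of the streaming top-N buffer header initialized above:
// one int64_t node count per thread (zeroed), followed by the heap region
// (filled with an all-ones byte pattern as an "empty" marker), with the row
// storage starting right after the header. The helper name is hypothetical.
#include <cstddef>
#include <cstdint>
#include <cstring>

int64_t* init_top_n_header_sketch(int64_t* buffer,
                                  const size_t thread_count,
                                  const size_t rows_offset_bytes) {
  const size_t node_count_bytes = thread_count * sizeof(int64_t);
  std::memset(buffer, 0, node_count_bytes);           // per-thread node counts = 0
  std::memset(buffer + thread_count,                  // heap slots follow the counts
              -1,
              rows_offset_bytes - node_count_bytes);  // mark heap entries as empty
  return buffer + rows_offset_bytes / sizeof(int64_t);  // row storage begins here
}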
void QueryMemoryInitializer::initGroups(const QueryMemoryDescriptor& query_mem_desc,
                                        int64_t* groups_buffer,
                                        const std::vector<int64_t>& init_vals,
                                        const int32_t groups_buffer_entry_count,
                                        const size_t warp_size,
                                        const Executor* executor) {
  const size_t row_size{query_mem_desc.getRowSize()};
  // ...
  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
  // ...
  const auto query_mem_desc_fixedup =
      ResultSet::fixupQueryMemoryDescriptor(query_mem_desc);
  // ...
    CHECK(warp_size >= 1);
    CHECK(key_count == 1 || warp_size == 1);
    for (size_t warp_idx = 0; warp_idx < warp_size; ++warp_idx) {
      for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
           ++bin, buffer_ptr += row_size) {
        // ...
            &buffer_ptr[col_base_off],
        // ...
      }
    }
  // ...
  for (size_t bin = 0; bin < static_cast<size_t>(groups_buffer_entry_count);
       ++bin, buffer_ptr += row_size) {
    // ...
        &buffer_ptr[col_base_off],
    // ...
  }
}
template <typename T>
int8_t* initColumnarBuffer(T* buffer_ptr, const T init_val, const uint32_t entry_count) {
  static_assert(sizeof(T) <= sizeof(int64_t), "Unsupported template type");
  for (uint32_t i = 0; i < entry_count; ++i) {
    buffer_ptr[i] = init_val;
  }
  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
}
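// Illustrative usage sketch for the width-templated initializer above: fill a
// column of one width, then continue at the returned position with a column of
// another width, so several columns can be laid out back to back in one flat
// buffer. The standalone function below mirrors initColumnarBuffer<T>; all
// names are hypothetical.
#include <cstdint>

template <typename T>
int8_t* init_columnar_buffer_sketch(T* buffer_ptr, const T init_val, const uint32_t entry_count) {
  static_assert(sizeof(T) <= sizeof(int64_t), "Unsupported template type");
  for (uint32_t i = 0; i < entry_count; ++i) {
    buffer_ptr[i] = init_val;
  }
  return reinterpret_cast<int8_t*>(buffer_ptr + entry_count);
}

int8_t* init_two_columns_sketch(int8_t* buf, const uint32_t entry_count) {
  // A 4-byte slot initialized to 0, immediately followed by an 8-byte slot
  // initialized to -1; the returned pointer is just past both columns.
  buf = init_columnar_buffer_sketch<int32_t>(reinterpret_cast<int32_t*>(buf), 0, entry_count);
  return init_columnar_buffer_sketch<int64_t>(reinterpret_cast<int64_t*>(buf), -1, entry_count);
}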
void QueryMemoryInitializer::initColumnarGroups(
    const QueryMemoryDescriptor& query_mem_desc,
    int64_t* groups_buffer,
    const std::vector<int64_t>& init_vals,
    const Executor* executor) {
  CHECK(groups_buffer);
  for (const auto target_expr : executor->plan_state_->target_exprs_) {
    // ...
  }
  // ...
  const int32_t agg_col_count = query_mem_desc.getSlotCount();
  auto buffer_ptr = reinterpret_cast<int8_t*>(groups_buffer);
  // ...
  const auto groups_buffer_entry_count = query_mem_desc.getEntryCount();
  // ...
    for (size_t i = 0; i < key_count; ++i) {
      buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                               /* ... */,
                                               groups_buffer_entry_count);
    }
  // ...
  int32_t init_val_idx = 0;
  for (int32_t i = 0; i < agg_col_count; ++i) {
    // ...
      CHECK_LT(static_cast<size_t>(init_val_idx), init_vals.size());
      // ...
          buffer_ptr = initColumnarBuffer<int8_t>(
              buffer_ptr, init_vals[init_val_idx++], groups_buffer_entry_count);
      // ...
          buffer_ptr = initColumnarBuffer<int16_t>(reinterpret_cast<int16_t*>(buffer_ptr),
                                                   init_vals[init_val_idx++],
                                                   groups_buffer_entry_count);
      // ...
          buffer_ptr = initColumnarBuffer<int32_t>(reinterpret_cast<int32_t*>(buffer_ptr),
                                                   init_vals[init_val_idx++],
                                                   groups_buffer_entry_count);
      // ...
          buffer_ptr = initColumnarBuffer<int64_t>(reinterpret_cast<int64_t*>(buffer_ptr),
                                                   init_vals[init_val_idx++],
                                                   groups_buffer_entry_count);
      // ...
  }
}
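// Illustrative sketch of the width dispatch in the loop above: the padded byte
// width of each slot selects which instantiation fills it, and the write
// position always advances by width * entry_count. Hypothetical standalone
// helper, not the file's own switch.
#include <cstddef>
#include <cstdint>

int8_t* init_slot_by_width_sketch(int8_t* ptr,
                                  const int8_t padded_width,
                                  const int64_t init_val,
                                  const uint32_t entry_count) {
  switch (padded_width) {
    case 1:
      for (uint32_t i = 0; i < entry_count; ++i) {
        ptr[i] = static_cast<int8_t>(init_val);
      }
      break;
    case 2: {
      auto typed_ptr = reinterpret_cast<int16_t*>(ptr);
      for (uint32_t i = 0; i < entry_count; ++i) {
        typed_ptr[i] = static_cast<int16_t>(init_val);
      }
      break;
    }
    case 4: {
      auto typed_ptr = reinterpret_cast<int32_t*>(ptr);
      for (uint32_t i = 0; i < entry_count; ++i) {
        typed_ptr[i] = static_cast<int32_t>(init_val);
      }
      break;
    }
    case 8: {
      auto typed_ptr = reinterpret_cast<int64_t*>(ptr);
      for (uint32_t i = 0; i < entry_count; ++i) {
        typed_ptr[i] = init_val;
      }
      break;
    }
    default:
      break;  // widths other than 1/2/4/8 are not handled in this sketch
  }
  return ptr + static_cast<size_t>(padded_width) * entry_count;
}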
void QueryMemoryInitializer::initColumnPerRow(const QueryMemoryDescriptor& query_mem_desc,
                                              int8_t* row_ptr,
                                              const size_t bin,
                                              const std::vector<int64_t>& init_vals,
                                              const std::vector<int64_t>& bitmap_sizes,
                                              const std::vector<bool>& tdigest_deferred) {
  int8_t* col_ptr = row_ptr;
  size_t init_vec_idx = 0;
  for (size_t col_idx = 0; col_idx < query_mem_desc.getSlotCount();
       col_ptr += query_mem_desc.getNextColOffInBytes(col_ptr, bin, col_idx++)) {
    // ...
    const int64_t bm_sz{bitmap_sizes[col_idx]};
    // ...
    if (bm_sz && query_mem_desc.isGroupBy()) {
      // ...
    } else if (query_mem_desc.isGroupBy() && tdigest_deferred[col_idx]) {
      // ...
    } else {
      CHECK_LT(init_vec_idx, init_vals.size());
      init_val = init_vals[init_vec_idx++];
    }
    // ...
        *col_ptr = static_cast<int8_t>(init_val);
    // ...
        *reinterpret_cast<int16_t*>(col_ptr) = (int16_t)init_val;
    // ...
        *reinterpret_cast<int32_t*>(col_ptr) = (int32_t)init_val;
    // ...
        *reinterpret_cast<int64_t*>(col_ptr) = init_val;
    // ...
  }
}
void QueryMemoryInitializer::allocateCountDistinctGpuMem(
    const QueryMemoryDescriptor& query_mem_desc) {
  // ...
  size_t total_bytes_per_entry{0};
  const size_t num_count_distinct_descs =
      query_mem_desc.getCountDistinctDescriptorsSize();
  for (size_t i = 0; i < num_count_distinct_descs; i++) {
    const auto count_distinct_desc = query_mem_desc.getCountDistinctDescriptor(i);
    // ...
    total_bytes_per_entry += count_distinct_desc.bitmapPaddedSizeBytes();
  }
  // ...
}
std::vector<int64_t> QueryMemoryInitializer::allocateCountDistinctBuffers(
    const QueryMemoryDescriptor& query_mem_desc,
    const bool deferred,
    const Executor* executor) {
  const size_t agg_col_count{query_mem_desc.getSlotCount()};
  std::vector<int64_t> agg_bitmap_size(deferred ? agg_col_count : 0);
  // ...
  CHECK_GE(agg_col_count, executor->plan_state_->target_exprs_.size());
  for (size_t target_idx = 0; target_idx < executor->plan_state_->target_exprs_.size();
       ++target_idx) {
    const auto target_expr = executor->plan_state_->target_exprs_[target_idx];
    // ...
    CHECK(agg_info.is_agg && /* ... */);
    // ...
    CHECK(!agg_info.sql_type.is_varlen());
    // ...
    CHECK_LT(static_cast<size_t>(agg_col_idx), agg_col_count);
    // ...
    const auto& count_distinct_desc = /* ... */;
    // ...
    const auto bitmap_byte_sz = count_distinct_desc.bitmapPaddedSizeBytes();
    // ...
        agg_bitmap_size[agg_col_idx] = bitmap_byte_sz;
    // ...
        agg_bitmap_size[agg_col_idx] = -1;
    // ...
  }
  return agg_bitmap_size;
}
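// Illustrative sketch of the per-slot values produced above: a positive entry
// is the padded bitmap size in bytes, -1 requests a set-based implementation,
// and the default 0 means the slot carries no COUNT(DISTINCT) state. The enum
// and function below are hypothetical.
#include <cstdint>

enum class DistinctImplSketch { None, Bitmap, OrderedSet };

DistinctImplSketch classify_distinct_slot_sketch(const int64_t slot_value) {
  if (slot_value > 0) {
    return DistinctImplSketch::Bitmap;  // slot_value == padded bitmap bytes
  }
  return slot_value == -1 ? DistinctImplSketch::OrderedSet : DistinctImplSketch::None;
}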
int64_t QueryMemoryInitializer::allocateCountDistinctBitmap(const size_t bitmap_byte_sz) {
  // ...
        ptr, bitmap_byte_sz, false);
    return reinterpret_cast<int64_t>(ptr);
  // ...
  return reinterpret_cast<int64_t>(
      /* ... */);
}

int64_t QueryMemoryInitializer::allocateCountDistinctSet() {
  auto count_distinct_set = new std::set<int64_t>();
  // ...
  return reinterpret_cast<int64_t>(count_distinct_set);
}
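// Illustrative sketch of the two allocation paths above: a zero-filled bitmap
// of the requested padded size, or a heap-allocated std::set, each handed back
// as an int64_t handle. The real code hands these allocations to a memory
// owner for cleanup; this sketch deliberately leaks them to stay short. All
// names are hypothetical.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <set>

int64_t allocate_bitmap_handle_sketch(const size_t bitmap_byte_sz) {
  auto* bitmap = new int8_t[bitmap_byte_sz];
  std::memset(bitmap, 0, bitmap_byte_sz);  // bitmaps start out empty
  return reinterpret_cast<int64_t>(bitmap);
}

int64_t allocate_set_handle_sketch() {
  auto* count_distinct_set = new std::set<int64_t>();  // set-based COUNT(DISTINCT) state
  return reinterpret_cast<int64_t>(count_distinct_set);
}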
std::vector<bool> QueryMemoryInitializer::allocateTDigests(
    const QueryMemoryDescriptor& query_mem_desc,
    const bool deferred,
    const Executor* executor) {
  size_t const slot_count = query_mem_desc.getSlotCount();
  size_t const ntargets = executor->plan_state_->target_exprs_.size();
  // ...
  std::vector<bool> tdigest_deferred(deferred ? slot_count : 0);
  // ...
  for (size_t target_idx = 0; target_idx < ntargets; ++target_idx) {
    auto const target_expr = executor->plan_state_->target_exprs_[target_idx];
    if (auto const agg_expr = dynamic_cast<const Analyzer::AggExpr*>(target_expr)) {
      // ...
      size_t const agg_col_idx = /* ... */;
      // ...
          static_cast<int8_t>(sizeof(int64_t)));
      // ...
        tdigest_deferred[agg_col_idx] = true;
      // ...
    }
  }
  // ...
  return tdigest_deferred;
}
GpuGroupByBuffers QueryMemoryInitializer::prepareTopNHeapsDevBuffer(
    // ... (query_mem_desc, init_agg_vals_dev_ptr, n, and device_id parameters elided)
    const unsigned block_size_x,
    const unsigned grid_size_x) {
  // ...
  const auto thread_count = block_size_x * grid_size_x;
  const auto total_buff_size = /* ... */;
  // ...
  std::vector<CUdeviceptr> dev_buffers(thread_count);
  // ...
  for (size_t i = 0; i < thread_count; ++i) {
    dev_buffers[i] = dev_buffer;
  }
  // ...
      reinterpret_cast<int8_t*>(dev_buffers.data()),
  // ...
      thread_count * sizeof(int64_t));
  // ...
      reinterpret_cast<int8_t*>(dev_buffer + thread_count * sizeof(int64_t)),
  // ...
      thread_count * n * sizeof(int64_t));
  // ...
      reinterpret_cast<int64_t*>(/* ... */),
  // ...
      reinterpret_cast<int64_t*>(init_agg_vals_dev_ptr),
  // ...
      query_mem_desc.getRowSize() / sizeof(int64_t),
  // ...
  return {reinterpret_cast<CUdeviceptr>(dev_ptr), dev_buffer};
}
// (GPU group-by buffer creation and initialization; enclosing function
//  signature only partially visible in this fragment)
    // ...
    const unsigned block_size_x,
    const unsigned grid_size_x,
    const int8_t warp_size,
    const bool can_sort_on_gpu,
    const bool output_columnar,
    // ...
{
  if (render_allocator) {
    // ...
  }
  // ...
    CHECK(!output_columnar);
    // ...
    return prepareTopNHeapsDevBuffer(
        query_mem_desc, init_agg_vals_dev_ptr, n, device_id, block_size_x, grid_size_x);
  // ...
  if (render_allocator) {
    // ...
  }
  // ...
  CHECK(!render_allocator);
  // ...
  auto group_by_dev_buffer = dev_group_by_buffers.second;
  // ...
  int8_t* col_widths_dev_ptr{nullptr};
  if (output_columnar) {
    std::vector<int8_t> compact_col_widths(col_count);
    for (size_t idx = 0; idx < col_count; ++idx) {
      // ...
    }
    // ...
        col_widths_dev_ptr, compact_col_widths.data(), col_count * sizeof(int8_t));
  }
  const int8_t warp_count = /* ... */;
  // ...
  if (output_columnar) {
    init_columnar_group_by_buffer_on_device(
        reinterpret_cast<int64_t*>(group_by_dev_buffer),
        reinterpret_cast<const int64_t*>(init_agg_vals_dev_ptr),
        dev_group_by_buffers.entry_count,
        // ...
  } else {
    init_group_by_buffer_on_device(
        // ...
        reinterpret_cast<int64_t*>(init_agg_vals_dev_ptr),
        dev_group_by_buffers.entry_count,
        // ...
        query_mem_desc.getRowSize() / sizeof(int64_t),
        // ...
  }
  // ...
  group_by_dev_buffer += groups_buffer_size;
  // ...
  return dev_group_by_buffers;
}
// (per-column device output buffer setup; enclosing function signature only
//  partially visible in this fragment)
    // ...
    const unsigned block_size_x,
    const unsigned grid_size_x) {
  // ...
  const size_t column_size = num_rows_ * sizeof(int64_t);
  const size_t groups_buffer_size = num_columns * (column_size == 0 ? 1 : column_size);
  const size_t mem_size = /* ... */;
  // ...
  int8_t* dev_buffers_allocation{nullptr};
  // ...
  CHECK(dev_buffers_allocation);
  // ...
  const size_t step{block_size_x};
  const size_t num_ptrs{block_size_x * grid_size_x};
  std::vector<CUdeviceptr> dev_buffers(num_columns * num_ptrs);
  auto dev_buffer = dev_buffers_mem;
  for (size_t i = 0; i < num_ptrs; i += step) {
    for (size_t j = 0; j < step; j += 1) {
      for (size_t k = 0; k < num_columns; k++) {
        dev_buffers[(i + j) * num_columns + k] = dev_buffer + k * column_size;
      }
    }
    // ...
    dev_buffer += groups_buffer_size;
  }
  // ...
      reinterpret_cast<int8_t*>(dev_buffers.data()),
  // ...
}
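// Illustrative sketch of the pointer table built above: for each output slot
// and each column, record the column's base address inside one flat device
// allocation, where a buffer's columns are laid out back to back and the
// pointer-table entries within one step all reference the same underlying
// buffer. Plain host arithmetic with hypothetical names; the real code stores
// CUdeviceptr values instead.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uintptr_t> build_column_pointer_table_sketch(const uintptr_t base_addr,
                                                         const size_t num_ptrs,
                                                         const size_t num_columns,
                                                         const size_t column_size,
                                                         const size_t step) {
  std::vector<uintptr_t> ptrs(num_columns * num_ptrs);
  const size_t groups_buffer_size = num_columns * column_size;
  uintptr_t buffer = base_addr;
  for (size_t i = 0; i < num_ptrs; i += step) {
    for (size_t j = 0; j < step; ++j) {
      for (size_t k = 0; k < num_columns; ++k) {
        ptrs[(i + j) * num_columns + k] = buffer + k * column_size;  // column k of this buffer
      }
    }
    buffer += groups_buffer_size;  // next step gets a fresh buffer in this sketch
  }
  return ptrs;
}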
void QueryMemoryInitializer::copyFromTableFunctionGpuBuffers(
    // ...
    const size_t entry_count,
    // ...
    const unsigned block_size_x,
    const unsigned grid_size_x) {
  // ...
  const size_t column_size = entry_count * sizeof(int64_t);
  const size_t orig_column_size = gpu_group_by_buffers.entry_count * sizeof(int64_t);
  int8_t* dev_buffer = reinterpret_cast<int8_t*>(gpu_group_by_buffers.second);
  // ...
  CHECK_LE(column_size, orig_column_size);
  if (orig_column_size == column_size) {
    // ...
        reinterpret_cast<CUdeviceptr>(dev_buffer),
        column_size * num_columns,
    // ...
  } else {
    for (size_t k = 0; k < num_columns; ++k) {
      // ...
          reinterpret_cast<CUdeviceptr>(dev_buffer),
      // ...
      dev_buffer += orig_column_size;
      host_buffer += column_size;
    }
  }
}
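// Illustrative sketch of the copy strategy above: when the device column
// stride equals the requested column size, one contiguous copy suffices;
// otherwise each column is copied separately, advancing by the original
// (larger) stride on the device side and by the compact size on the host side.
// copy_bytes_sketch is a hypothetical stand-in for the real device-to-host
// transfer; here it is just a memcpy so the sketch stays self-contained.
#include <cstddef>
#include <cstdint>
#include <cstring>

static void copy_bytes_sketch(int8_t* dst, const int8_t* src, const size_t num_bytes) {
  std::memcpy(dst, src, num_bytes);  // stands in for a device-to-host transfer
}

void copy_columns_sketch(int8_t* host_buffer,
                         const int8_t* dev_buffer,
                         const size_t num_columns,
                         const size_t column_size,         // bytes needed per column
                         const size_t orig_column_size) {  // bytes per device column
  if (orig_column_size == column_size) {
    copy_bytes_sketch(host_buffer, dev_buffer, column_size * num_columns);
    return;
  }
  for (size_t k = 0; k < num_columns; ++k) {
    copy_bytes_sketch(host_buffer, dev_buffer, column_size);
    dev_buffer += orig_column_size;  // skip the unused tail of the device column
    host_buffer += column_size;      // host columns are packed tightly
  }
}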
  // ...
      : executor->blockSize() * /* ... */
  // ...

void compact_projection_buffer_for_cpu_columnar(
    const QueryMemoryDescriptor& query_mem_desc,
    int8_t* projection_buffer,
    const size_t projection_count) {
  // ...
  constexpr size_t row_index_width = sizeof(int64_t);
  size_t buffer_offset1{projection_count * row_index_width};
  // ...
    auto column_proj_size = /* ... */;
    // ...
    if (buffer_offset1 + column_proj_size >= buffer_offset2) {
      // ...
      std::memmove(projection_buffer + buffer_offset1,
                   projection_buffer + buffer_offset2,
                   /* ... */);
    } else {
      std::memcpy(projection_buffer + buffer_offset1,
                  projection_buffer + buffer_offset2,
                  /* ... */);
    }
  // ...
}
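// Illustrative sketch of the overlap test above: during compaction the
// destination lies below the source in the same buffer, so when the two ranges
// can overlap memmove is required, and when they are guaranteed disjoint
// memcpy is safe (and typically cheaper). Hypothetical standalone helper.
#include <cstddef>
#include <cstring>

void compact_column_sketch(char* buffer,
                           const size_t dst_offset,
                           const size_t src_offset,
                           const size_t column_bytes) {
  if (dst_offset + column_bytes >= src_offset) {
    std::memmove(buffer + dst_offset, buffer + src_offset, column_bytes);  // ranges may overlap
  } else {
    std::memcpy(buffer + dst_offset, buffer + src_offset, column_bytes);   // disjoint ranges
  }
}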
void QueryMemoryInitializer::compactProjectionBuffersCpu(const QueryMemoryDescriptor& query_mem_desc,
                                                         const size_t projection_count) {
  const auto num_allocated_rows = /* ... */;
  // ...
  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
}

void QueryMemoryInitializer::compactProjectionBuffersGpu(const QueryMemoryDescriptor& query_mem_desc,
                                                         Data_Namespace::DataMgr* data_mgr,
                                                         const GpuGroupByBuffers& gpu_group_by_buffers,
                                                         const size_t projection_count,
                                                         const int device_id) {
  const auto num_allocated_rows = /* ... */;
  // ...
      gpu_group_by_buffers,
  // ...
  result_sets_.front()->updateStorageEntryCount(num_allocated_rows);
}
void QueryMemoryInitializer::copyGroupByBuffersFromGpu(Data_Namespace::DataMgr* data_mgr,
                                                       const QueryMemoryDescriptor& query_mem_desc,
                                                       const size_t entry_count,
                                                       const GpuGroupByBuffers& gpu_group_by_buffers,
                                                       const RelAlgExecutionUnit* ra_exe_unit,
                                                       const unsigned block_size_x,
                                                       const unsigned grid_size_x,
                                                       const int device_id,
                                                       const bool prepend_index_buffer) const {
  const auto thread_count = block_size_x * grid_size_x;
  // ...
  size_t total_buff_size{0};
  // ...
  copy_group_by_buffers_from_gpu(
      // ...
      gpu_group_by_buffers.second,
      // ...
      prepend_index_buffer);
}
void QueryMemoryInitializer::applyStreamingTopNOffsetGpu(Data_Namespace::DataMgr* data_mgr,
                                                         const QueryMemoryDescriptor& query_mem_desc,
                                                         const GpuGroupByBuffers& gpu_group_by_buffers,
                                                         const RelAlgExecutionUnit& ra_exe_unit,
                                                         const unsigned total_thread_count,
                                                         const int device_id) {
  // ...
  const auto rows_copy = pick_top_n_rows_from_dev_heaps(
      // ...
      reinterpret_cast<int64_t*>(gpu_group_by_buffers.second),
      /* ... */);
  // ...
}