#include "../CudaMgr/CudaMgr.h"

extern size_t g_max_memory_allocation_size;
extern size_t g_min_memory_allocation_size;
extern double g_bump_allocator_step_reduction;
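// Host-to-device copy routed through the DataMgr's CudaMgr; when no DataMgr is
// supplied (e.g. in unit tests) it falls back to a raw cuMemcpyHtoD.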
void copy_to_gpu(Data_Namespace::DataMgr* data_mgr,
                 CUdeviceptr dst,
                 const void* src,
                 const size_t num_bytes,
                 const int device_id) {
#ifdef HAVE_CUDA
  if (!data_mgr) {  // only for unit tests
    cuMemcpyHtoD(dst, src, num_bytes);
    return;
  }
#endif  // HAVE_CUDA
  const auto cuda_mgr = data_mgr->getCudaMgr();
  CHECK(cuda_mgr);
  cuda_mgr->copyHostToDevice(reinterpret_cast<int8_t*>(dst),
                             static_cast<const int8_t*>(src),
                             num_bytes,
                             device_id);
}
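
// Total number of bytes occupied by the per-block output buffers when they are
// laid out contiguously in device memory (one buffer per block on the grid).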
size_t coalesced_size(const QueryMemoryDescriptor& query_mem_desc,
                      const size_t group_by_one_buffer_size,
                      const unsigned grid_size_x) {
  CHECK(query_mem_desc.threadsShareMemory());
  return grid_size_x * group_by_one_buffer_size;
}
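
// Allocates the group-by output buffers on the device and builds the table of
// per-thread buffer pointers the generated kernel expects. Returns the pointer to
// that table, the base address of the output buffers, and the entry count used.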
GpuGroupByBuffers create_dev_group_by_buffers(
    DeviceAllocator* cuda_allocator,
    const std::vector<int64_t*>& group_by_buffers,
    const QueryMemoryDescriptor& query_mem_desc,
    const unsigned block_size_x,
    const unsigned grid_size_x,
    const int device_id,
    const ExecutorDispatchMode dispatch_mode,
    const int64_t num_input_rows,
    const bool prepend_index_buffer,
    const bool always_init_group_by_on_host,
    const bool use_bump_allocator,
    Allocator* insitu_allocator) {
  if (group_by_buffers.empty() && !insitu_allocator) {
    return {0, 0, 0};
  }
  CHECK(cuda_allocator);

  size_t groups_buffer_size{0};
  CUdeviceptr group_by_dev_buffers_mem{0};
  size_t mem_size{0};
  size_t entry_count{0};
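
  // With the bump allocator the output entry count is not known up front: either
  // size the buffer by the input row count (kernel-per-fragment dispatch) or keep
  // shrinking the allocation until it fits in available device memory.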
  if (use_bump_allocator) {
    CHECK(!prepend_index_buffer);
    CHECK(!insitu_allocator);

    if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
      CHECK_GT(num_input_rows, int64_t(0));
      entry_count = num_input_rows;
      groups_buffer_size =
          query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
      mem_size = coalesced_size(query_mem_desc, groups_buffer_size, grid_size_x);
      group_by_dev_buffers_mem =
          reinterpret_cast<CUdeviceptr>(cuda_allocator->alloc(mem_size));
    } else {
      size_t max_memory_size{g_max_memory_allocation_size};
      while (true) {
        entry_count = max_memory_size / query_mem_desc.getRowSize();
        groups_buffer_size =
            query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
        try {
          mem_size = coalesced_size(query_mem_desc, groups_buffer_size, grid_size_x);
          CHECK_LE(entry_count, std::numeric_limits<uint32_t>::max());
          group_by_dev_buffers_mem =
              reinterpret_cast<CUdeviceptr>(cuda_allocator->alloc(mem_size));
        } catch (const OutOfMemory& e) {
          LOG(WARNING) << e.what();
          max_memory_size = max_memory_size * g_bump_allocator_step_reduction;
          if (max_memory_size < g_min_memory_allocation_size) {
            throw;
          }
          LOG(WARNING) << "Ran out of memory for projection query output. Retrying with "
                       << max_memory_size << " bytes";
          continue;
        }
        break;
      }
    }
    LOG(INFO) << "Projection query allocation succeeded with " << groups_buffer_size
              << " bytes allocated (max entry count " << entry_count << ")";
  } else {
    entry_count = query_mem_desc.getEntryCount();
    CHECK_GT(entry_count, size_t(0));
    groups_buffer_size =
        query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
    mem_size = coalesced_size(query_mem_desc, groups_buffer_size, grid_size_x);
    const size_t prepended_buff_size{
        prepend_index_buffer ? align_to_int64(entry_count * sizeof(int32_t)) : 0};

    int8_t* group_by_dev_buffers_allocation{nullptr};
    if (insitu_allocator) {
      group_by_dev_buffers_allocation =
          insitu_allocator->alloc(mem_size + prepended_buff_size);
    } else {
      group_by_dev_buffers_allocation =
          cuda_allocator->alloc(mem_size + prepended_buff_size);
    }
    CHECK(group_by_dev_buffers_allocation);

    group_by_dev_buffers_mem =
        reinterpret_cast<CUdeviceptr>(group_by_dev_buffers_allocation) +
        prepended_buff_size;
  }
  CHECK_GT(groups_buffer_size, size_t(0));
  CHECK(group_by_dev_buffers_mem);

  const size_t step{block_size_x};
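
  // Unless the kernel initializes the buffers itself (lazy init), stage the host
  // group-by buffers into one contiguous vector and copy it to the device in a
  // single transfer; one buffer per block, so stride group_by_buffers by block_size_x.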
  if (!insitu_allocator && (always_init_group_by_on_host ||
                            !query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU))) {
    std::vector<int8_t> buff_to_gpu(mem_size);
    auto buff_to_gpu_ptr = buff_to_gpu.data();

    for (size_t i = 0; i < group_by_buffers.size(); i += step) {
      memcpy(buff_to_gpu_ptr, group_by_buffers[i], groups_buffer_size);
      buff_to_gpu_ptr += groups_buffer_size;
    }
    cuda_allocator->copyToDevice(reinterpret_cast<int8_t*>(group_by_dev_buffers_mem),
                                 buff_to_gpu.data(),
                                 buff_to_gpu.size());
  }

  auto group_by_dev_buffer = group_by_dev_buffers_mem;

  const size_t num_ptrs{block_size_x * grid_size_x};

  std::vector<CUdeviceptr> group_by_dev_buffers(num_ptrs);
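
  // Build the per-thread pointer table: every thread in a block points at that
  // block's output buffer. When blocks share memory all entries alias the same
  // buffer; otherwise each block's buffer is offset by groups_buffer_size.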
  for (size_t i = 0; i < num_ptrs; i += step) {
    for (size_t j = 0; j < step; ++j) {
      group_by_dev_buffers[i + j] = group_by_dev_buffer;
    }
    if (!query_mem_desc.blocksShareMemory()) {
      group_by_dev_buffer += groups_buffer_size;
    }
  }

  auto group_by_dev_ptr = cuda_allocator->alloc(num_ptrs * sizeof(CUdeviceptr));
  cuda_allocator->copyToDevice(group_by_dev_ptr,
                               reinterpret_cast<int8_t*>(group_by_dev_buffers.data()),
                               num_ptrs * sizeof(CUdeviceptr));

  return {reinterpret_cast<CUdeviceptr>(group_by_dev_ptr),
          group_by_dev_buffers_mem,
          entry_count};
}
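
// Mirror of copy_to_gpu: copies num_bytes from a device address back into host
// memory through the CudaMgr owned by the DataMgr.
//
// Hypothetical usage sketch (the allocator and variable names are assumptions, not
// taken from this file): round-tripping a small host buffer through device memory.
//
//   std::vector<int64_t> vals(16, -1);
//   const size_t nbytes = vals.size() * sizeof(int64_t);
//   auto dev_ptr = reinterpret_cast<CUdeviceptr>(cuda_allocator->alloc(nbytes));
//   copy_to_gpu(data_mgr, dev_ptr, vals.data(), nbytes, device_id);
//   // ... run a kernel that fills the buffer ...
//   copy_from_gpu(data_mgr, vals.data(), dev_ptr, nbytes, device_id);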
void copy_from_gpu(Data_Namespace::DataMgr* data_mgr,
                   void* dst,
                   const CUdeviceptr src,
                   const size_t num_bytes,
                   const int device_id) {
  const auto cuda_mgr = data_mgr->getCudaMgr();
  CHECK(cuda_mgr);
  cuda_mgr->copyDeviceToHost(static_cast<int8_t*>(dst),
                             reinterpret_cast<const int8_t*>(src),
                             num_bytes,
                             device_id);
}
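
// Copies the device group-by buffers back into the per-block host buffers. When the
// blocks share a single device buffer and no index buffer was prepended this is one
// direct copy; otherwise the coalesced device region (plus the optional prepended
// index buffer) is staged on the host and scattered to the per-block buffers.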
void copy_group_by_buffers_from_gpu(Data_Namespace::DataMgr* data_mgr,
                                    const std::vector<int64_t*>& group_by_buffers,
                                    const size_t groups_buffer_size,
                                    const CUdeviceptr group_by_dev_buffers_mem,
                                    const QueryMemoryDescriptor& query_mem_desc,
                                    const unsigned block_size_x,
                                    const unsigned grid_size_x,
                                    const int device_id,
                                    const bool prepend_index_buffer) {
  if (group_by_buffers.empty()) {
    return;
  }
  const unsigned block_buffer_count{query_mem_desc.blocksShareMemory() ? 1 : grid_size_x};
  if (block_buffer_count == 1 && !prepend_index_buffer) {
    copy_from_gpu(data_mgr,
                  group_by_buffers[0],
                  group_by_dev_buffers_mem,
                  groups_buffer_size,
                  device_id);
    return;
  }
  const size_t index_buffer_sz{
      prepend_index_buffer ? query_mem_desc.getEntryCount() * sizeof(int64_t) : 0};
  std::vector<int8_t> buff_from_gpu(
      coalesced_size(query_mem_desc, groups_buffer_size, block_buffer_count) +
      index_buffer_sz);
  copy_from_gpu(data_mgr,
                &buff_from_gpu[0],
                group_by_dev_buffers_mem - index_buffer_sz,
                buff_from_gpu.size(),
                device_id);
  auto buff_from_gpu_ptr = &buff_from_gpu[0];
  for (size_t i = 0; i < block_buffer_count; ++i) {
    CHECK_LT(i * block_size_x, group_by_buffers.size());
    memcpy(group_by_buffers[i * block_size_x],
           buff_from_gpu_ptr,
           groups_buffer_size + index_buffer_sz);
    buff_from_gpu_ptr += groups_buffer_size;
  }
}
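
// Reads back the number of rows the device wrote for a projection (the allocated
// row count) and returns it as a size_t.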
size_t get_num_allocated_rows_from_gpu(Data_Namespace::DataMgr* data_mgr,
                                       CUdeviceptr projection_size_gpu,
                                       const int device_id) {
  int32_t num_rows{0};
  copy_from_gpu(data_mgr, &num_rows, projection_size_gpu, sizeof(num_rows), device_id);
  CHECK(num_rows >= 0);
  return static_cast<size_t>(num_rows);
}
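
// For columnar projection outputs only the first projection_count entries of each
// column are copied back, producing a compact host-side buffer: first the row
// indices, then each non-empty column slot at its int64-aligned offset.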
void copy_projection_buffer_from_gpu_columnar(Data_Namespace::DataMgr* data_mgr,
                                              const GpuGroupByBuffers& gpu_group_by_buffers,
                                              const QueryMemoryDescriptor& query_mem_desc,
                                              int8_t* projection_buffer,
                                              const size_t projection_count,
                                              const int device_id) {
  CHECK(query_mem_desc.didOutputColumnar());
  CHECK(query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection);
  constexpr size_t row_index_width = sizeof(int64_t);
  // copy the row indices back to the host
  copy_from_gpu(data_mgr,
                reinterpret_cast<int64_t*>(projection_buffer),
                gpu_group_by_buffers.second,
                projection_count * row_index_width,
                device_id);
  size_t buffer_offset_cpu{projection_count * row_index_width};
  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
    if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
      const auto column_proj_size =
          projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
      copy_from_gpu(data_mgr,
                    projection_buffer + buffer_offset_cpu,
                    gpu_group_by_buffers.second + query_mem_desc.getColOffInBytes(i),
                    column_proj_size,
                    device_id);
      buffer_offset_cpu += align_to_int64(column_proj_size);
    }
  }
}