#include "DataMgr/Allocators/DeviceAllocator.h"
#include "GpuInitGroups.h"
#include "GpuMemUtils.h"
#include "Logger/Logger.h"
#include "StreamingTopN.h"
#include "../CudaMgr/CudaMgr.h"
#include "GroupByAndAggregate.h"

Include dependency graph for GpuMemUtils.cpp:

Namespaces
	anonymous_namespace{GpuMemUtils.cpp}

Functions
void	copy_to_nvidia_gpu (Data_Namespace::DataMgr *data_mgr, CUdeviceptr dst, const void *src, const size_t num_bytes, const int device_id)

size_t	anonymous_namespace{GpuMemUtils.cpp}::coalesced_size (const QueryMemoryDescriptor &query_mem_desc, const size_t group_by_one_buffer_size, const unsigned grid_size_x)

GpuGroupByBuffers	create_dev_group_by_buffers (DeviceAllocator *device_allocator, const std::vector< int64_t * > &group_by_buffers, const QueryMemoryDescriptor &query_mem_desc, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const ExecutorDispatchMode dispatch_mode, const int64_t num_input_rows, const bool prepend_index_buffer, const bool always_init_group_by_on_host, const bool use_bump_allocator, const bool has_varlen_output, Allocator *insitu_allocator)

void	copy_group_by_buffers_from_gpu (DeviceAllocator &device_allocator, const std::vector< int64_t * > &group_by_buffers, const size_t groups_buffer_size, const int8_t *group_by_dev_buffers_mem, const QueryMemoryDescriptor &query_mem_desc, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer, const bool has_varlen_output)

size_t	get_num_allocated_rows_from_gpu (DeviceAllocator &device_allocator, int8_t *projection_size_gpu, const int device_id)

void	copy_projection_buffer_from_gpu_columnar (Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_group_by_buffers, const QueryMemoryDescriptor &query_mem_desc, int8_t *projection_buffer, const size_t projection_count, const int device_id)

Variables
size_t	g_max_memory_allocation_size

size_t	g_min_memory_allocation_size

double	g_bump_allocator_step_reduction

Function Documentation

void copy_group_by_buffers_from_gpu	(	DeviceAllocator &	device_allocator,
		const std::vector< int64_t * > &	group_by_buffers,
		const size_t	groups_buffer_size,
		const int8_t *	group_by_dev_buffers_mem,
		const QueryMemoryDescriptor &	query_mem_desc,
		const unsigned	block_size_x,
		const unsigned	grid_size_x,
		const int	device_id,
		const bool	prepend_index_buffer,
		const bool	has_varlen_output
	)

Definition at line 228 of file GpuMemUtils.cpp.

References QueryMemoryDescriptor::blocksShareMemory(), CHECK_EQ, CHECK_LT, anonymous_namespace{GpuMemUtils.cpp}::coalesced_size(), DeviceAllocator::copyFromDevice(), and QueryMemoryDescriptor::getEntryCount().

Referenced by QueryMemoryInitializer::copyGroupByBuffersFromGpu(), and ResultSet::radixSortOnGpu().

                                                                   {
   if (group_by_buffers.empty()) {
     return;
   }
   const size_t first_group_buffer_idx = has_varlen_output ? 1 : 0;
 
   const unsigned block_buffer_count{query_mem_desc.blocksShareMemory() ? 1 : grid_size_x};
   if (block_buffer_count == 1 && !prepend_index_buffer) {
     CHECK_EQ(coalesced_size(query_mem_desc, groups_buffer_size, block_buffer_count),
              groups_buffer_size);
     device_allocator.copyFromDevice(group_by_buffers[first_group_buffer_idx],
                                     group_by_dev_buffers_mem,
                                     groups_buffer_size);
     return;
   }
   const size_t index_buffer_sz{
       prepend_index_buffer ? query_mem_desc.getEntryCount() * sizeof(int64_t) : 0};
   std::vector<int8_t> buff_from_gpu(
       coalesced_size(query_mem_desc, groups_buffer_size, block_buffer_count) +
       index_buffer_sz);
   device_allocator.copyFromDevice(&buff_from_gpu[0],
                                   group_by_dev_buffers_mem - index_buffer_sz,
                                   buff_from_gpu.size());
   auto buff_from_gpu_ptr = &buff_from_gpu[0];
   for (size_t i = 0; i < block_buffer_count; ++i) {
     const size_t buffer_idx = (i * block_size_x) + first_group_buffer_idx;
     CHECK_LT(buffer_idx, group_by_buffers.size());
     memcpy(group_by_buffers[buffer_idx],
            buff_from_gpu_ptr,
            groups_buffer_size + index_buffer_sz);
     buff_from_gpu_ptr += groups_buffer_size;
   }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void copy_projection_buffer_from_gpu_columnar	(	Data_Namespace::DataMgr *	data_mgr,
		const GpuGroupByBuffers &	gpu_group_by_buffers,
		const QueryMemoryDescriptor &	query_mem_desc,
		int8_t *	projection_buffer,
		const size_t	projection_count,
		const int	device_id
	)

For projection queries we only copy back as many elements as necessary, not the whole output buffer. The goal is to be able to build a compact ResultSet, particularly useful for columnar outputs.

NOTE: Saman: we should revisit this function when we have a bump allocator

Definition at line 293 of file GpuMemUtils.cpp.

References align_to_int64(), CHECK, GpuGroupByBuffers::data, QueryMemoryDescriptor::didOutputColumnar(), QueryMemoryDescriptor::getColOffInBytes(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getQueryDescriptionType(), getQueryEngineCudaStreamForDevice(), QueryMemoryDescriptor::getSlotCount(), and heavyai::Projection.

Referenced by QueryMemoryInitializer::compactProjectionBuffersGpu().

                          {
 #ifdef HAVE_CUDA
   CHECK(query_mem_desc.didOutputColumnar());
   CHECK(query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection);
   constexpr size_t row_index_width = sizeof(int64_t);
 
   auto allocator = std::make_unique<CudaAllocator>(
       data_mgr, device_id, getQueryEngineCudaStreamForDevice(device_id));
   // copy all the row indices back to the host
   allocator->copyFromDevice(
       projection_buffer, gpu_group_by_buffers.data, projection_count * row_index_width);
   size_t buffer_offset_cpu{projection_count * row_index_width};
   // other columns are actual non-lazy columns for the projection:
   for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
     if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
       const auto column_proj_size =
           projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
       allocator->copyFromDevice(
           projection_buffer + buffer_offset_cpu,
           gpu_group_by_buffers.data + query_mem_desc.getColOffInBytes(i),
           column_proj_size);
       buffer_offset_cpu += align_to_int64(column_proj_size);
     }
   }
 #else
   CHECK(false);
 #endif  // HAVE_CUDA
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void copy_to_nvidia_gpu	(	Data_Namespace::DataMgr *	data_mgr,
		CUdeviceptr	dst,
		const void *	src,
		const size_t	num_bytes,
		const int	device_id
	)

Definition at line 35 of file GpuMemUtils.cpp.

References CHECK, checkCudaErrors(), Data_Namespace::DataMgr::getCudaMgr(), and getQueryEngineCudaStreamForDevice().

Referenced by TreeModelPredictionMgr::createKernelBuffers(), StringDictionaryTranslationMgr::createKernelBuffers(), and anonymous_namespace{ResultSetSortImpl.cu}::get_device_copy_ptr().

                                              {
 #ifdef HAVE_CUDA
   auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
   if (!data_mgr) {  // only for unit tests
     checkCudaErrors(cuMemcpyHtoDAsync(dst, src, num_bytes, qe_cuda_stream));
     checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
     return;
   }
   const auto cuda_mgr = data_mgr->getCudaMgr();
   CHECK(cuda_mgr);
   cuda_mgr->copyHostToDevice(reinterpret_cast<int8_t*>(dst),
                              static_cast<const int8_t*>(src),
                              num_bytes,
                              device_id,
                              qe_cuda_stream);
 #else
   CHECK(false);
 #endif  // HAVE_CUDA
 }

Here is the call graph for this function:

Here is the caller graph for this function:

GpuGroupByBuffers create_dev_group_by_buffers	(	DeviceAllocator *	device_allocator,
		const std::vector< int64_t * > &	group_by_buffers,
		const QueryMemoryDescriptor &	query_mem_desc,
		const unsigned	block_size_x,
		const unsigned	grid_size_x,
		const int	device_id,
		const ExecutorDispatchMode	dispatch_mode,
		const int64_t	num_input_rows,
		const bool	prepend_index_buffer,
		const bool	always_init_group_by_on_host,
		const bool	use_bump_allocator,
		const bool	has_varlen_output,
		Allocator *	insitu_allocator
	)

Definition at line 70 of file GpuMemUtils.cpp.

References align_to_int64(), Allocator::alloc(), QueryMemoryDescriptor::blocksShareMemory(), CHECK, CHECK_GT, CHECK_LE, anonymous_namespace{GpuMemUtils.cpp}::coalesced_size(), DeviceAllocator::copyToDevice(), g_bump_allocator_step_reduction, g_max_memory_allocation_size, g_min_memory_allocation_size, QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), GPU, logger::INFO, KernelPerFragment, QueryMemoryDescriptor::lazyInitGroups(), LOG, QueryMemoryDescriptor::threadsShareMemory(), to_string(), QueryMemoryDescriptor::varlenOutputBufferElemSize(), and logger::WARNING.

Referenced by QueryMemoryInitializer::createAndInitializeGroupByBufferGpu(), and ResultSet::radixSortOnGpu().

                                  {
   // Overview: allocates device memory for the group-by output buffers, optionally
   // uploads the host-side buffer contents, and builds a device-resident table of
   // per-thread buffer pointers. The returned aggregate is, in order: the device
   // pointer table, the start of the group-by buffer memory, the entry count used
   // for sizing, and the varlen output buffer (nullptr when has_varlen_output is
   // false).
   //
   // Nothing to set up when there are no host buffers and no in-situ allocator.
   if (group_by_buffers.empty() && !insitu_allocator) {
     return {0, 0, 0, 0};
   }
   CHECK(device_allocator);
 
   size_t groups_buffer_size{0};               // bytes of a single group-by buffer
   int8_t* group_by_dev_buffers_mem{nullptr};  // device start of the buffer region
   size_t mem_size{0};                         // bytes of all coalesced buffers
   size_t entry_count{0};                      // entries each buffer is sized for
 
   if (use_bump_allocator) {
     // The bump allocator path supports neither a prepended index buffer nor an
     // in-situ allocator.
     CHECK(!prepend_index_buffer);
     CHECK(!insitu_allocator);
 
     if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
       // Allocate an output buffer equal to the size of the number of rows in the
       // fragment. The kernel per fragment path is only used for projections with lazy
       // fetched outputs. Therefore, the resulting output buffer should be relatively
       // narrow compared to the width of an input row, offsetting the larger allocation.
 
       CHECK_GT(num_input_rows, int64_t(0));
       entry_count = num_input_rows;
       groups_buffer_size =
           query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
       mem_size = coalesced_size(query_mem_desc,
                                 groups_buffer_size,
                                 query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
       // TODO(adb): render allocator support
       group_by_dev_buffers_mem = device_allocator->alloc(mem_size);
     } else {
       // Attempt to allocate increasingly small buffers until we have less than 256B of
       // memory remaining on the device. This may have the side effect of evicting
       // memory allocated for previous queries. However, at current maximum slab sizes
       // (2GB) we expect these effects to be minimal.
       //
       // As written, the loop starts at g_max_memory_allocation_size, scales the budget
       // by g_bump_allocator_step_reduction after each OutOfMemory, and rethrows once
       // the budget drops below g_min_memory_allocation_size.
       // NOTE(review): termination relies on g_bump_allocator_step_reduction being
       // below 1.0; its value is defined in Execute.cpp and is not visible here.
       size_t max_memory_size{g_max_memory_allocation_size};
       while (true) {
         // NOTE(review): presumably getRowSize() is never 0 on this path; confirm.
         entry_count = max_memory_size / query_mem_desc.getRowSize();
         groups_buffer_size =
             query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
 
         try {
           mem_size = coalesced_size(query_mem_desc,
                                     groups_buffer_size,
                                     query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
           CHECK_LE(entry_count, std::numeric_limits<uint32_t>::max());
 
           // TODO(adb): render allocator support
           group_by_dev_buffers_mem = device_allocator->alloc(mem_size);
         } catch (const OutOfMemory& e) {
           LOG(WARNING) << e.what();
           // Shrink the budget (double product converted back to size_t) and either
           // give up by rethrowing the OutOfMemory or retry with the smaller size.
           max_memory_size = max_memory_size * g_bump_allocator_step_reduction;
           if (max_memory_size < g_min_memory_allocation_size) {
             throw;
           }
 
           LOG(WARNING) << "Ran out of memory for projection query output. Retrying with "
                        << std::to_string(max_memory_size) << " bytes";
 
           continue;
         }
         // Allocation succeeded; leave the retry loop.
         break;
       }
     }
     LOG(INFO) << "Projection query allocation succeeded with " << groups_buffer_size
               << " bytes allocated (max entry count " << entry_count << ")";
   } else {
     // Fixed-size path: buffers are sized from the descriptor's own entry count.
     entry_count = query_mem_desc.getEntryCount();
     CHECK_GT(entry_count, size_t(0));
     groups_buffer_size =
         query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
     mem_size = coalesced_size(query_mem_desc,
                               groups_buffer_size,
                               query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
     // Optional index region placed in front of the group-by buffers, sized as one
     // int32_t per entry and rounded up via align_to_int64.
     // NOTE(review): copy_group_by_buffers_from_gpu sizes the same region as
     // getEntryCount() * sizeof(int64_t); confirm that the two are meant to differ.
     const size_t prepended_buff_size{
         prepend_index_buffer ? align_to_int64(entry_count * sizeof(int32_t)) : 0};
 
     // The in-situ allocator, when supplied, takes precedence over device_allocator.
     int8_t* group_by_dev_buffers_allocation{nullptr};
     if (insitu_allocator) {
       group_by_dev_buffers_allocation =
           insitu_allocator->alloc(mem_size + prepended_buff_size);
     } else {
       group_by_dev_buffers_allocation =
           device_allocator->alloc(mem_size + prepended_buff_size);
     }
     CHECK(group_by_dev_buffers_allocation);
 
     // The group-by buffers proper begin right after the prepended index region.
     group_by_dev_buffers_mem = group_by_dev_buffers_allocation + prepended_buff_size;
   }
   CHECK_GT(groups_buffer_size, size_t(0));
   CHECK(group_by_dev_buffers_mem);
 
   // Every thread of a block points at the same buffer, so host buffers and pointer
   // table entries are visited in strides of block_size_x.
   CHECK(query_mem_desc.threadsShareMemory());
   const size_t step{block_size_x};
 
   // Upload host-side buffer contents unless an in-situ allocator is in use; the
   // upload happens when forced by always_init_group_by_on_host or when the
   // descriptor does not use lazy group initialization on GPU.
   if (!insitu_allocator && (always_init_group_by_on_host ||
                             !query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU))) {
     std::vector<int8_t> buff_to_gpu(mem_size);
     auto buff_to_gpu_ptr = buff_to_gpu.data();
 
     // Pack every step-th host buffer (skipping index 0 when a varlen output buffer
     // is present) back to back into the staging vector.
     // NOTE(review): assumes the number of buffers visited times groups_buffer_size
     // does not exceed mem_size; confirm against callers.
     const size_t start = has_varlen_output ? 1 : 0;
     for (size_t i = start; i < group_by_buffers.size(); i += step) {
       memcpy(buff_to_gpu_ptr, group_by_buffers[i], groups_buffer_size);
       buff_to_gpu_ptr += groups_buffer_size;
     }
     device_allocator->copyToDevice(reinterpret_cast<int8_t*>(group_by_dev_buffers_mem),
                                    buff_to_gpu.data(),
                                    buff_to_gpu.size());
   }
 
   auto group_by_dev_buffer = group_by_dev_buffers_mem;
 
   // One pointer per thread (block_size_x * grid_size_x), plus a leading entry for
   // the varlen output buffer when requested.
   const size_t num_ptrs =
       (block_size_x * grid_size_x) + (has_varlen_output ? size_t(1) : size_t(0));
 
   std::vector<int8_t*> group_by_dev_buffers(num_ptrs);
 
   // Fill the table one block at a time: all `step` threads of a block receive the
   // same pointer, and the pointer only advances to the next buffer when blocks do
   // not share memory.
   const size_t start_index = has_varlen_output ? 1 : 0;
   for (size_t i = start_index; i < num_ptrs; i += step) {
     for (size_t j = 0; j < step; ++j) {
       group_by_dev_buffers[i + j] = group_by_dev_buffer;
     }
     if (!query_mem_desc.blocksShareMemory()) {
       group_by_dev_buffer += groups_buffer_size;
     }
   }
 
   // The varlen output buffer gets its own allocation and occupies table entry 0.
   // Its size uses query_mem_desc.getEntryCount(), not the local entry_count.
   int8_t* varlen_output_buffer{nullptr};
   if (has_varlen_output) {
     const auto varlen_buffer_elem_size_opt = query_mem_desc.varlenOutputBufferElemSize();
     CHECK(varlen_buffer_elem_size_opt);  // TODO(adb): relax
 
     group_by_dev_buffers[0] = device_allocator->alloc(
         query_mem_desc.getEntryCount() * varlen_buffer_elem_size_opt.value());
     varlen_output_buffer = group_by_dev_buffers[0];
   }
 
   // Publish the pointer table to the device.
   // NOTE(review): the table holds int8_t* values but is sized with
   // sizeof(CUdeviceptr); this assumes both types have the same size.
   auto group_by_dev_ptr = device_allocator->alloc(num_ptrs * sizeof(CUdeviceptr));
   device_allocator->copyToDevice(group_by_dev_ptr,
                                  reinterpret_cast<int8_t*>(group_by_dev_buffers.data()),
                                  num_ptrs * sizeof(CUdeviceptr));
 
   return {group_by_dev_ptr, group_by_dev_buffers_mem, entry_count, varlen_output_buffer};
 }

Here is the call graph for this function:

Here is the caller graph for this function:

size_t get_num_allocated_rows_from_gpu	(	DeviceAllocator &	device_allocator,
		int8_t *	projection_size_gpu,
		const int	device_id
	)

Returns back total number of allocated rows per device (i.e., number of matched elements in projections).

TODO(Saman): revisit this for bump allocators

Definition at line 277 of file GpuMemUtils.cpp.

References CHECK, and DeviceAllocator::copyFromDevice().

Referenced by QueryExecutionContext::launchGpuCode().

                                                             {
   int32_t num_rows{0};
   device_allocator.copyFromDevice(&num_rows, projection_size_gpu, sizeof(num_rows));
   CHECK(num_rows >= 0);
   return static_cast<size_t>(num_rows);
 }

Here is the call graph for this function:

Here is the caller graph for this function:

Variable Documentation

double g_bump_allocator_step_reduction

Definition at line 133 of file Execute.cpp.

size_t g_max_memory_allocation_size

Definition at line 128 of file Execute.cpp.

size_t g_min_memory_allocation_size

Definition at line 129 of file Execute.cpp.

Namespaces

Functions

Variables

Function Documentation

Variable Documentation