OmniSciDB
1dac507f6e
|
#include "GpuMemUtils.h"
#include "Allocators/CudaAllocator.h"
#include "Allocators/ThrustAllocator.h"
#include "GpuInitGroups.h"
#include "Shared/Logger.h"
#include "StreamingTopN.h"
#include "../CudaMgr/CudaMgr.h"
#include "GroupByAndAggregate.h"
Go to the source code of this file.
Namespaces | |
anonymous_namespace{GpuMemUtils.cpp} | |
Functions | |
void | copy_to_gpu (Data_Namespace::DataMgr *data_mgr, CUdeviceptr dst, const void *src, const size_t num_bytes, const int device_id) |
size_t | anonymous_namespace{GpuMemUtils.cpp}::coalesced_size (const QueryMemoryDescriptor &query_mem_desc, const size_t group_by_one_buffer_size, const unsigned grid_size_x) |
GpuGroupByBuffers | create_dev_group_by_buffers (DeviceAllocator *cuda_allocator, const std::vector< int64_t * > &group_by_buffers, const QueryMemoryDescriptor &query_mem_desc, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const ExecutorDispatchMode dispatch_mode, const int64_t num_input_rows, const bool prepend_index_buffer, const bool always_init_group_by_on_host, const bool use_bump_allocator, Allocator *insitu_allocator) |
void | copy_from_gpu (Data_Namespace::DataMgr *data_mgr, void *dst, const CUdeviceptr src, const size_t num_bytes, const int device_id) |
void | copy_group_by_buffers_from_gpu (Data_Namespace::DataMgr *data_mgr, const std::vector< int64_t * > &group_by_buffers, const size_t groups_buffer_size, const CUdeviceptr group_by_dev_buffers_mem, const QueryMemoryDescriptor &query_mem_desc, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer) |
size_t | get_num_allocated_rows_from_gpu (Data_Namespace::DataMgr *data_mgr, CUdeviceptr projection_size_gpu, const int device_id) |
void | copy_projection_buffer_from_gpu_columnar (Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_group_by_buffers, const QueryMemoryDescriptor &query_mem_desc, int8_t *projection_buffer, const size_t projection_count, const int device_id) |
Variables | |
size_t | g_max_memory_allocation_size |
size_t | g_min_memory_allocation_size |
double | g_bump_allocator_step_reduction |
void copy_from_gpu | ( | Data_Namespace::DataMgr * | data_mgr, |
void * | dst, | ||
const CUdeviceptr | src, | ||
const size_t | num_bytes, | ||
const int | device_id | ||
) |
Definition at line 211 of file GpuMemUtils.cpp.
References CHECK() and Data_Namespace::DataMgr::getCudaMgr().
Referenced by OverlapsJoinHashTable::approximateTupleCount(), BaselineJoinHashTable::approximateTupleCount(), OverlapsJoinHashTable::computeBucketSizes(), copy_group_by_buffers_from_gpu(), copy_projection_buffer_from_gpu_columnar(), BaselineJoinHashTable::decodeJoinHashBuffer(), JoinHashTable::decodeJoinHashBuffer(), anonymous_namespace{ResultSetIteration.cpp}::fetch_data_from_gpu(), get_num_allocated_rows_from_gpu(), ResultSet::getVarlenOrderEntry(), JoinHashTable::initHashTableForDevice(), OverlapsJoinHashTable::initHashTableOnGpu(), BaselineJoinHashTable::initHashTableOnGpu(), QueryExecutionContext::launchGpuCode(), ResultSet::makeVarlenTargetValue(), ResultSet::syncEstimatorBuffer(), BaselineJoinHashTable::toString(), and JoinHashTable::toString().
void copy_group_by_buffers_from_gpu | ( | Data_Namespace::DataMgr * | data_mgr, |
const std::vector< int64_t * > & | group_by_buffers, | ||
const size_t | groups_buffer_size, | ||
const CUdeviceptr | group_by_dev_buffers_mem, | ||
const QueryMemoryDescriptor & | query_mem_desc, | ||
const unsigned | block_size_x, | ||
const unsigned | grid_size_x, | ||
const int | device_id, | ||
const bool | prepend_index_buffer | ||
) |
Definition at line 224 of file GpuMemUtils.cpp.
References QueryMemoryDescriptor::blocksShareMemory(), CHECK_EQ, CHECK_LT, anonymous_namespace{GpuMemUtils.cpp}::coalesced_size(), copy_from_gpu(), and QueryMemoryDescriptor::getEntryCount().
Referenced by QueryMemoryInitializer::copyGroupByBuffersFromGpu() and ResultSet::radixSortOnGpu().
void copy_projection_buffer_from_gpu_columnar | ( | Data_Namespace::DataMgr * | data_mgr, |
const GpuGroupByBuffers & | gpu_group_by_buffers, | ||
const QueryMemoryDescriptor & | query_mem_desc, | ||
int8_t * | projection_buffer, | ||
const size_t | projection_count, | ||
const int | device_id | ||
) |
For projection queries, we copy back only as many elements as necessary rather than the whole output buffer. The goal is to be able to build a compact ResultSet, which is particularly useful for columnar outputs.
NOTE(Saman): we should revisit this function when we have a bump allocator.
Definition at line 289 of file GpuMemUtils.cpp.
References align_to_int64(), CHECK(), copy_from_gpu(), QueryMemoryDescriptor::didOutputColumnar(), QueryMemoryDescriptor::getColOffInBytes(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getQueryDescriptionType(), QueryMemoryDescriptor::getSlotCount(), Projection, and GpuGroupByBuffers::second.
Referenced by QueryMemoryInitializer::compactProjectionBuffersGpu().
void copy_to_gpu | ( | Data_Namespace::DataMgr * | data_mgr, |
CUdeviceptr | dst, | ||
const void * | src, | ||
const size_t | num_bytes, | ||
const int | device_id | ||
) |
Definition at line 31 of file GpuMemUtils.cpp.
References CHECK() and Data_Namespace::DataMgr::getCudaMgr().
Referenced by OverlapsJoinHashTable::approximateTupleCount(), BaselineJoinHashTable::fetchColumn(), JoinHashTable::fetchFragments(), anonymous_namespace{ResultSetSortImpl.cu}::get_device_copy_ptr(), ColumnFetcher::getOneTableColumnFragment(), JoinHashTable::initHashTableForDevice(), BaselineJoinHashTable::initHashTableForDevice(), OverlapsJoinHashTable::initHashTableOnGpu(), BaselineJoinHashTable::initHashTableOnGpu(), JoinHashTable::initOneToManyHashTable(), InValuesBitmap::InValuesBitmap(), QueryExecutionContext::launchGpuCode(), transfer_object_to_gpu(), transfer_pod_vector_to_gpu(), and ColumnFetcher::transferColumnIfNeeded().
GpuGroupByBuffers create_dev_group_by_buffers | ( | DeviceAllocator * | cuda_allocator, |
const std::vector< int64_t * > & | group_by_buffers, | ||
const QueryMemoryDescriptor & | query_mem_desc, | ||
const unsigned | block_size_x, | ||
const unsigned | grid_size_x, | ||
const int | device_id, | ||
const ExecutorDispatchMode | dispatch_mode, | ||
const int64_t | num_input_rows, | ||
const bool | prepend_index_buffer, | ||
const bool | always_init_group_by_on_host, | ||
const bool | use_bump_allocator, | ||
Allocator * | insitu_allocator | ||
) |
Definition at line 61 of file GpuMemUtils.cpp.
References align_to_int64(), Allocator::alloc(), QueryMemoryDescriptor::blocksShareMemory(), CHECK(), CHECK_GT, CHECK_LE, anonymous_namespace{GpuMemUtils.cpp}::coalesced_size(), DeviceAllocator::copyToDevice(), g_bump_allocator_step_reduction, g_max_memory_allocation_size, g_min_memory_allocation_size, QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), GPU, logger::INFO, KernelPerFragment, QueryMemoryDescriptor::lazyInitGroups(), LOG, QueryMemoryDescriptor::threadsShareMemory(), to_string(), and logger::WARNING.
Referenced by ResultSet::radixSortOnGpu().
size_t get_num_allocated_rows_from_gpu | ( | Data_Namespace::DataMgr * | data_mgr, |
CUdeviceptr | projection_size_gpu, | ||
const int | device_id | ||
) |
Returns the total number of allocated rows per device (i.e., the number of matched elements in projections).
TODO(Saman): revisit this for bump allocators
Definition at line 273 of file GpuMemUtils.cpp.
References CHECK(), copy_from_gpu(), and num_rows.
Referenced by QueryExecutionContext::launchGpuCode().
double g_bump_allocator_step_reduction |
Definition at line 100 of file Execute.cpp.
Referenced by create_dev_group_by_buffers().
size_t g_max_memory_allocation_size |
Definition at line 95 of file Execute.cpp.
size_t g_min_memory_allocation_size |
Definition at line 96 of file Execute.cpp.