OmniSciDB  c07336695a
GpuMemUtils.h File Reference
#include "CompilationOptions.h"
#include <cstddef>
#include <cstdint>
#include <memory>
#include <utility>
#include <vector>
#include "../Shared/nocuda.h"


Classes

struct  GpuGroupByBuffers
 

Namespaces

 CudaMgr_Namespace
 
 Data_Namespace
 

Functions

void copy_to_gpu (Data_Namespace::DataMgr *data_mgr, CUdeviceptr dst, const void *src, const size_t num_bytes, const int device_id)
 
void copy_from_gpu (Data_Namespace::DataMgr *data_mgr, void *dst, const CUdeviceptr src, const size_t num_bytes, const int device_id)
 
GpuGroupByBuffers create_dev_group_by_buffers (DeviceAllocator *device_allocator, const std::vector< int64_t *> &group_by_buffers, const QueryMemoryDescriptor &, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const ExecutorDispatchMode dispatch_mode, const int64_t num_input_rows, const bool prepend_index_buffer, const bool always_init_group_by_on_host, const bool use_bump_allocator, Allocator *insitu_allocator)
 
void copy_group_by_buffers_from_gpu (Data_Namespace::DataMgr *data_mgr, const std::vector< int64_t *> &group_by_buffers, const size_t groups_buffer_size, const CUdeviceptr group_by_dev_buffers_mem, const QueryMemoryDescriptor &query_mem_desc, const unsigned block_size_x, const unsigned grid_size_x, const int device_id, const bool prepend_index_buffer)
 
size_t get_num_allocated_rows_from_gpu (Data_Namespace::DataMgr *data_mgr, CUdeviceptr projection_size_gpu, const int device_id)
 
void copy_projection_buffer_from_gpu_columnar (Data_Namespace::DataMgr *data_mgr, const GpuGroupByBuffers &gpu_query_buffers, const QueryMemoryDescriptor &query_mem_desc, int8_t *projection_buffer, const size_t projection_count, const int device_id)
 

Function Documentation

◆ copy_from_gpu()

void copy_from_gpu ( Data_Namespace::DataMgr *  data_mgr,
void *  dst,
const CUdeviceptr  src,
const size_t  num_bytes,
const int  device_id 
)

Definition at line 211 of file GpuMemUtils.cpp.

References CHECK, and Data_Namespace::DataMgr::getCudaMgr().

Referenced by OverlapsJoinHashTable::approximateTupleCount(), BaselineJoinHashTable::approximateTupleCount(), OverlapsJoinHashTable::computeBucketSizes(), copy_group_by_buffers_from_gpu(), copy_projection_buffer_from_gpu_columnar(), anonymous_namespace{ResultSetIteration.cpp}::fetch_data_from_gpu(), get_num_allocated_rows_from_gpu(), ResultSet::getVarlenOrderEntry(), JoinHashTable::initHashTableForDevice(), OverlapsJoinHashTable::initHashTableOnGpu(), BaselineJoinHashTable::initHashTableOnGpu(), QueryExecutionContext::launchGpuCode(), ResultSet::makeVarlenTargetValue(), and ResultSet::syncEstimatorBuffer().

{
  const auto cuda_mgr = data_mgr->getCudaMgr();
  CHECK(cuda_mgr);
  cuda_mgr->copyDeviceToHost(static_cast<int8_t*>(dst),
                             reinterpret_cast<const int8_t*>(src),
                             num_bytes,
                             device_id);
}
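A minimal usage sketch of the two copy helpers, assuming a valid Data_Namespace::DataMgr* and a device allocation obtained elsewhere; the wrapper function, buffer size, and include path are assumptions for illustration, not part of this header:

#include <cstddef>
#include <cstdint>
#include <vector>
#include "QueryEngine/GpuMemUtils.h"  // assumed include path

// Hypothetical helper: round-trip a host buffer through the documented copy
// functions. data_mgr and dev_ptr are supplied by the surrounding context
// (e.g. a DataMgr-backed GPU allocation).
void round_trip_sketch(Data_Namespace::DataMgr* data_mgr,
                       CUdeviceptr dev_ptr,
                       const int device_id) {
  std::vector<int64_t> host_buf(1024, 0);
  const size_t num_bytes = host_buf.size() * sizeof(int64_t);

  // Host -> device: copy_to_gpu forwards to CudaMgr::copyHostToDevice.
  copy_to_gpu(data_mgr, dev_ptr, host_buf.data(), num_bytes, device_id);

  // ... GPU kernels write their results into dev_ptr ...

  // Device -> host: copy_from_gpu forwards to CudaMgr::copyDeviceToHost.
  copy_from_gpu(data_mgr, host_buf.data(), dev_ptr, num_bytes, device_id);
}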

◆ copy_group_by_buffers_from_gpu()

void copy_group_by_buffers_from_gpu ( Data_Namespace::DataMgr *  data_mgr,
const std::vector< int64_t *> &  group_by_buffers,
const size_t  groups_buffer_size,
const CUdeviceptr  group_by_dev_buffers_mem,
const QueryMemoryDescriptor &  query_mem_desc,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const int  device_id,
const bool  prepend_index_buffer 
)

Definition at line 224 of file GpuMemUtils.cpp.

References QueryMemoryDescriptor::blocksShareMemory(), CHECK_EQ, CHECK_LT, anonymous_namespace{GpuMemUtils.cpp}::coalesced_size(), copy_from_gpu(), QueryMemoryDescriptor::getEntryCount(), and groups_buffer_size.

Referenced by QueryMemoryInitializer::copyGroupByBuffersFromGpu(), and ResultSet::radixSortOnGpu().

{
  if (group_by_buffers.empty()) {
    return;
  }
  const unsigned block_buffer_count{query_mem_desc.blocksShareMemory() ? 1 : grid_size_x};
  if (block_buffer_count == 1 && !prepend_index_buffer) {
    CHECK_EQ(block_size_x, group_by_buffers.size());
    CHECK_EQ(coalesced_size(query_mem_desc, groups_buffer_size, block_buffer_count),
             groups_buffer_size);
    copy_from_gpu(data_mgr,
                  group_by_buffers[0],
                  group_by_dev_buffers_mem,
                  groups_buffer_size,
                  device_id);
    return;
  }
  const size_t index_buffer_sz{
      prepend_index_buffer ? query_mem_desc.getEntryCount() * sizeof(int64_t) : 0};
  std::vector<int8_t> buff_from_gpu(
      coalesced_size(query_mem_desc, groups_buffer_size, block_buffer_count) +
      index_buffer_sz);
  copy_from_gpu(data_mgr,
                &buff_from_gpu[0],
                group_by_dev_buffers_mem - index_buffer_sz,
                buff_from_gpu.size(),
                device_id);
  auto buff_from_gpu_ptr = &buff_from_gpu[0];
  for (size_t i = 0; i < block_buffer_count; ++i) {
    CHECK_LT(i * block_size_x, group_by_buffers.size());
    memcpy(group_by_buffers[i * block_size_x],
           buff_from_gpu_ptr,
           groups_buffer_size + index_buffer_sz);
    buff_from_gpu_ptr += groups_buffer_size;
  }
}
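To make the coalesced layout concrete, the following standalone sketch (hypothetical sizes, plain std::vector buffers in place of the real host pointers) reproduces the de-coalescing loop above: a single contiguous image holding one group-by buffer per block is copied back so that only the first thread of each block receives a distinct buffer.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

int main() {
  const size_t block_size_x = 4;          // threads per block (hypothetical)
  const size_t block_buffer_count = 2;    // one group-by buffer per block
  const size_t groups_buffer_size = 64;   // bytes per group-by buffer

  // Host-side buffers: one per thread; threads within a block share results,
  // so only every block_size_x-th buffer is actually filled.
  std::vector<std::vector<int8_t>> group_by_buffers(
      block_size_x * block_buffer_count, std::vector<int8_t>(groups_buffer_size));

  // Flat image as it would come back from the device (coalesced_size bytes).
  std::vector<int8_t> buff_from_gpu(block_buffer_count * groups_buffer_size, 0x7f);

  const int8_t* src_ptr = buff_from_gpu.data();
  for (size_t i = 0; i < block_buffer_count; ++i) {
    // Only the first thread of each block owns a distinct buffer.
    std::memcpy(group_by_buffers[i * block_size_x].data(), src_ptr, groups_buffer_size);
    src_ptr += groups_buffer_size;
  }
  return 0;
}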

◆ copy_projection_buffer_from_gpu_columnar()

void copy_projection_buffer_from_gpu_columnar ( Data_Namespace::DataMgr *  data_mgr,
const GpuGroupByBuffers &  gpu_group_by_buffers,
const QueryMemoryDescriptor &  query_mem_desc,
int8_t *  projection_buffer,
const size_t  projection_count,
const int  device_id 
)

For projection queries we only copy back as many elements as necessary, not the whole output buffer. The goal is to be able to build a compact ResultSet, particularly useful for columnar outputs.

NOTE: Saman: we should revisit this function when we have a bump allocator

Definition at line 290 of file GpuMemUtils.cpp.

References align_to_int64(), CHECK, copy_from_gpu(), QueryMemoryDescriptor::didOutputColumnar(), QueryMemoryDescriptor::getColOffInBytes(), QueryMemoryDescriptor::getPaddedSlotWidthBytes(), QueryMemoryDescriptor::getQueryDescriptionType(), QueryMemoryDescriptor::getSlotCount(), Projection, and GpuGroupByBuffers::second.

Referenced by QueryMemoryInitializer::compactProjectionBuffersGpu().

{
  CHECK(query_mem_desc.didOutputColumnar());
  CHECK(query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection);
  constexpr size_t row_index_width = sizeof(int64_t);
  // copy all the row indices back to the host
  copy_from_gpu(data_mgr,
                reinterpret_cast<int64_t*>(projection_buffer),
                gpu_group_by_buffers.second,
                projection_count * row_index_width,
                device_id);
  size_t buffer_offset_cpu{projection_count * row_index_width};
  // other columns are actual non-lazy columns for the projection:
  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
    if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
      const auto column_proj_size =
          projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
      copy_from_gpu(data_mgr,
                    projection_buffer + buffer_offset_cpu,
                    gpu_group_by_buffers.second + query_mem_desc.getColOffInBytes(i),
                    column_proj_size,
                    device_id);
      buffer_offset_cpu += align_to_int64(column_proj_size);
    }
  }
}
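A standalone sketch of the compacted layout produced above, with hypothetical slot widths: projection_count row indices come first, followed by one int64-aligned column segment per non-empty slot, mirroring the buffer_offset_cpu arithmetic in the loop.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative stand-in for align_to_int64: round up to a multiple of 8 bytes.
size_t align_to_int64_sketch(size_t addr) {
  return (addr + 7) & ~size_t(7);
}

int main() {
  const size_t projection_count = 100;                        // matched rows copied back
  const std::vector<int8_t> padded_slot_widths{8, 4, 0, 2};   // 0 => slot is skipped

  size_t offset = projection_count * sizeof(int64_t);  // row-index section
  std::cout << "row indices: [0, " << offset << ")\n";
  for (size_t i = 0; i < padded_slot_widths.size(); ++i) {
    if (padded_slot_widths[i] > 0) {
      const size_t column_bytes = projection_count * padded_slot_widths[i];
      std::cout << "slot " << i << ": [" << offset << ", " << offset + column_bytes << ")\n";
      offset += align_to_int64_sketch(column_bytes);
    }
  }
  std::cout << "total compacted buffer size: " << offset << " bytes\n";
  return 0;
}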

◆ copy_to_gpu()

void copy_to_gpu ( Data_Namespace::DataMgr *  data_mgr,
CUdeviceptr  dst,
const void *  src,
const size_t  num_bytes,
const int  device_id 
)

Definition at line 31 of file GpuMemUtils.cpp.

References CHECK, and Data_Namespace::DataMgr::getCudaMgr().

Referenced by OverlapsJoinHashTable::approximateTupleCount(), BaselineJoinHashTable::fetchColumn(), JoinHashTable::fetchFragments(), ColumnFetcher::getOneTableColumnFragment(), BaselineJoinHashTable::initHashTableForDevice(), JoinHashTable::initHashTableForDevice(), OverlapsJoinHashTable::initHashTableOnGpu(), BaselineJoinHashTable::initHashTableOnGpu(), JoinHashTable::initOneToManyHashTable(), InValuesBitmap::InValuesBitmap(), QueryExecutionContext::launchCpuCode(), QueryExecutionContext::launchGpuCode(), transfer_object_to_gpu(), transfer_pod_vector_to_gpu(), and ColumnFetcher::transferColumnIfNeeded().

{
#ifdef HAVE_CUDA
  if (!data_mgr) {  // only for unit tests
    cuMemcpyHtoD(dst, src, num_bytes);
    return;
  }
#endif  // HAVE_CUDA
  const auto cuda_mgr = data_mgr->getCudaMgr();
  CHECK(cuda_mgr);
  cuda_mgr->copyHostToDevice(reinterpret_cast<int8_t*>(dst),
                             static_cast<const int8_t*>(src),
                             num_bytes,
                             device_id);
}
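A sketch of the unit-test path visible in the #ifdef HAVE_CUDA branch, where a null data_mgr makes copy_to_gpu fall through to cuMemcpyHtoD. The driver-API setup, sizes, and include path below are illustrative assumptions, and error checking is omitted:

#include <cstdint>
#include <vector>
#include <cuda.h>
#include "QueryEngine/GpuMemUtils.h"  // assumed include path

void unit_test_path_sketch() {
  // Minimal driver-API setup (illustrative; production code goes through CudaMgr).
  cuInit(0);
  CUdevice dev;
  cuDeviceGet(&dev, 0);
  CUcontext ctx;
  cuCtxCreate(&ctx, 0, dev);

  std::vector<int32_t> host(256, 42);
  CUdeviceptr dptr;
  cuMemAlloc(&dptr, host.size() * sizeof(int32_t));

  // Null data_mgr -> raw cuMemcpyHtoD path, as in the branch above; the
  // device_id argument is not consulted on this path.
  copy_to_gpu(nullptr, dptr, host.data(), host.size() * sizeof(int32_t), /*device_id=*/0);

  cuMemFree(dptr);
  cuCtxDestroy(ctx);
}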

◆ create_dev_group_by_buffers()

GpuGroupByBuffers create_dev_group_by_buffers ( DeviceAllocator *  device_allocator,
const std::vector< int64_t *> &  group_by_buffers,
const QueryMemoryDescriptor & ,
const unsigned  block_size_x,
const unsigned  grid_size_x,
const int  device_id,
const ExecutorDispatchMode  dispatch_mode,
const int64_t  num_input_rows,
const bool  prepend_index_buffer,
const bool  always_init_group_by_on_host,
const bool  use_bump_allocator,
Allocator *  insitu_allocator
)

Definition at line 61 of file GpuMemUtils.cpp.

References align_to_int64(), Allocator::alloc(), QueryMemoryDescriptor::blocksShareMemory(), CHECK, CHECK_GT, CHECK_LE, anonymous_namespace{GpuMemUtils.cpp}::coalesced_size(), DeviceAllocator::copyToDevice(), g_bump_allocator_step_reduction, g_max_memory_allocation_size, g_min_memory_allocation_size, QueryMemoryDescriptor::getBufferSizeBytes(), QueryMemoryDescriptor::getEntryCount(), QueryMemoryDescriptor::getRowSize(), GPU, groups_buffer_size, logger::INFO, KernelPerFragment, QueryMemoryDescriptor::lazyInitGroups(), LOG, QueryMemoryDescriptor::threadsShareMemory(), to_string(), and logger::WARNING.

Referenced by QueryMemoryInitializer::allocateCountDistinctSet(), and ResultSet::radixSortOnGpu().

{
  if (group_by_buffers.empty() && !insitu_allocator) {
    return {0, 0, 0};
  }
  CHECK(cuda_allocator);

  size_t groups_buffer_size{0};
  CUdeviceptr group_by_dev_buffers_mem{0};
  size_t mem_size{0};
  size_t entry_count{0};

  if (use_bump_allocator) {
    CHECK(!prepend_index_buffer);
    CHECK(!insitu_allocator);

    if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
      // Allocate an output buffer equal to the size of the number of rows in the
      // fragment. The kernel per fragment path is only used for projections with lazy
      // fetched outputs. Therefore, the resulting output buffer should be relatively
      // narrow compared to the width of an input row, offsetting the larger allocation.

      CHECK_GT(num_input_rows, int64_t(0));
      entry_count = num_input_rows;
      groups_buffer_size =
          query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
      mem_size = coalesced_size(query_mem_desc,
                                groups_buffer_size,
                                query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
      // TODO(adb): render allocator support
      group_by_dev_buffers_mem =
          reinterpret_cast<CUdeviceptr>(cuda_allocator->alloc(mem_size));
    } else {
      // Attempt to allocate increasingly small buffers until we have less than 256B of
      // memory remaining on the device. This may have the side effect of evicting
      // memory allocated for previous queries. However, at current maximum slab sizes
      // (2GB) we expect these effects to be minimal.
      size_t max_memory_size{g_max_memory_allocation_size};
      while (true) {
        entry_count = max_memory_size / query_mem_desc.getRowSize();
        groups_buffer_size =
            query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);

        try {
          mem_size = coalesced_size(query_mem_desc,
                                    groups_buffer_size,
                                    query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
          CHECK_LE(entry_count, std::numeric_limits<uint32_t>::max());

          // TODO(adb): render allocator support
          group_by_dev_buffers_mem =
              reinterpret_cast<CUdeviceptr>(cuda_allocator->alloc(mem_size));
        } catch (const OutOfMemory& e) {
          LOG(WARNING) << e.what();
          max_memory_size = max_memory_size * g_bump_allocator_step_reduction;
          if (max_memory_size < g_min_memory_allocation_size) {
            throw;
          }

          LOG(WARNING) << "Ran out of memory for projection query output. Retrying with "
                       << std::to_string(max_memory_size) << " bytes";

          continue;
        }
        break;
      }
    }
    LOG(INFO) << "Projection query allocation succeeded with " << groups_buffer_size
              << " bytes allocated (max entry count " << entry_count << ")";
  } else {
    entry_count = query_mem_desc.getEntryCount();
    CHECK_GT(entry_count, size_t(0));
    groups_buffer_size =
        query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
    mem_size = coalesced_size(query_mem_desc,
                              groups_buffer_size,
                              query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
    const size_t prepended_buff_size{
        prepend_index_buffer ? align_to_int64(entry_count * sizeof(int32_t)) : 0};

    int8_t* group_by_dev_buffers_allocation{nullptr};
    if (insitu_allocator) {
      group_by_dev_buffers_allocation =
          insitu_allocator->alloc(mem_size + prepended_buff_size);
    } else {
      group_by_dev_buffers_allocation =
          cuda_allocator->alloc(mem_size + prepended_buff_size);
    }
    CHECK(group_by_dev_buffers_allocation);

    group_by_dev_buffers_mem =
        reinterpret_cast<CUdeviceptr>(group_by_dev_buffers_allocation) +
        prepended_buff_size;
  }
  CHECK_GT(groups_buffer_size, size_t(0));
  CHECK(group_by_dev_buffers_mem);

  CHECK(query_mem_desc.threadsShareMemory());
  const size_t step{block_size_x};

  if (!insitu_allocator && (always_init_group_by_on_host ||
                            !query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU))) {
    std::vector<int8_t> buff_to_gpu(mem_size);
    auto buff_to_gpu_ptr = buff_to_gpu.data();

    for (size_t i = 0; i < group_by_buffers.size(); i += step) {
      memcpy(buff_to_gpu_ptr, group_by_buffers[i], groups_buffer_size);
      buff_to_gpu_ptr += groups_buffer_size;
    }
    cuda_allocator->copyToDevice(reinterpret_cast<int8_t*>(group_by_dev_buffers_mem),
                                 buff_to_gpu.data(),
                                 buff_to_gpu.size());
  }

  auto group_by_dev_buffer = group_by_dev_buffers_mem;

  const size_t num_ptrs{block_size_x * grid_size_x};

  std::vector<CUdeviceptr> group_by_dev_buffers(num_ptrs);

  for (size_t i = 0; i < num_ptrs; i += step) {
    for (size_t j = 0; j < step; ++j) {
      group_by_dev_buffers[i + j] = group_by_dev_buffer;
    }
    if (!query_mem_desc.blocksShareMemory()) {
      group_by_dev_buffer += groups_buffer_size;
    }
  }

  auto group_by_dev_ptr = cuda_allocator->alloc(num_ptrs * sizeof(CUdeviceptr));
  cuda_allocator->copyToDevice(group_by_dev_ptr,
                               reinterpret_cast<int8_t*>(group_by_dev_buffers.data()),
                               num_ptrs * sizeof(CUdeviceptr));

  return {reinterpret_cast<CUdeviceptr>(group_by_dev_ptr),
          group_by_dev_buffers_mem,
          entry_count};
}
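The retry loop in the bump-allocator branch can be illustrated with a standalone sketch; the allocator, thresholds, and step factor below are stand-ins for DeviceAllocator::alloc, g_max_memory_allocation_size, g_min_memory_allocation_size, and g_bump_allocator_step_reduction.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

// Stand-in allocator: pretends anything above 1 MB exceeds free device memory.
int8_t* try_alloc(std::vector<int8_t>& arena, size_t bytes) {
  if (bytes > 1024 * 1024) {
    throw std::runtime_error("OutOfMemory: " + std::to_string(bytes) + " bytes");
  }
  arena.resize(bytes);
  return arena.data();
}

int main() {
  std::vector<int8_t> arena;
  size_t max_bytes = 8 * 1024 * 1024;   // stand-in for g_max_memory_allocation_size
  const size_t min_bytes = 64 * 1024;   // stand-in for g_min_memory_allocation_size
  const double step_reduction = 0.5;    // stand-in for g_bump_allocator_step_reduction

  int8_t* buf = nullptr;
  while (true) {
    try {
      buf = try_alloc(arena, max_bytes);
      break;
    } catch (const std::runtime_error& e) {
      std::cout << e.what() << "\n";
      max_bytes = static_cast<size_t>(max_bytes * step_reduction);
      if (max_bytes < min_bytes) {
        throw;  // below the minimum acceptable allocation: propagate the failure
      }
      std::cout << "Retrying with " << max_bytes << " bytes\n";
    }
  }
  std::cout << "Allocated " << max_bytes << " bytes at " << static_cast<const void*>(buf)
            << "\n";
  return 0;
}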

◆ get_num_allocated_rows_from_gpu()

size_t get_num_allocated_rows_from_gpu ( Data_Namespace::DataMgr *  data_mgr,
CUdeviceptr  projection_size_gpu,
const int  device_id 
)

Returns the total number of allocated rows per device (i.e., the number of matched elements in projections).

TODO(Saman): revisit this for bump allocators

Definition at line 274 of file GpuMemUtils.cpp.

References CHECK, copy_from_gpu(), and num_rows.

Referenced by QueryExecutionContext::launchGpuCode().

{
  int32_t num_rows{0};
  copy_from_gpu(data_mgr, &num_rows, projection_size_gpu, sizeof(num_rows), device_id);
  CHECK(num_rows >= 0);
  return static_cast<size_t>(num_rows);
}