OmniSciDB  21ac014ffc
GpuMemUtils.cpp
/*
 * Copyright 2017 MapD Technologies, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "GpuMemUtils.h"
#include "GpuInitGroups.h"
#include "Logger/Logger.h"
#include "StreamingTopN.h"

#include "../CudaMgr/CudaMgr.h"
#include "GroupByAndAggregate.h"

extern size_t g_max_memory_allocation_size;
extern size_t g_min_memory_allocation_size;
extern double g_bump_allocator_step_reduction;
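
// Copies num_bytes from a host buffer to device memory. The transfer normally
// goes through the CudaMgr owned by DataMgr; when no DataMgr is supplied (unit
// tests only) it falls back to a raw cuMemcpyHtoD.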
void copy_to_gpu(Data_Namespace::DataMgr* data_mgr,
                 CUdeviceptr dst,
                 const void* src,
                 const size_t num_bytes,
                 const int device_id) {
#ifdef HAVE_CUDA
  if (!data_mgr) {  // only for unit tests
    cuMemcpyHtoD(dst, src, num_bytes);
    return;
  }
#endif  // HAVE_CUDA
  const auto cuda_mgr = data_mgr->getCudaMgr();
  CHECK(cuda_mgr);
  cuda_mgr->copyHostToDevice(reinterpret_cast<int8_t*>(dst),
                             static_cast<const int8_t*>(src),
                             num_bytes,
                             device_id);
}

namespace {

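// Total device allocation needed for the group-by output: one buffer per block
// when blocks do not share memory (callers pass grid_size_x), or a single
// buffer when they do (callers pass 1).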
size_t coalesced_size(const QueryMemoryDescriptor& query_mem_desc,
                      const size_t group_by_one_buffer_size,
                      const unsigned grid_size_x) {
  CHECK(query_mem_desc.threadsShareMemory());
  return grid_size_x * group_by_one_buffer_size;
}

}  // namespace

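// Allocates the device-side group-by output buffers for a kernel launch and
// returns the pointer table, the coalesced output buffer, the entry count, and
// the optional varlen output buffer.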
GpuGroupByBuffers create_dev_group_by_buffers(
    DeviceAllocator* cuda_allocator,
    const std::vector<int64_t*>& group_by_buffers,
    const QueryMemoryDescriptor& query_mem_desc,
    const unsigned block_size_x,
    const unsigned grid_size_x,
    const int device_id,
    const ExecutorDispatchMode dispatch_mode,
    const int64_t num_input_rows,
    const bool prepend_index_buffer,
    const bool always_init_group_by_on_host,
    const bool use_bump_allocator,
    const bool has_varlen_output,
    Allocator* insitu_allocator) {
  if (group_by_buffers.empty() && !insitu_allocator) {
    return {0, 0, 0, 0};
  }
  CHECK(cuda_allocator);

  size_t groups_buffer_size{0};
  CUdeviceptr group_by_dev_buffers_mem{0};
  size_t mem_size{0};
  size_t entry_count{0};

  if (use_bump_allocator) {
    CHECK(!prepend_index_buffer);
    CHECK(!insitu_allocator);

    if (dispatch_mode == ExecutorDispatchMode::KernelPerFragment) {
      // Allocate an output buffer equal to the size of the number of rows in the
      // fragment. The kernel per fragment path is only used for projections with lazy
      // fetched outputs. Therefore, the resulting output buffer should be relatively
      // narrow compared to the width of an input row, offsetting the larger allocation.

      CHECK_GT(num_input_rows, int64_t(0));
      entry_count = num_input_rows;
      groups_buffer_size =
          query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
      mem_size = coalesced_size(query_mem_desc,
                                groups_buffer_size,
                                query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
      // TODO(adb): render allocator support
      group_by_dev_buffers_mem =
          reinterpret_cast<CUdeviceptr>(cuda_allocator->alloc(mem_size));
    } else {
      // Attempt to allocate increasingly small buffers until we have less than 256B of
      // memory remaining on the device. This may have the side effect of evicting
      // memory allocated for previous queries. However, at current maximum slab sizes
      // (2GB) we expect these effects to be minimal.
      size_t max_memory_size{g_max_memory_allocation_size};
      while (true) {
        entry_count = max_memory_size / query_mem_desc.getRowSize();
        groups_buffer_size =
            query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);

        try {
          mem_size = coalesced_size(query_mem_desc,
                                    groups_buffer_size,
                                    query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
          CHECK_LE(entry_count, std::numeric_limits<uint32_t>::max());

          // TODO(adb): render allocator support
          group_by_dev_buffers_mem =
              reinterpret_cast<CUdeviceptr>(cuda_allocator->alloc(mem_size));
        } catch (const OutOfMemory& e) {
          LOG(WARNING) << e.what();
          max_memory_size = max_memory_size * g_bump_allocator_step_reduction;
          if (max_memory_size < g_min_memory_allocation_size) {
            throw;
          }

          LOG(WARNING) << "Ran out of memory for projection query output. Retrying with "
                       << std::to_string(max_memory_size) << " bytes";

          continue;
        }
        break;
      }
    }
    LOG(INFO) << "Projection query allocation succeeded with " << groups_buffer_size
              << " bytes allocated (max entry count " << entry_count << ")";
  } else {
    entry_count = query_mem_desc.getEntryCount();
    CHECK_GT(entry_count, size_t(0));
    groups_buffer_size =
        query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU, entry_count);
    mem_size = coalesced_size(query_mem_desc,
                              groups_buffer_size,
                              query_mem_desc.blocksShareMemory() ? 1 : grid_size_x);
    const size_t prepended_buff_size{
        prepend_index_buffer ? align_to_int64(entry_count * sizeof(int32_t)) : 0};

    int8_t* group_by_dev_buffers_allocation{nullptr};
    if (insitu_allocator) {
      group_by_dev_buffers_allocation =
          insitu_allocator->alloc(mem_size + prepended_buff_size);
    } else {
      group_by_dev_buffers_allocation =
          cuda_allocator->alloc(mem_size + prepended_buff_size);
    }
    CHECK(group_by_dev_buffers_allocation);

    group_by_dev_buffers_mem =
        reinterpret_cast<CUdeviceptr>(group_by_dev_buffers_allocation) +
        prepended_buff_size;
  }
  CHECK_GT(groups_buffer_size, size_t(0));
  CHECK(group_by_dev_buffers_mem);

  CHECK(query_mem_desc.threadsShareMemory());
  const size_t step{block_size_x};

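  // Unless group-by buffers are lazily initialized on the device, pack the
  // host-initialized buffers (one per block) contiguously and copy them to the
  // device allocation in a single transfer.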
  if (!insitu_allocator && (always_init_group_by_on_host ||
                            !query_mem_desc.lazyInitGroups(ExecutorDeviceType::GPU))) {
    std::vector<int8_t> buff_to_gpu(mem_size);
    auto buff_to_gpu_ptr = buff_to_gpu.data();

    const size_t start = has_varlen_output ? 1 : 0;
    for (size_t i = start; i < group_by_buffers.size(); i += step) {
      memcpy(buff_to_gpu_ptr, group_by_buffers[i], groups_buffer_size);
      buff_to_gpu_ptr += groups_buffer_size;
    }
    cuda_allocator->copyToDevice(reinterpret_cast<int8_t*>(group_by_dev_buffers_mem),
                                 buff_to_gpu.data(),
                                 buff_to_gpu.size());
  }

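  // Build a host-side table with one buffer pointer per GPU thread. Threads in
  // the same block share a buffer (step == block_size_x), and all blocks share
  // the same buffer when blocksShareMemory() is true.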
  auto group_by_dev_buffer = group_by_dev_buffers_mem;

  const size_t num_ptrs =
      (block_size_x * grid_size_x) + (has_varlen_output ? size_t(1) : size_t(0));

  std::vector<CUdeviceptr> group_by_dev_buffers(num_ptrs);

  const size_t start_index = has_varlen_output ? 1 : 0;
  for (size_t i = start_index; i < num_ptrs; i += step) {
    for (size_t j = 0; j < step; ++j) {
      group_by_dev_buffers[i + j] = group_by_dev_buffer;
    }
    if (!query_mem_desc.blocksShareMemory()) {
      group_by_dev_buffer += groups_buffer_size;
    }
  }

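  // When the query produces variable-length output, a dedicated device buffer
  // is allocated for it and its pointer is stored in slot 0 of the pointer table.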
  CUdeviceptr varlen_output_buffer{0};
  if (has_varlen_output) {
    const auto varlen_buffer_elem_size_opt = query_mem_desc.varlenOutputBufferElemSize();
    CHECK(varlen_buffer_elem_size_opt);  // TODO(adb): relax

    group_by_dev_buffers[0] = reinterpret_cast<CUdeviceptr>(cuda_allocator->alloc(
        query_mem_desc.getEntryCount() * varlen_buffer_elem_size_opt.value()));
    varlen_output_buffer = group_by_dev_buffers[0];
  }

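  // Copy the pointer table itself to the device so each kernel thread can look
  // up its output buffer by thread index.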
  auto group_by_dev_ptr = cuda_allocator->alloc(num_ptrs * sizeof(CUdeviceptr));
  cuda_allocator->copyToDevice(group_by_dev_ptr,
                               reinterpret_cast<int8_t*>(group_by_dev_buffers.data()),
                               num_ptrs * sizeof(CUdeviceptr));

  return {reinterpret_cast<CUdeviceptr>(group_by_dev_ptr),
          group_by_dev_buffers_mem,
          entry_count,
          varlen_output_buffer};
}

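// Copies num_bytes from device memory back to a host buffer through the CudaMgr
// owned by DataMgr.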
void copy_from_gpu(Data_Namespace::DataMgr* data_mgr,
                   void* dst,
                   const CUdeviceptr src,
                   const size_t num_bytes,
                   const int device_id) {
  const auto cuda_mgr = data_mgr->getCudaMgr();
  CHECK(cuda_mgr);
  cuda_mgr->copyDeviceToHost(static_cast<int8_t*>(dst),
                             reinterpret_cast<const int8_t*>(src),
                             num_bytes,
                             device_id);
}

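// Copies the device-side group-by buffers back into the per-block host buffers.
// When there is a single block buffer and no prepended index buffer, a single
// transfer suffices; otherwise the coalesced allocation is read back and
// scattered on the host.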
void copy_group_by_buffers_from_gpu(Data_Namespace::DataMgr* data_mgr,
                                    const std::vector<int64_t*>& group_by_buffers,
                                    const size_t groups_buffer_size,
                                    const CUdeviceptr group_by_dev_buffers_mem,
                                    const QueryMemoryDescriptor& query_mem_desc,
                                    const unsigned block_size_x,
                                    const unsigned grid_size_x,
                                    const int device_id,
                                    const bool prepend_index_buffer,
                                    const bool has_varlen_output) {
  if (group_by_buffers.empty()) {
    return;
  }
  const size_t first_group_buffer_idx = has_varlen_output ? 1 : 0;

  const unsigned block_buffer_count{query_mem_desc.blocksShareMemory() ? 1 : grid_size_x};
  if (block_buffer_count == 1 && !prepend_index_buffer) {
    CHECK_EQ(coalesced_size(query_mem_desc, groups_buffer_size, block_buffer_count),
             groups_buffer_size);
    copy_from_gpu(data_mgr,
                  group_by_buffers[first_group_buffer_idx],
                  group_by_dev_buffers_mem,
                  groups_buffer_size,
                  device_id);
    return;
  }
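  // Slow path: read back the whole coalesced allocation (plus the optional
  // prepended index buffer) in one transfer, then scatter one slice per block
  // into the corresponding host buffer.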
  const size_t index_buffer_sz{
      prepend_index_buffer ? query_mem_desc.getEntryCount() * sizeof(int64_t) : 0};
  std::vector<int8_t> buff_from_gpu(
      coalesced_size(query_mem_desc, groups_buffer_size, block_buffer_count) +
      index_buffer_sz);
  copy_from_gpu(data_mgr,
                &buff_from_gpu[0],
                group_by_dev_buffers_mem - index_buffer_sz,
                buff_from_gpu.size(),
                device_id);
  auto buff_from_gpu_ptr = &buff_from_gpu[0];
  for (size_t i = 0; i < block_buffer_count; ++i) {
    const size_t buffer_idx = (i * block_size_x) + first_group_buffer_idx;
    CHECK_LT(buffer_idx, group_by_buffers.size());
    memcpy(group_by_buffers[buffer_idx],
           buff_from_gpu_ptr,
           groups_buffer_size + index_buffer_sz);
    buff_from_gpu_ptr += groups_buffer_size;
  }
}

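// Reads back the device-side counter holding the number of rows a projection
// kernel actually produced.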
size_t get_num_allocated_rows_from_gpu(Data_Namespace::DataMgr* data_mgr,
                                       CUdeviceptr projection_size_gpu,
                                       const int device_id) {
  int32_t num_rows{0};
  copy_from_gpu(data_mgr, &num_rows, projection_size_gpu, sizeof(num_rows), device_id);
  CHECK(num_rows >= 0);
  return static_cast<size_t>(num_rows);
}

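// Copies a columnar projection result back to the host: first the row indices,
// then each non-empty slot, copying only projection_count entries per column
// rather than the full output buffer.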
void copy_projection_buffer_from_gpu_columnar(
    Data_Namespace::DataMgr* data_mgr,
    const GpuGroupByBuffers& gpu_group_by_buffers,
    const QueryMemoryDescriptor& query_mem_desc,
    int8_t* projection_buffer,
    const size_t projection_count,
    const int device_id) {
  CHECK(query_mem_desc.didOutputColumnar());
  CHECK(query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection);
  constexpr size_t row_index_width = sizeof(int64_t);
  // copy all the row indices back to the host
  copy_from_gpu(data_mgr,
                reinterpret_cast<int64_t*>(projection_buffer),
                gpu_group_by_buffers.data,
                projection_count * row_index_width,
                device_id);
  size_t buffer_offset_cpu{projection_count * row_index_width};
  // other columns are actual non-lazy columns for the projection:
  for (size_t i = 0; i < query_mem_desc.getSlotCount(); i++) {
    if (query_mem_desc.getPaddedSlotWidthBytes(i) > 0) {
      const auto column_proj_size =
          projection_count * query_mem_desc.getPaddedSlotWidthBytes(i);
      copy_from_gpu(data_mgr,
                    projection_buffer + buffer_offset_cpu,
                    gpu_group_by_buffers.data + query_mem_desc.getColOffInBytes(i),
                    column_proj_size,
                    device_id);
      buffer_offset_cpu += align_to_int64(column_proj_size);
    }
  }
}