OmniSciDB  8a228a1076
CudaMgr.cpp
/*
 * Copyright 2018 OmniSci, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "CudaMgr/CudaMgr.h"

#include <algorithm>
#include <boost/stacktrace.hpp>
#include <cassert>
#include <iostream>
#include <stdexcept>

#include "Logger/Logger.h"

namespace CudaMgr_Namespace {

CudaErrorException::CudaErrorException(CUresult status)
    : std::runtime_error(errorMessage(status)), status_(status) {
  // A "CUDA already de-initialized" error can occur during system shutdown. Avoid
  // calling the logger in that case to prevent failing during a standard teardown.
  if (status != CUDA_ERROR_DEINITIALIZED) {
    VLOG(1) << errorMessage(status);
    VLOG(1) << boost::stacktrace::stacktrace();
  }
}

std::string errorMessage(CUresult const status) {
  const char* errorString{nullptr};
  cuGetErrorString(status, &errorString);
  return errorString
             ? "CUDA Error (" + std::to_string(status) + "): " + std::string(errorString)
             : "CUDA Driver API error code " + std::to_string(status);
}

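// A minimal sketch of how these error helpers compose (hypothetical caller code;
// `cuda_mgr`, `dev_buf`, `host_buf`, and `n` are illustrative only):
//
//   try {
//     cuda_mgr.copyHostToDevice(dev_buf, host_buf, n, /*device_num=*/0);
//   } catch (const CudaErrorException& e) {
//     // e.what() carries the human-readable string built by errorMessage().
//     if (e.getStatus() != CUDA_ERROR_DEINITIALIZED) {
//       LOG(ERROR) << e.what();
//     }
//   }
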
CudaMgr::CudaMgr(const int num_gpus, const int start_gpu)
    : start_gpu_(start_gpu)
    , min_shared_memory_per_block_for_all_devices(0)
    , min_num_mps_for_all_devices(0) {
  checkError(cuInit(0));
  checkError(cuDeviceGetCount(&device_count_));

  if (num_gpus > 0) {  // num_gpus <= 0 means use all GPUs found
    CHECK_LE(num_gpus + start_gpu_, device_count_);
    device_count_ = std::min(device_count_, num_gpus);
  } else {
    // if we are using all GPUs we cannot start on a GPU other than 0
    CHECK_EQ(start_gpu_, 0);
  }
  fillDeviceProperties();
  initDeviceGroup();
  createDeviceContexts();
  printDeviceProperties();

  // warm up the GPU JIT
  LOG(INFO) << "Warming up the GPU JIT Compiler... (this may take several seconds)";
  setContext(0);
  nvidia_jit_warmup();
  LOG(INFO) << "GPU JIT Compiler initialized.";
}

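// A minimal construction sketch (illustrative only): manage the first two physical
// GPUs. num_gpus counts managed devices and start_gpu is the physical index of the
// first one, so logical device 0 maps to physical device start_gpu.
//
//   CudaMgr_Namespace::CudaMgr cuda_mgr(/*num_gpus=*/2, /*start_gpu=*/0);
//   cuda_mgr.synchronizeDevices();
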
void CudaMgr::initDeviceGroup() {
  for (int device_id = 0; device_id < device_count_; device_id++) {
    device_group_.push_back(
        {device_id, device_id + start_gpu_, device_properties_[device_id].uuid});
  }
}

CudaMgr::~CudaMgr() {
  try {
    // We don't want to remove the CudaMgr before all other processes have cleaned up.
    // This should be enforced by the lifetime policies, but take this lock to be safe.
    std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);

    synchronizeDevices();
    for (int d = 0; d < device_count_; ++d) {
      checkError(cuCtxDestroy(device_contexts_[d]));
    }
  } catch (const CudaErrorException& e) {
    if (e.getStatus() == CUDA_ERROR_DEINITIALIZED) {
      // TODO(adb / asuhan): Verify cuModuleUnload removes the context
      return;
    }
    LOG(ERROR) << "CUDA Error: " << e.what();
  } catch (const std::runtime_error& e) {
    LOG(ERROR) << "CUDA Error: " << e.what();
  }
}

void CudaMgr::synchronizeDevices() const {
  for (int d = 0; d < device_count_; ++d) {
    setContext(d);
    checkError(cuCtxSynchronize());
  }
}

void CudaMgr::copyHostToDevice(int8_t* device_ptr,
                               const int8_t* host_ptr,
                               const size_t num_bytes,
                               const int device_num) {
  setContext(device_num);
  checkError(
      cuMemcpyHtoD(reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes));
}

void CudaMgr::copyDeviceToHost(int8_t* host_ptr,
                               const int8_t* device_ptr,
                               const size_t num_bytes,
                               const int device_num) {
  setContext(device_num);
  checkError(
      cuMemcpyDtoH(host_ptr, reinterpret_cast<const CUdeviceptr>(device_ptr), num_bytes));
}

void CudaMgr::copyDeviceToDevice(int8_t* dest_ptr,
                                 int8_t* src_ptr,
                                 const size_t num_bytes,
                                 const int dest_device_num,
                                 const int src_device_num) {
  // dest_device_num and src_device_num are the device numbers relative to start_gpu_
  // (real_device_num - start_gpu_)
  if (src_device_num == dest_device_num) {
    setContext(src_device_num);
    checkError(cuMemcpy(reinterpret_cast<CUdeviceptr>(dest_ptr),
                        reinterpret_cast<CUdeviceptr>(src_ptr),
                        num_bytes));
  } else {
    checkError(cuMemcpyPeer(reinterpret_cast<CUdeviceptr>(dest_ptr),
                            device_contexts_[dest_device_num],
                            reinterpret_cast<CUdeviceptr>(src_ptr),
                            device_contexts_[src_device_num],
                            num_bytes));  // will we always have peer access?
  }
}

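// Usage sketch for the copy paths (hypothetical buffers and sizes; device numbers
// are logical, i.e. relative to start_gpu_). The cross-device case goes through
// cuMemcpyPeer with the two contexts managed above:
//
//   int8_t* src = cuda_mgr.allocateDeviceMem(n, /*device_num=*/0);
//   int8_t* dst = cuda_mgr.allocateDeviceMem(n, /*device_num=*/1);
//   cuda_mgr.copyHostToDevice(src, host_buf, n, /*device_num=*/0);
//   cuda_mgr.copyDeviceToDevice(dst, src, n, /*dest=*/1, /*src=*/0);  // peer path
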
void CudaMgr::loadGpuModuleData(CUmodule* module,
                                const void* image,
                                unsigned int num_options,
                                CUjit_option* options,
                                void** option_vals,
                                const int device_id) const {
  setContext(device_id);
  checkError(cuModuleLoadDataEx(module, image, num_options, options, option_vals));
}

void CudaMgr::unloadGpuModuleData(CUmodule* module, const int device_id) const {
  std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);
  CHECK(module);

  setContext(device_id);
  try {
    auto code = cuModuleUnload(*module);
    // If the CUDA driver has already shut down, ignore the resulting errors.
    if (code != CUDA_ERROR_DEINITIALIZED) {
      checkError(code);
    }
  } catch (const std::runtime_error& e) {
    LOG(ERROR) << "CUDA Error: " << e.what();
  }
}

CudaMgr::CudaMemoryUsage CudaMgr::getCudaMemoryUsage() {
  CudaMemoryUsage usage;
  cuMemGetInfo(&usage.free, &usage.total);
  return usage;
}

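// Note: cuMemGetInfo reports free/total memory for the device of the *current*
// context, so callers that care about a specific GPU should call setContext()
// first. A hypothetical logging sketch:
//
//   cuda_mgr.setContext(0);
//   auto usage = cuda_mgr.getCudaMemoryUsage();
//   VLOG(1) << "GPU 0: " << usage.free << " of " << usage.total << " bytes free";
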
void CudaMgr::fillDeviceProperties() {
  device_properties_.resize(device_count_);
  cuDriverGetVersion(&gpu_driver_version_);
  for (int device_num = 0; device_num < device_count_; ++device_num) {
    checkError(
        cuDeviceGet(&device_properties_[device_num].device, device_num + start_gpu_));
    CUuuid cuda_uuid;
    checkError(cuDeviceGetUuid(&cuda_uuid, device_properties_[device_num].device));
    device_properties_[device_num].uuid = omnisci::UUID(cuda_uuid.bytes);
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].computeMajor,
                                    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].computeMinor,
                                    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                                    device_properties_[device_num].device));
    checkError(cuDeviceTotalMem(&device_properties_[device_num].globalMem,
                                device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].constantMem,
                                    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
                                    device_properties_[device_num].device));
    checkError(
        cuDeviceGetAttribute(&device_properties_[device_num].sharedMemPerMP,
                             CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
                             device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].sharedMemPerBlock,
                                    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].numMPs,
                                    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].warpSize,
                                    CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxThreadsPerBlock,
                                    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxRegistersPerBlock,
                                    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxRegistersPerMP,
                                    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].pciBusId,
                                    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].pciDeviceId,
                                    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].clockKhz,
                                    CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].memoryClockKhz,
                                    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].memoryBusWidth,
                                    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
                                    device_properties_[device_num].device));
    device_properties_[device_num].memoryBandwidthGBs =
        device_properties_[device_num].memoryClockKhz / 1000000.0 / 8.0 *
        device_properties_[device_num].memoryBusWidth;
  }
  min_shared_memory_per_block_for_all_devices =
      computeMinSharedMemoryPerBlockForAllDevices();
  min_num_mps_for_all_devices = computeMinNumMPsForAllDevices();
}

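// The memoryBandwidthGBs formula above: memoryClockKhz * 1000 gives transfers/sec,
// multiplying by the bus width in bits / 8 gives bytes/sec, and dividing by 1e9
// yields GB/sec; the combined factor is memoryClockKhz / 1,000,000 / 8 *
// memoryBusWidth. Worked example (illustrative numbers): a 384-bit bus with
// memoryClockKhz = 7,000,000 gives 7,000,000 / 1e6 / 8 * 384 = 336 GB/sec.
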
int8_t* CudaMgr::allocatePinnedHostMem(const size_t num_bytes) {
  setContext(0);
  void* host_ptr;
  checkError(cuMemHostAlloc(&host_ptr, num_bytes, CU_MEMHOSTALLOC_PORTABLE));
  return reinterpret_cast<int8_t*>(host_ptr);
}

int8_t* CudaMgr::allocateDeviceMem(const size_t num_bytes, const int device_num) {
  setContext(device_num);
  CUdeviceptr device_ptr;
  checkError(cuMemAlloc(&device_ptr, num_bytes));
  return reinterpret_cast<int8_t*>(device_ptr);
}

void CudaMgr::freePinnedHostMem(int8_t* host_ptr) {
  checkError(cuMemFreeHost(reinterpret_cast<void*>(host_ptr)));
}

void CudaMgr::freeDeviceMem(int8_t* device_ptr) {
  std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);

  checkError(cuMemFree(reinterpret_cast<CUdeviceptr>(device_ptr)));
}

void CudaMgr::zeroDeviceMem(int8_t* device_ptr,
                            const size_t num_bytes,
                            const int device_num) {
  setDeviceMem(device_ptr, 0, num_bytes, device_num);
}

void CudaMgr::setDeviceMem(int8_t* device_ptr,
                           const unsigned char uc,
                           const size_t num_bytes,
                           const int device_num) {
  setContext(device_num);
  checkError(cuMemsetD8(reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes));
}

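// Allocation lifecycle sketch (hypothetical sizes; logical device numbers):
//
//   int8_t* dev_buf = cuda_mgr.allocateDeviceMem(1024, /*device_num=*/0);
//   cuda_mgr.zeroDeviceMem(dev_buf, 1024, 0);  // forwards to setDeviceMem(..., 0, ...)
//   cuda_mgr.freeDeviceMem(dev_buf);
//
//   int8_t* pinned = cuda_mgr.allocatePinnedHostMem(1024);  // page-locked, portable
//   cuda_mgr.freePinnedHostMem(pinned);
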
/**
 * Returns true if all devices have the Maxwell micro-architecture, or later.
 * Returns false if any device has a compute capability of < 5.0.
 */
bool CudaMgr::isArchMaxwellOrLaterForAll() const {
  for (int i = 0; i < device_count_; i++) {
    if (device_properties_[i].computeMajor < 5) {
      return false;
    }
  }
  return true;
}

/**
 * Returns true if all devices have the Volta micro-architecture, or later.
 * Returns false if any device has a compute capability of < 7.0.
 */
bool CudaMgr::isArchVoltaOrGreaterForAll() const {
  for (int i = 0; i < device_count_; i++) {
    if (device_properties_[i].computeMajor < 7) {
      return false;
    }
  }
  return true;
}

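// These checks let callers gate architecture-dependent code paths on the weakest
// managed device, e.g. (hypothetical caller):
//
//   if (cuda_mgr.isArchVoltaOrGreaterForAll()) {
//     // safe to rely on compute capability >= 7.0 features on every GPU
//   }
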
/**
 * Computes the minimum shared memory per block available across all managed GPU
 * devices, so one kernel configuration can be used on every device.
 */
size_t CudaMgr::computeMinSharedMemoryPerBlockForAllDevices() const {
  int shared_mem_size =
      device_count_ > 0 ? device_properties_.front().sharedMemPerBlock : 0;
  for (int d = 1; d < device_count_; d++) {
    shared_mem_size = std::min(shared_mem_size, device_properties_[d].sharedMemPerBlock);
  }
  return shared_mem_size;
}

/**
 * Computes the minimum number of multiprocessors (MPs) available across all managed
 * GPU devices.
 */
size_t CudaMgr::computeMinNumMPsForAllDevices() const {
  int num_mps = device_count_ > 0 ? device_properties_.front().numMPs : 0;
  for (int d = 1; d < device_count_; d++) {
    num_mps = std::min(num_mps, device_properties_[d].numMPs);
  }
  return num_mps;
}

void CudaMgr::createDeviceContexts() {
  CHECK_EQ(device_contexts_.size(), size_t(0));
  device_contexts_.resize(device_count_);
  for (int d = 0; d < device_count_; ++d) {
    CUresult status = cuCtxCreate(&device_contexts_[d], 0, device_properties_[d].device);
    if (status != CUDA_SUCCESS) {
      // Context creation failed part-way through; destroy all contexts created up
      // to this point before propagating the error.
      for (int destroy_id = 0; destroy_id <= d; ++destroy_id) {
        try {
          checkError(cuCtxDestroy(device_contexts_[destroy_id]));
        } catch (const CudaErrorException& e) {
          LOG(ERROR) << "Failed to destroy CUDA context for device ID " << destroy_id
                     << " with " << e.what()
                     << ". CUDA contexts were being destroyed due to an error creating "
                        "CUDA context for device ID "
                     << d << " out of " << device_count_ << " (" << errorMessage(status)
                     << ").";
        }
      }
      // checkError will translate the message and throw
      checkError(status);
    }
  }
}

void CudaMgr::setContext(const int device_num) const {
  // device_num is the device number relative to start_gpu_ (real_device_num - start_gpu_)
  CHECK_LT(device_num, device_count_);
  cuCtxSetCurrent(device_contexts_[device_num]);
}

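// Note: cuCtxSetCurrent binds the context to the *calling thread*, so worker
// threads must call setContext() themselves before issuing driver API calls; a
// context made current on one thread is not visible to another.
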
void CudaMgr::printDeviceProperties() const {
  LOG(INFO) << "Using " << device_count_ << " GPUs.";
  for (int d = 0; d < device_count_; ++d) {
    VLOG(1) << "Device: " << device_properties_[d].device;
    VLOG(1) << "UUID: " << device_properties_[d].uuid;
    VLOG(1) << "Clock (kHz): " << device_properties_[d].clockKhz;
    VLOG(1) << "Compute Major: " << device_properties_[d].computeMajor;
    VLOG(1) << "Compute Minor: " << device_properties_[d].computeMinor;
    VLOG(1) << "PCI bus id: " << device_properties_[d].pciBusId;
    VLOG(1) << "PCI device id: " << device_properties_[d].pciDeviceId;
    VLOG(1) << "Per device global memory: "
            << device_properties_[d].globalMem / 1073741824.0 << " GB";
    VLOG(1) << "Memory clock (kHz): " << device_properties_[d].memoryClockKhz;
    VLOG(1) << "Memory bandwidth: " << device_properties_[d].memoryBandwidthGBs
            << " GB/sec";

    VLOG(1) << "Constant Memory: " << device_properties_[d].constantMem;
    VLOG(1) << "Shared memory per multiprocessor: "
            << device_properties_[d].sharedMemPerMP;
    VLOG(1) << "Shared memory per block: " << device_properties_[d].sharedMemPerBlock;
    VLOG(1) << "Number of MPs: " << device_properties_[d].numMPs;
    VLOG(1) << "Warp Size: " << device_properties_[d].warpSize;
    VLOG(1) << "Max threads per block: " << device_properties_[d].maxThreadsPerBlock;
    VLOG(1) << "Max registers per block: " << device_properties_[d].maxRegistersPerBlock;
    VLOG(1) << "Max registers per MP: " << device_properties_[d].maxRegistersPerMP;
    VLOG(1) << "Memory bus width in bits: " << device_properties_[d].memoryBusWidth;
  }
}

void CudaMgr::checkError(CUresult status) const {
  if (status != CUDA_SUCCESS) {
    throw CudaErrorException(status);
  }
}

}  // namespace CudaMgr_Namespace