OmniSciDB  085a039ca4
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CudaMgr.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2018 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CudaMgr/CudaMgr.h"
20 
21 #include <algorithm>
22 #include <cassert>
23 #include <iostream>
24 #include <stdexcept>
25 
26 #include <boost/filesystem.hpp>
27 #include "Logger/Logger.h"
28 
29 namespace CudaMgr_Namespace {
30 
31 CudaErrorException::CudaErrorException(CUresult status)
32  : std::runtime_error(errorMessage(status)), status_(status) {
33  // cuda already de-initialized can occur during system shutdown. avoid making calls to
34  // the logger to prevent failing during a standard teardown.
35  if (status != CUDA_ERROR_DEINITIALIZED) {
36  VLOG(1) << errorMessage(status);
37  VLOG(1) << boost::stacktrace::stacktrace();
38  }
39 }
40 
41 std::string errorMessage(CUresult const status) {
42  const char* errorString{nullptr};
43  cuGetErrorString(status, &errorString);
44  return errorString
45  ? "CUDA Error (" + std::to_string(status) + "): " + std::string(errorString)
46  : "CUDA Driver API error code " + std::to_string(status);
47 }
48 
49 CudaMgr::CudaMgr(const int num_gpus, const int start_gpu)
50  : start_gpu_(start_gpu)
51  , min_shared_memory_per_block_for_all_devices(0)
52  , min_num_mps_for_all_devices(0) {
53  checkError(cuInit(0));
54  checkError(cuDeviceGetCount(&device_count_));
55 
56  if (num_gpus > 0) { // numGpus <= 0 will just use number of gpus found
57  device_count_ = std::min(device_count_, num_gpus);
58  } else {
59  // if we are using all gpus we cannot start on a gpu other than 0
60  CHECK_EQ(start_gpu_, 0);
61  }
62  fillDeviceProperties();
63  initDeviceGroup();
64  createDeviceContexts();
65  printDeviceProperties();
66 
67  // warm up the GPU JIT
68  LOG(INFO) << "Warming up the GPU JIT Compiler... (this may take several seconds)";
69  setContext(0);
71  LOG(INFO) << "GPU JIT Compiler initialized.";
72 }
73 
74 void CudaMgr::initDeviceGroup() {
75  for (int device_id = 0; device_id < device_count_; device_id++) {
76  device_group_.push_back(
77  {device_id, device_id + start_gpu_, device_properties_[device_id].uuid});
78  }
79 }
80 
82  try {
83  // We don't want to remove the cudaMgr before all other processes have cleaned up.
84  // This should be enforced by the lifetime policies, but take this lock to be safe.
85  std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);
86 
88  for (int d = 0; d < device_count_; ++d) {
89  checkError(cuCtxDestroy(device_contexts_[d]));
90  }
91  } catch (const CudaErrorException& e) {
92  if (e.getStatus() == CUDA_ERROR_DEINITIALIZED) {
93  // TODO(adb / asuhan): Verify cuModuleUnload removes the context
94  return;
95  }
96  LOG(ERROR) << "CUDA Error: " << e.what();
97  } catch (const std::runtime_error& e) {
98  LOG(ERROR) << "CUDA Error: " << e.what();
99  }
100 }
101 
103  for (int d = 0; d < device_count_; ++d) {
104  setContext(d);
105  checkError(cuCtxSynchronize());
106  }
107 }
108 
109 void CudaMgr::copyHostToDevice(int8_t* device_ptr,
110  const int8_t* host_ptr,
111  const size_t num_bytes,
112  const int device_num,
113  CUstream cuda_stream) {
114  setContext(device_num);
115  if (!cuda_stream) {
116  checkError(
117  cuMemcpyHtoD(reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes));
118  } else {
119  checkError(cuMemcpyHtoDAsync(
120  reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes, cuda_stream));
121  checkError(cuStreamSynchronize(cuda_stream));
122  }
123 }
124 
125 void CudaMgr::copyDeviceToHost(int8_t* host_ptr,
126  const int8_t* device_ptr,
127  const size_t num_bytes,
128  const int device_num,
129  CUstream cuda_stream) {
130  setContext(device_num);
131  if (!cuda_stream) {
132  checkError(cuMemcpyDtoH(
133  host_ptr, reinterpret_cast<const CUdeviceptr>(device_ptr), num_bytes));
134  } else {
135  checkError(cuMemcpyDtoHAsync(host_ptr,
136  reinterpret_cast<const CUdeviceptr>(device_ptr),
137  num_bytes,
138  cuda_stream));
139  checkError(cuStreamSynchronize(cuda_stream));
140  }
141 }
142 
143 void CudaMgr::copyDeviceToDevice(int8_t* dest_ptr,
144  int8_t* src_ptr,
145  const size_t num_bytes,
146  const int dest_device_num,
147  const int src_device_num,
148  CUstream cuda_stream) {
149  // dest_device_num and src_device_num are the device numbers relative to start_gpu_
150  // (real_device_num - start_gpu_)
151  if (src_device_num == dest_device_num) {
152  setContext(src_device_num);
153  if (!cuda_stream) {
154  checkError(cuMemcpy(reinterpret_cast<CUdeviceptr>(dest_ptr),
155  reinterpret_cast<CUdeviceptr>(src_ptr),
156  num_bytes));
157  } else {
158  checkError(cuMemcpyAsync(reinterpret_cast<CUdeviceptr>(dest_ptr),
159  reinterpret_cast<CUdeviceptr>(src_ptr),
160  num_bytes,
161  cuda_stream));
162  checkError(cuStreamSynchronize(cuda_stream));
163  }
164  } else {
165  if (!cuda_stream) {
166  checkError(cuMemcpyPeer(reinterpret_cast<CUdeviceptr>(dest_ptr),
167  device_contexts_[dest_device_num],
168  reinterpret_cast<CUdeviceptr>(src_ptr),
169  device_contexts_[src_device_num],
170  num_bytes)); // will we always have peer?
171  } else {
172  checkError(cuMemcpyPeerAsync(reinterpret_cast<CUdeviceptr>(dest_ptr),
173  device_contexts_[dest_device_num],
174  reinterpret_cast<CUdeviceptr>(src_ptr),
175  device_contexts_[src_device_num],
176  num_bytes,
177  cuda_stream)); // will we always have peer?
178  checkError(cuStreamSynchronize(cuda_stream));
179  }
180  }
181 }
182 
183 void CudaMgr::loadGpuModuleData(CUmodule* module,
184  const void* image,
185  unsigned int num_options,
186  CUjit_option* options,
187  void** option_vals,
188  const int device_id) const {
189  setContext(device_id);
190  checkError(cuModuleLoadDataEx(module, image, num_options, options, option_vals));
191 }
192 
193 void CudaMgr::unloadGpuModuleData(CUmodule* module, const int device_id) const {
194  std::lock_guard<std::mutex> gpuLock(device_cleanup_mutex_);
195  CHECK(module);
196 
197  setContext(device_id);
198  try {
199  auto code = cuModuleUnload(*module);
200  // If the Cuda driver has already shut down, ignore the resulting errors.
201  if (code != CUDA_ERROR_DEINITIALIZED) {
202  checkError(code);
203  }
204  } catch (const std::runtime_error& e) {
205  LOG(ERROR) << "CUDA Error: " << e.what();
206  }
207 }
208 
209 CudaMgr::CudaMemoryUsage CudaMgr::getCudaMemoryUsage() {
210  CudaMemoryUsage usage;
211  cuMemGetInfo(&usage.free, &usage.total);
212  return usage;
213 }
214 
215 void CudaMgr::fillDeviceProperties() {
216  device_properties_.resize(device_count_);
217  cuDriverGetVersion(&gpu_driver_version_);
218  for (int device_num = 0; device_num < device_count_; ++device_num) {
219  checkError(
220  cuDeviceGet(&device_properties_[device_num].device, device_num + start_gpu_));
221  CUuuid cuda_uuid;
222  checkError(cuDeviceGetUuid(&cuda_uuid, device_properties_[device_num].device));
223  device_properties_[device_num].uuid = heavyai::UUID(cuda_uuid.bytes);
224  checkError(cuDeviceGetAttribute(&device_properties_[device_num].computeMajor,
225  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
226  device_properties_[device_num].device));
227  checkError(cuDeviceGetAttribute(&device_properties_[device_num].computeMinor,
228  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
229  device_properties_[device_num].device));
230  checkError(cuDeviceTotalMem(&device_properties_[device_num].globalMem,
231  device_properties_[device_num].device));
232  checkError(cuDeviceGetAttribute(&device_properties_[device_num].constantMem,
233  CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
234  device_properties_[device_num].device));
235  checkError(
236  cuDeviceGetAttribute(&device_properties_[device_num].sharedMemPerMP,
237  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
238  device_properties_[device_num].device));
239  checkError(cuDeviceGetAttribute(&device_properties_[device_num].sharedMemPerBlock,
240  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
241  device_properties_[device_num].device));
242  checkError(cuDeviceGetAttribute(&device_properties_[device_num].numMPs,
243  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
244  device_properties_[device_num].device));
245  checkError(cuDeviceGetAttribute(&device_properties_[device_num].warpSize,
246  CU_DEVICE_ATTRIBUTE_WARP_SIZE,
247  device_properties_[device_num].device));
248  checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxThreadsPerBlock,
249  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
250  device_properties_[device_num].device));
251  checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxRegistersPerBlock,
252  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
253  device_properties_[device_num].device));
254  checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxRegistersPerMP,
255  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
256  device_properties_[device_num].device));
257  checkError(cuDeviceGetAttribute(&device_properties_[device_num].pciBusId,
258  CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,
259  device_properties_[device_num].device));
260  checkError(cuDeviceGetAttribute(&device_properties_[device_num].pciDeviceId,
261  CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
262  device_properties_[device_num].device));
263  checkError(cuDeviceGetAttribute(&device_properties_[device_num].clockKhz,
264  CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
265  device_properties_[device_num].device));
266  checkError(cuDeviceGetAttribute(&device_properties_[device_num].memoryClockKhz,
267  CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
268  device_properties_[device_num].device));
269  checkError(cuDeviceGetAttribute(&device_properties_[device_num].memoryBusWidth,
270  CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
271  device_properties_[device_num].device));
272  device_properties_[device_num].memoryBandwidthGBs =
273  device_properties_[device_num].memoryClockKhz / 1000000.0 / 8.0 *
274  device_properties_[device_num].memoryBusWidth;
275  }
277  computeMinSharedMemoryPerBlockForAllDevices();
278  min_num_mps_for_all_devices = computeMinNumMPsForAllDevices();
279 }
280 
281 int8_t* CudaMgr::allocatePinnedHostMem(const size_t num_bytes) {
282  setContext(0);
283  void* host_ptr;
284  checkError(cuMemHostAlloc(&host_ptr, num_bytes, CU_MEMHOSTALLOC_PORTABLE));
285  return reinterpret_cast<int8_t*>(host_ptr);
286 }
287 
288 int8_t* CudaMgr::allocateDeviceMem(const size_t num_bytes, const int device_num) {
289  setContext(device_num);
290  CUdeviceptr device_ptr;
291  checkError(cuMemAlloc(&device_ptr, num_bytes));
292  return reinterpret_cast<int8_t*>(device_ptr);
293 }
294 
295 void CudaMgr::freePinnedHostMem(int8_t* host_ptr) {
296  checkError(cuMemFreeHost(reinterpret_cast<void*>(host_ptr)));
297 }
298 
299 void CudaMgr::freeDeviceMem(int8_t* device_ptr) {
300  std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);
301 
302  checkError(cuMemFree(reinterpret_cast<CUdeviceptr>(device_ptr)));
303 }
304 
305 void CudaMgr::zeroDeviceMem(int8_t* device_ptr,
306  const size_t num_bytes,
307  const int device_num,
308  CUstream cuda_stream) {
309  setDeviceMem(device_ptr, 0, num_bytes, device_num, cuda_stream);
310 }
311 
312 void CudaMgr::setDeviceMem(int8_t* device_ptr,
313  const unsigned char uc,
314  const size_t num_bytes,
315  const int device_num,
316  CUstream cuda_stream) {
317  setContext(device_num);
318  if (!cuda_stream) {
319  checkError(cuMemsetD8(reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes));
320  } else {
321  checkError(cuMemsetD8Async(
322  reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes, cuda_stream));
323  checkError(cuStreamSynchronize(cuda_stream));
324  }
325 }
326 
332  for (int i = 0; i < device_count_; i++) {
333  if (device_properties_[i].computeMajor < 5) {
334  return false;
335  }
336  }
337  return true;
338 }
339 
345  for (int i = 0; i < device_count_; i++) {
346  if (device_properties_[i].computeMajor < 7) {
347  return false;
348  }
349  }
350  return true;
351 }
352 
357 size_t CudaMgr::computeMinSharedMemoryPerBlockForAllDevices() const {
358  int shared_mem_size =
359  device_count_ > 0 ? device_properties_.front().sharedMemPerBlock : 0;
360  for (int d = 1; d < device_count_; d++) {
361  shared_mem_size = std::min(shared_mem_size, device_properties_[d].sharedMemPerBlock);
362  }
363  return shared_mem_size;
364 }
365 
370 size_t CudaMgr::computeMinNumMPsForAllDevices() const {
371  int num_mps = device_count_ > 0 ? device_properties_.front().numMPs : 0;
372  for (int d = 1; d < device_count_; d++) {
373  num_mps = std::min(num_mps, device_properties_[d].numMPs);
374  }
375  return num_mps;
376 }
377 
378 void CudaMgr::createDeviceContexts() {
379  CHECK_EQ(device_contexts_.size(), size_t(0));
380  device_contexts_.resize(device_count_);
381  for (int d = 0; d < device_count_; ++d) {
382  CUresult status = cuCtxCreate(&device_contexts_[d], 0, device_properties_[d].device);
383  if (status != CUDA_SUCCESS) {
384  // this is called from destructor so we need
385  // to clean up
386  // destroy all contexts up to this point
387  for (int destroy_id = 0; destroy_id <= d; ++destroy_id) {
388  try {
389  checkError(cuCtxDestroy(device_contexts_[destroy_id]));
390  } catch (const CudaErrorException& e) {
391  LOG(ERROR) << "Failed to destroy CUDA context for device ID " << destroy_id
392  << " with " << e.what()
393  << ". CUDA contexts were being destroyed due to an error creating "
394  "CUDA context for device ID "
395  << d << " out of " << device_count_ << " (" << errorMessage(status)
396  << ").";
397  }
398  }
399  // checkError will translate the message and throw
400  checkError(status);
401  }
402  }
403 }
404 
405 void CudaMgr::setContext(const int device_num) const {
406  // deviceNum is the device number relative to startGpu (realDeviceNum - startGpu_)
407  CHECK_LT(device_num, device_count_);
408  cuCtxSetCurrent(device_contexts_[device_num]);
409 }
410 
411 int CudaMgr::getContext() const {
412  CUcontext cnow;
413  checkError(cuCtxGetCurrent(&cnow));
414  if (cnow == NULL) {
415  throw std::runtime_error("no cuda device context");
416  }
417  int device_num{0};
418  for (auto& c : device_contexts_) {
419  if (c == cnow) {
420  return device_num;
421  }
422  ++device_num;
423  }
424  // TODO(sy): Change device_contexts_ to have O(1) lookup? (Or maybe not worth it.)
425  throw std::runtime_error("invalid cuda device context");
426 }
427 
428 void CudaMgr::printDeviceProperties() const {
429  LOG(INFO) << "Using " << device_count_ << " Gpus.";
430  for (int d = 0; d < device_count_; ++d) {
431  VLOG(1) << "Device: " << device_properties_[d].device;
432  VLOG(1) << "UUID: " << device_properties_[d].uuid;
433  VLOG(1) << "Clock (khz): " << device_properties_[d].clockKhz;
434  VLOG(1) << "Compute Major: " << device_properties_[d].computeMajor;
435  VLOG(1) << "Compute Minor: " << device_properties_[d].computeMinor;
436  VLOG(1) << "PCI bus id: " << device_properties_[d].pciBusId;
437  VLOG(1) << "PCI deviceId id: " << device_properties_[d].pciDeviceId;
438  VLOG(1) << "Per device global memory: "
439  << device_properties_[d].globalMem / 1073741824.0 << " GB";
440  VLOG(1) << "Memory clock (khz): " << device_properties_[d].memoryClockKhz;
441  VLOG(1) << "Memory bandwidth: " << device_properties_[d].memoryBandwidthGBs
442  << " GB/sec";
443 
444  VLOG(1) << "Constant Memory: " << device_properties_[d].constantMem;
445  VLOG(1) << "Shared memory per multiprocessor: "
446  << device_properties_[d].sharedMemPerMP;
447  VLOG(1) << "Shared memory per block: " << device_properties_[d].sharedMemPerBlock;
448  VLOG(1) << "Number of MPs: " << device_properties_[d].numMPs;
449  VLOG(1) << "Warp Size: " << device_properties_[d].warpSize;
450  VLOG(1) << "Max threads per block: " << device_properties_[d].maxThreadsPerBlock;
451  VLOG(1) << "Max registers per block: " << device_properties_[d].maxRegistersPerBlock;
452  VLOG(1) << "Max register per MP: " << device_properties_[d].maxRegistersPerMP;
453  VLOG(1) << "Memory bus width in bits: " << device_properties_[d].memoryBusWidth;
454  }
455 }
456 
457 void CudaMgr::checkError(CUresult status) const {
458  if (status != CUDA_SUCCESS) {
459  throw CudaErrorException(status);
460  }
461 }
462 
463 } // namespace CudaMgr_Namespace
464 
465 std::string get_cuda_home(void) {
466  static const char* CUDA_DEFAULT_PATH = "/usr/local/cuda";
467  const char* env = nullptr;
468 
469  if (!(env = getenv("CUDA_HOME")) && !(env = getenv("CUDA_DIR"))) {
470  // check if the default CUDA directory exists: /usr/local/cuda
471  if (boost::filesystem::exists(boost::filesystem::path(CUDA_DEFAULT_PATH))) {
472  env = CUDA_DEFAULT_PATH;
473  }
474  }
475 
476  if (env == nullptr) {
477  LOG(WARNING) << "Could not find CUDA installation path: environment variables "
478  "CUDA_HOME or CUDA_DIR are not defined";
479  return "";
480  }
481 
482  // check if the CUDA directory is sensible:
483  auto cuda_include_dir = env + std::string("/include");
484  auto cuda_h_file = cuda_include_dir + "/cuda.h";
485  if (!boost::filesystem::exists(boost::filesystem::path(cuda_h_file))) {
486  LOG(WARNING) << "cuda.h does not exist in `" << cuda_include_dir << "`. Discarding `"
487  << env << "` as CUDA installation path.";
488  return "";
489  }
490 
491  return std::string(env);
492 }
void copyHostToDevice(int8_t *device_ptr, const int8_t *host_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:109
#define CHECK_EQ(x, y)
Definition: Logger.h:231
size_t min_num_mps_for_all_devices
Definition: CudaMgr.h:247
int CUjit_option
Definition: nocuda.h:26
heavyai::DeviceGroup device_group_
Definition: CudaMgr.h:249
int CUcontext
Definition: nocuda.h:22
std::mutex device_cleanup_mutex_
Definition: CudaMgr.h:252
void * CUstream
Definition: nocuda.h:23
#define LOG(tag)
Definition: Logger.h:217
void setDeviceMem(int8_t *device_ptr, const unsigned char uc, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:312
int8_t * allocatePinnedHostMem(const size_t num_bytes)
Definition: CudaMgr.cpp:281
void nvidia_jit_warmup()
unsigned long long CUdeviceptr
Definition: nocuda.h:28
void setContext(const int device_num) const
Definition: CudaMgr.cpp:405
size_t min_shared_memory_per_block_for_all_devices
Definition: CudaMgr.h:246
void copyDeviceToHost(int8_t *host_ptr, const int8_t *device_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:125
std::string to_string(char const *&&v)
std::vector< CUcontext > device_contexts_
Definition: CudaMgr.h:250
std::string get_cuda_home(void)
Definition: CudaMgr.cpp:465
void freeDeviceMem(int8_t *device_ptr)
Definition: CudaMgr.cpp:299
std::string errorMessage(CUresult const status)
Definition: CudaMgr.cpp:41
void copyDeviceToDevice(int8_t *dest_ptr, int8_t *src_ptr, const size_t num_bytes, const int dest_device_num, const int src_device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:143
#define CHECK_LT(x, y)
Definition: Logger.h:233
bool isArchMaxwellOrLaterForAll() const
Definition: CudaMgr.cpp:331
int CUresult
Definition: nocuda.h:21
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:248
void freePinnedHostMem(int8_t *host_ptr)
Definition: CudaMgr.cpp:295
#define CHECK(condition)
Definition: Logger.h:223
void synchronizeDevices() const
Definition: CudaMgr.cpp:102
void zeroDeviceMem(int8_t *device_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:305
CudaMgr(const int num_gpus, const int start_gpu=0)
Definition: CudaMgr.cpp:49
bool isArchVoltaOrGreaterForAll() const
Definition: CudaMgr.cpp:344
#define VLOG(n)
Definition: Logger.h:317
void * CUmodule
Definition: nocuda.h:24
int8_t * allocateDeviceMem(const size_t num_bytes, const int device_num)
Definition: CudaMgr.cpp:288