OmniSciDB  471d68cefb
CudaMgr.cpp
/*
 * Copyright 2018 OmniSci, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "CudaMgr/CudaMgr.h"

#include <boost/filesystem.hpp>
#include <boost/stacktrace.hpp>

#include <algorithm>
#include <cassert>
#include <iostream>
#include <stdexcept>

#include "Logger/Logger.h"

namespace CudaMgr_Namespace {

CudaErrorException::CudaErrorException(CUresult status)
    : std::runtime_error(errorMessage(status)), status_(status) {
  // CUDA may already be de-initialized during system shutdown; avoid calling
  // the logger here so a standard teardown does not fail.
  if (status != CUDA_ERROR_DEINITIALIZED) {
    VLOG(1) << errorMessage(status);
    VLOG(1) << boost::stacktrace::stacktrace();
  }
}

std::string errorMessage(CUresult const status) {
  const char* errorString{nullptr};
  cuGetErrorString(status, &errorString);
  return errorString
             ? "CUDA Error (" + std::to_string(status) + "): " + std::string(errorString)
             : "CUDA Driver API error code " + std::to_string(status);
}
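
// For illustration (not part of the original source): with a driver that
// recognizes the code, errorMessage(CUDA_ERROR_INVALID_VALUE) yields
// "CUDA Error (1): invalid argument"; for a code the driver does not know,
// cuGetErrorString() leaves the pointer null and the fallback branch returns
// "CUDA Driver API error code <n>".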

CudaMgr::CudaMgr(const int num_gpus, const int start_gpu)
    : start_gpu_(start_gpu)
    , min_shared_memory_per_block_for_all_devices(0)
    , min_num_mps_for_all_devices(0) {
  checkError(cuInit(0));
  checkError(cuDeviceGetCount(&device_count_));

  if (num_gpus > 0) {  // num_gpus <= 0 means use the number of GPUs found
    device_count_ = std::min(device_count_, num_gpus);
  } else {
    // if we are using all GPUs we cannot start on a GPU other than 0
    CHECK_EQ(start_gpu_, 0);
  }
  fillDeviceProperties();
  initDeviceGroup();
  createDeviceContexts();
  printDeviceProperties();

  // warm up the GPU JIT
  LOG(INFO) << "Warming up the GPU JIT Compiler... (this may take several seconds)";
  setContext(0);
  nvidia_jit_warmup();
  LOG(INFO) << "GPU JIT Compiler initialized.";
}
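
// Usage sketch (illustrative, not part of the original file): constructing a
// manager over all visible GPUs. Any driver failure surfaces as a
// CudaErrorException thrown from checkError().
//
//   // num_gpus <= 0 means "use every GPU found"; start_gpu defaults to 0.
//   auto cuda_mgr = std::make_unique<CudaMgr_Namespace::CudaMgr>(-1);
//   cuda_mgr->synchronizeDevices();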

void CudaMgr::initDeviceGroup() {
  for (int device_id = 0; device_id < device_count_; device_id++) {
    device_group_.push_back(
        {device_id, device_id + start_gpu_, device_properties_[device_id].uuid});
  }
}

CudaMgr::~CudaMgr() {
  try {
    // We don't want to remove the CudaMgr before all other processes have cleaned up.
    // This should be enforced by the lifetime policies, but take this lock to be safe.
    std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);

    synchronizeDevices();
    for (int d = 0; d < device_count_; ++d) {
      checkError(cuCtxDestroy(device_contexts_[d]));
    }
  } catch (const CudaErrorException& e) {
    if (e.getStatus() == CUDA_ERROR_DEINITIALIZED) {
      // TODO(adb / asuhan): Verify cuModuleUnload removes the context
      return;
    }
    LOG(ERROR) << "CUDA Error: " << e.what();
  } catch (const std::runtime_error& e) {
    LOG(ERROR) << "CUDA Error: " << e.what();
  }
}

void CudaMgr::synchronizeDevices() const {
  for (int d = 0; d < device_count_; ++d) {
    setContext(d);
    checkError(cuCtxSynchronize());
  }
}

void CudaMgr::copyHostToDevice(int8_t* device_ptr,
                               const int8_t* host_ptr,
                               const size_t num_bytes,
                               const int device_num) {
  setContext(device_num);
  checkError(
      cuMemcpyHtoD(reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes));
}

void CudaMgr::copyDeviceToHost(int8_t* host_ptr,
                               const int8_t* device_ptr,
                               const size_t num_bytes,
                               const int device_num) {
  setContext(device_num);
  checkError(
      cuMemcpyDtoH(host_ptr, reinterpret_cast<const CUdeviceptr>(device_ptr), num_bytes));
}

void CudaMgr::copyDeviceToDevice(int8_t* dest_ptr,
                                 int8_t* src_ptr,
                                 const size_t num_bytes,
                                 const int dest_device_num,
                                 const int src_device_num) {
  // dest_device_num and src_device_num are the device numbers relative to start_gpu_
  // (real_device_num - start_gpu_)
  if (src_device_num == dest_device_num) {
    setContext(src_device_num);
    checkError(cuMemcpy(reinterpret_cast<CUdeviceptr>(dest_ptr),
                        reinterpret_cast<CUdeviceptr>(src_ptr),
                        num_bytes));
  } else {
    checkError(cuMemcpyPeer(reinterpret_cast<CUdeviceptr>(dest_ptr),
                            device_contexts_[dest_device_num],
                            reinterpret_cast<CUdeviceptr>(src_ptr),
                            device_contexts_[src_device_num],
                            num_bytes));  // will we always have peer?
  }
}
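
// Round-trip sketch (illustrative; `cuda_mgr` is assumed constructed): pushing
// host bytes to device 0 and reading them back through the helpers above,
// using allocateDeviceMem()/freeDeviceMem() defined later in this file.
//
//   std::vector<int8_t> src(1024, 42), dst(1024);
//   int8_t* dev_buf = cuda_mgr.allocateDeviceMem(src.size(), /*device_num=*/0);
//   cuda_mgr.copyHostToDevice(dev_buf, src.data(), src.size(), /*device_num=*/0);
//   cuda_mgr.copyDeviceToHost(dst.data(), dev_buf, dst.size(), /*device_num=*/0);
//   cuda_mgr.freeDeviceMem(dev_buf);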

void CudaMgr::loadGpuModuleData(CUmodule* module,
                                const void* image,
                                unsigned int num_options,
                                CUjit_option* options,
                                void** option_vals,
                                const int device_id) const {
  setContext(device_id);
  checkError(cuModuleLoadDataEx(module, image, num_options, options, option_vals));
}
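
// Sketch (illustrative; `ptx_image` is a hypothetical NUL-terminated PTX
// buffer): JIT options are forwarded verbatim to cuModuleLoadDataEx, e.g. to
// request verbose compile logging.
//
//   CUmodule module;
//   CUjit_option options[] = {CU_JIT_LOG_VERBOSE};
//   void* option_vals[] = {reinterpret_cast<void*>(1)};
//   cuda_mgr.loadGpuModuleData(
//       &module, ptx_image, 1, options, option_vals, /*device_id=*/0);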

void CudaMgr::unloadGpuModuleData(CUmodule* module, const int device_id) const {
  std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);
  CHECK(module);

  setContext(device_id);
  try {
    auto code = cuModuleUnload(*module);
    // If the CUDA driver has already shut down, ignore the resulting errors.
    if (code != CUDA_ERROR_DEINITIALIZED) {
      checkError(code);
    }
  } catch (const std::runtime_error& e) {
    LOG(ERROR) << "CUDA Error: " << e.what();
  }
}

CudaMgr::CudaMemoryUsage CudaMgr::getCudaMemoryUsage() {
  CudaMemoryUsage usage;
  cuMemGetInfo(&usage.free, &usage.total);
  return usage;
}
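
// Usage sketch (illustrative): cuMemGetInfo reports on the *current* context,
// so select the desired device first via setContext().
//
//   cuda_mgr.setContext(0);
//   const auto usage = cuda_mgr.getCudaMemoryUsage();
//   VLOG(1) << usage.free << " of " << usage.total << " bytes free on device 0";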

void CudaMgr::fillDeviceProperties() {
  device_properties_.resize(device_count_);
  cuDriverGetVersion(&gpu_driver_version_);
  for (int device_num = 0; device_num < device_count_; ++device_num) {
    checkError(
        cuDeviceGet(&device_properties_[device_num].device, device_num + start_gpu_));
    CUuuid cuda_uuid;
    checkError(cuDeviceGetUuid(&cuda_uuid, device_properties_[device_num].device));
    device_properties_[device_num].uuid = omnisci::UUID(cuda_uuid.bytes);
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].computeMajor,
                                    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].computeMinor,
                                    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                                    device_properties_[device_num].device));
    checkError(cuDeviceTotalMem(&device_properties_[device_num].globalMem,
                                device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].constantMem,
                                    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
                                    device_properties_[device_num].device));
    checkError(
        cuDeviceGetAttribute(&device_properties_[device_num].sharedMemPerMP,
                             CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
                             device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].sharedMemPerBlock,
                                    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].numMPs,
                                    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].warpSize,
                                    CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxThreadsPerBlock,
                                    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxRegistersPerBlock,
                                    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxRegistersPerMP,
                                    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].pciBusId,
                                    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].pciDeviceId,
                                    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].clockKhz,
                                    CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].memoryClockKhz,
                                    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].memoryBusWidth,
                                    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
                                    device_properties_[device_num].device));
    device_properties_[device_num].memoryBandwidthGBs =
        device_properties_[device_num].memoryClockKhz / 1000000.0 / 8.0 *
        device_properties_[device_num].memoryBusWidth;
  }
  min_shared_memory_per_block_for_all_devices =
      computeMinSharedMemoryPerBlockForAllDevices();
  min_num_mps_for_all_devices = computeMinNumMPsForAllDevices();
}
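
// Worked example of the bandwidth formula above (hypothetical device): a
// 877,000 kHz memory clock on a 4096-bit bus gives
// 877000 / 1000000.0 / 8.0 * 4096 ≈ 449 GB/s. The formula converts kHz to GHz
// and bits to bytes; no DDR (double data rate) factor is applied.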

int8_t* CudaMgr::allocatePinnedHostMem(const size_t num_bytes) {
  setContext(0);
  void* host_ptr;
  checkError(cuMemHostAlloc(&host_ptr, num_bytes, CU_MEMHOSTALLOC_PORTABLE));
  return reinterpret_cast<int8_t*>(host_ptr);
}
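
// Note: CU_MEMHOSTALLOC_PORTABLE makes the pinned allocation visible to all
// CUDA contexts, and pinned memory enables full-speed DMA transfers.
// Illustrative pairing with freePinnedHostMem() below (`dev_buf` assumed
// allocated via allocateDeviceMem()):
//
//   int8_t* staging = cuda_mgr.allocatePinnedHostMem(1 << 20);  // 1 MiB
//   cuda_mgr.copyHostToDevice(dev_buf, staging, 1 << 20, /*device_num=*/0);
//   cuda_mgr.freePinnedHostMem(staging);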

int8_t* CudaMgr::allocateDeviceMem(const size_t num_bytes, const int device_num) {
  setContext(device_num);
  CUdeviceptr device_ptr;
  checkError(cuMemAlloc(&device_ptr, num_bytes));
  return reinterpret_cast<int8_t*>(device_ptr);
}

void CudaMgr::freePinnedHostMem(int8_t* host_ptr) {
  checkError(cuMemFreeHost(reinterpret_cast<void*>(host_ptr)));
}

void CudaMgr::freeDeviceMem(int8_t* device_ptr) {
  std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);

  checkError(cuMemFree(reinterpret_cast<CUdeviceptr>(device_ptr)));
}

void CudaMgr::zeroDeviceMem(int8_t* device_ptr,
                            const size_t num_bytes,
                            const int device_num) {
  setDeviceMem(device_ptr, 0, num_bytes, device_num);
}

void CudaMgr::setDeviceMem(int8_t* device_ptr,
                           const unsigned char uc,
                           const size_t num_bytes,
                           const int device_num) {
  setContext(device_num);
  checkError(cuMemsetD8(reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes));
}

/**
 * Returns true if all devices have the Maxwell micro-architecture, or later;
 * i.e. false if any device has compute capability < 5.0.
 */
bool CudaMgr::isArchMaxwellOrLaterForAll() const {
  for (int i = 0; i < device_count_; i++) {
    if (device_properties_[i].computeMajor < 5) {
      return false;
    }
  }
  return true;
}

/**
 * Returns true if all devices have the Volta micro-architecture, or later;
 * i.e. false if any device has compute capability < 7.0.
 */
bool CudaMgr::isArchVoltaOrGreaterForAll() const {
  for (int i = 0; i < device_count_; i++) {
    if (device_properties_[i].computeMajor < 7) {
      return false;
    }
  }
  return true;
}

/**
 * Returns the minimum of the device-reported maximum shared memory per block
 * across all detected devices.
 */
size_t CudaMgr::computeMinSharedMemoryPerBlockForAllDevices() const {
  int shared_mem_size =
      device_count_ > 0 ? device_properties_.front().sharedMemPerBlock : 0;
  for (int d = 1; d < device_count_; d++) {
    shared_mem_size = std::min(shared_mem_size, device_properties_[d].sharedMemPerBlock);
  }
  return shared_mem_size;
}

/**
 * Returns the minimum multiprocessor (MP) count across all detected devices.
 */
size_t CudaMgr::computeMinNumMPsForAllDevices() const {
  int num_mps = device_count_ > 0 ? device_properties_.front().numMPs : 0;
  for (int d = 1; d < device_count_; d++) {
    num_mps = std::min(num_mps, device_properties_[d].numMPs);
  }
  return num_mps;
}

void CudaMgr::createDeviceContexts() {
  CHECK_EQ(device_contexts_.size(), size_t(0));
  device_contexts_.resize(device_count_);
  for (int d = 0; d < device_count_; ++d) {
    CUresult status = cuCtxCreate(&device_contexts_[d], 0, device_properties_[d].device);
    if (status != CUDA_SUCCESS) {
      // Context creation failed mid-initialization; destroy all contexts
      // created up to this point before rethrowing.
      for (int destroy_id = 0; destroy_id <= d; ++destroy_id) {
        try {
          checkError(cuCtxDestroy(device_contexts_[destroy_id]));
        } catch (const CudaErrorException& e) {
          LOG(ERROR) << "Failed to destroy CUDA context for device ID " << destroy_id
                     << " with " << e.what()
                     << ". CUDA contexts were being destroyed due to an error creating "
                        "CUDA context for device ID "
                     << d << " out of " << device_count_ << " (" << errorMessage(status)
                     << ").";
        }
      }
      // checkError will translate the message and throw
      checkError(status);
    }
  }
}

void CudaMgr::setContext(const int device_num) const {
  // device_num is the device number relative to start_gpu_ (real_device_num - start_gpu_)
  CHECK_LT(device_num, device_count_);
  cuCtxSetCurrent(device_contexts_[device_num]);
}

void CudaMgr::printDeviceProperties() const {
  LOG(INFO) << "Using " << device_count_ << " GPUs.";
  for (int d = 0; d < device_count_; ++d) {
    VLOG(1) << "Device: " << device_properties_[d].device;
    VLOG(1) << "UUID: " << device_properties_[d].uuid;
    VLOG(1) << "Clock (khz): " << device_properties_[d].clockKhz;
    VLOG(1) << "Compute Major: " << device_properties_[d].computeMajor;
    VLOG(1) << "Compute Minor: " << device_properties_[d].computeMinor;
    VLOG(1) << "PCI bus id: " << device_properties_[d].pciBusId;
    VLOG(1) << "PCI device id: " << device_properties_[d].pciDeviceId;
    VLOG(1) << "Per device global memory: "
            << device_properties_[d].globalMem / 1073741824.0 << " GB";
    VLOG(1) << "Memory clock (khz): " << device_properties_[d].memoryClockKhz;
    VLOG(1) << "Memory bandwidth: " << device_properties_[d].memoryBandwidthGBs
            << " GB/sec";

    VLOG(1) << "Constant Memory: " << device_properties_[d].constantMem;
    VLOG(1) << "Shared memory per multiprocessor: "
            << device_properties_[d].sharedMemPerMP;
    VLOG(1) << "Shared memory per block: " << device_properties_[d].sharedMemPerBlock;
    VLOG(1) << "Number of MPs: " << device_properties_[d].numMPs;
    VLOG(1) << "Warp Size: " << device_properties_[d].warpSize;
    VLOG(1) << "Max threads per block: " << device_properties_[d].maxThreadsPerBlock;
    VLOG(1) << "Max registers per block: " << device_properties_[d].maxRegistersPerBlock;
    VLOG(1) << "Max registers per MP: " << device_properties_[d].maxRegistersPerMP;
    VLOG(1) << "Memory bus width in bits: " << device_properties_[d].memoryBusWidth;
  }
}

void CudaMgr::checkError(CUresult status) const {
  if (status != CUDA_SUCCESS) {
    throw CudaErrorException(status);
  }
}

}  // namespace CudaMgr_Namespace

std::string get_cuda_home(void) {
  static const char* CUDA_DEFAULT_PATH = "/usr/local/cuda";
  const char* env = nullptr;

  if (!(env = getenv("CUDA_HOME")) && !(env = getenv("CUDA_DIR"))) {
    // check if the default CUDA directory exists: /usr/local/cuda
    if (boost::filesystem::exists(boost::filesystem::path(CUDA_DEFAULT_PATH))) {
      env = CUDA_DEFAULT_PATH;
    }
  }

  if (env == nullptr) {
    LOG(WARNING) << "Could not find CUDA installation path: environment variables "
                    "CUDA_HOME or CUDA_DIR are not defined";
    return "";
  }

  // check if the CUDA directory is sensible:
  auto cuda_include_dir = env + std::string("/include");
  auto cuda_h_file = cuda_include_dir + "/cuda.h";
  if (!boost::filesystem::exists(boost::filesystem::path(cuda_h_file))) {
    LOG(WARNING) << "cuda.h does not exist in `" << cuda_include_dir << "`. Discarding `"
                 << env << "` as CUDA installation path.";
    return "";
  }

  return std::string(env);
}
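
// Resolution order implemented above: $CUDA_HOME, then $CUDA_DIR, then
// /usr/local/cuda if it exists; the candidate is rejected unless
// <path>/include/cuda.h is present. Illustrative call:
//
//   const auto cuda_home = get_cuda_home();
//   if (cuda_home.empty()) {
//     LOG(WARNING) << "CUDA headers unavailable; JIT features may be limited.";
//   }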