OmniSciDB f17484ade4
CudaMgr.cpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CudaMgr/CudaMgr.h"
20 
21 #include <algorithm>
22 #include <iostream>
23 #include <stdexcept>
24 
25 #include <boost/filesystem.hpp>
26 #include "Logger/Logger.h"
27 
28 namespace CudaMgr_Namespace {
29 
30 CudaErrorException::CudaErrorException(CUresult status)
31  : std::runtime_error(errorMessage(status)), status_(status) {
32  // cuda already de-initialized can occur during system shutdown. avoid making calls to
33  // the logger to prevent failing during a standard teardown.
34  if (status != CUDA_ERROR_DEINITIALIZED) {
35  VLOG(1) << errorMessage(status);
36  VLOG(1) << boost::stacktrace::stacktrace();
37  }
38 }
39 
40 std::string errorMessage(CUresult const status) {
41  const char* errorString{nullptr};
42  cuGetErrorString(status, &errorString);
43  return errorString
44  ? "CUDA Error (" + std::to_string(status) + "): " + std::string(errorString)
45  : "CUDA Driver API error code " + std::to_string(status);
46 }
47 
48 CudaMgr::CudaMgr(const int num_gpus, const int start_gpu)
49  : start_gpu_(start_gpu)
50  , min_shared_memory_per_block_for_all_devices(0)
51  , min_num_mps_for_all_devices(0) {
52  checkError(cuInit(0));
53  checkError(cuDeviceGetCount(&device_count_));
54 
55  if (num_gpus > 0) { // numGpus <= 0 will just use number of gpus found
56  device_count_ = std::min(device_count_, num_gpus);
57  } else {
58  // if we are using all gpus we cannot start on a gpu other than 0
59  CHECK_EQ(start_gpu_, 0);
60  }
61  fillDeviceProperties();
62  initDeviceGroup();
63  createDeviceContexts();
64  logDeviceProperties();
65 
66  // warm up the GPU JIT
67  LOG(INFO) << "Warming up the GPU JIT Compiler... (this may take several seconds)";
68  setContext(0);
69  nvidia_jit_warmup();
70  LOG(INFO) << "GPU JIT Compiler initialized.";
71 }
72 
73 void CudaMgr::initDeviceGroup() {
74  for (int device_id = 0; device_id < device_count_; device_id++) {
75  device_group_.push_back(
76  {device_id, device_id + start_gpu_, device_properties_[device_id].uuid});
77  }
78 }
79 
80 CudaMgr::~CudaMgr() {
81  try {
82  // We don't want to remove the cudaMgr before all other processes have cleaned up.
83  // This should be enforced by the lifetime policies, but take this lock to be safe.
84  std::lock_guard<std::mutex> device_lock(device_mutex_);
87 
88  for (int d = 0; d < device_count_; ++d) {
89  checkError(cuCtxDestroy(device_contexts_[d]));
90  }
91  } catch (const CudaErrorException& e) {
92  if (e.getStatus() == CUDA_ERROR_DEINITIALIZED) {
93  // TODO(adb / asuhan): Verify cuModuleUnload removes the context
94  return;
95  }
96  LOG(ERROR) << "CUDA Error: " << e.what();
97  } catch (const std::runtime_error& e) {
98  LOG(ERROR) << "CUDA Error: " << e.what();
99  }
100 }
101 
102 size_t CudaMgr::computePaddedBufferSize(size_t buf_size, size_t granularity) const {
103  return (((buf_size + (granularity - 1)) / granularity) * granularity);
104 }
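
The expression above is the usual round-up-to-a-multiple idiom. A minimal compile-time sketch of the same arithmetic, as a standalone snippet (not part of this file) with illustrative granularities; the real granularity comes from getGranularity() below:

#include <cstddef>

constexpr std::size_t pad(std::size_t buf_size, std::size_t granularity) {
  return ((buf_size + (granularity - 1)) / granularity) * granularity;
}
static_assert(pad(1, 4096) == 4096, "any non-zero size rounds up to one full granule");
static_assert(pad(4096, 4096) == 4096, "exact multiples are unchanged");
static_assert(pad(4097, 4096) == 8192, "one byte over spills into the next granule");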
105 
106 size_t CudaMgr::getGranularity(const int device_num) const {
107  CUmemAllocationProp allocation_prop{};
108  allocation_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
109  allocation_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
110  allocation_prop.location.id = device_num;
111  size_t granularity{};
112  checkError(cuMemGetAllocationGranularity(
113  &granularity, &allocation_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
114  return granularity;
115 }
116 
117 void CudaMgr::synchronizeDevices() const {
118  for (int d = 0; d < device_count_; ++d) {
119  setContext(d);
120  checkError(cuCtxSynchronize());
121  }
122 }
123 
124 void CudaMgr::copyHostToDevice(int8_t* device_ptr,
125  const int8_t* host_ptr,
126  const size_t num_bytes,
127  const int device_num,
128  CUstream cuda_stream) {
129  setContext(device_num);
130  if (!cuda_stream) {
131  checkError(
132  cuMemcpyHtoD(reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes));
133  } else {
134  checkError(cuMemcpyHtoDAsync(
135  reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes, cuda_stream));
136  checkError(cuStreamSynchronize(cuda_stream));
137  }
138 }
139 
140 void CudaMgr::copyDeviceToHost(int8_t* host_ptr,
141  const int8_t* device_ptr,
142  const size_t num_bytes,
143  CUstream cuda_stream) {
144  // set device_num based on device_ptr
145  auto const cu_device_ptr = reinterpret_cast<CUdeviceptr>(device_ptr);
146  {
147  std::lock_guard<std::mutex> device_lock(device_mutex_);
148  auto itr = device_memory_allocation_map_.upper_bound(cu_device_ptr);
149  CHECK(itr != device_memory_allocation_map_.begin());
150  --itr;
151  auto const& allocation_base = itr->first;
152  auto const& allocation_size = itr->second.size;
153  CHECK_LE(cu_device_ptr + num_bytes, allocation_base + allocation_size);
154  auto const& allocation_device_num = itr->second.device_num;
155  setContext(allocation_device_num);
156  }
157  if (!cuda_stream) {
158  checkError(cuMemcpyDtoH(host_ptr, cu_device_ptr, num_bytes));
159  } else {
160  checkError(cuMemcpyDtoHAsync(host_ptr, cu_device_ptr, num_bytes, cuda_stream));
161  checkError(cuStreamSynchronize(cuda_stream));
162  }
163 }
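
The lookup above finds the allocation that owns an interior device pointer: upper_bound returns the first map entry whose base address is strictly greater than the pointer, so stepping back one entry lands on the owning block. A standalone sketch of the same technique, with a plain std::map standing in for device_memory_allocation_map_ and made-up addresses and sizes:

#include <cassert>
#include <cstdint>
#include <map>

int main() {
  std::map<std::uintptr_t, std::size_t> allocations;  // base address -> allocation size
  allocations[0x1000] = 0x100;
  allocations[0x4000] = 0x200;

  std::uintptr_t ptr = 0x4080;              // an address inside the second allocation
  auto itr = allocations.upper_bound(ptr);  // first base strictly greater than ptr
  assert(itr != allocations.begin());       // otherwise ptr precedes every allocation
  --itr;                                    // step back to the owning allocation
  assert(itr->first == 0x4000);
  assert(ptr + 0x10 <= itr->first + itr->second);  // requested range stays in bounds
  return 0;
}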
164 
165 void CudaMgr::copyDeviceToDevice(int8_t* dest_ptr,
166  int8_t* src_ptr,
167  const size_t num_bytes,
168  const int dest_device_num,
169  const int src_device_num,
170  CUstream cuda_stream) {
171  // dest_device_num and src_device_num are the device numbers relative to start_gpu_
172  // (real_device_num - start_gpu_)
173  if (src_device_num == dest_device_num) {
174  setContext(src_device_num);
175  if (!cuda_stream) {
176  checkError(cuMemcpy(reinterpret_cast<CUdeviceptr>(dest_ptr),
177  reinterpret_cast<CUdeviceptr>(src_ptr),
178  num_bytes));
179  } else {
180  checkError(cuMemcpyAsync(reinterpret_cast<CUdeviceptr>(dest_ptr),
181  reinterpret_cast<CUdeviceptr>(src_ptr),
182  num_bytes,
183  cuda_stream));
184  checkError(cuStreamSynchronize(cuda_stream));
185  }
186  } else {
187  if (!cuda_stream) {
188  checkError(cuMemcpyPeer(reinterpret_cast<CUdeviceptr>(dest_ptr),
189  device_contexts_[dest_device_num],
190  reinterpret_cast<CUdeviceptr>(src_ptr),
191  device_contexts_[src_device_num],
192  num_bytes)); // will we always have peer?
193  } else {
194  checkError(cuMemcpyPeerAsync(reinterpret_cast<CUdeviceptr>(dest_ptr),
195  device_contexts_[dest_device_num],
196  reinterpret_cast<CUdeviceptr>(src_ptr),
197  device_contexts_[src_device_num],
198  num_bytes,
199  cuda_stream)); // will we always have peer?
200  checkError(cuStreamSynchronize(cuda_stream));
201  }
202  }
203 }
204 
205 void CudaMgr::loadGpuModuleData(CUmodule* module,
206  const void* image,
207  unsigned int num_options,
208  CUjit_option* options,
209  void** option_vals,
210  const int device_id) const {
211  setContext(device_id);
212  checkError(cuModuleLoadDataEx(module, image, num_options, options, option_vals));
213 }
214 
215 void CudaMgr::unloadGpuModuleData(CUmodule* module, const int device_id) const {
216  std::lock_guard<std::mutex> device_lock(device_mutex_);
217  CHECK(module);
218  setContext(device_id);
219  try {
220  auto code = cuModuleUnload(*module);
221  // If the Cuda driver has already shut down, ignore the resulting errors.
222  if (code != CUDA_ERROR_DEINITIALIZED) {
223  checkError(code);
224  }
225  } catch (const std::runtime_error& e) {
226  LOG(ERROR) << "CUDA Error: " << e.what();
227  }
228 }
229 
230 CudaMgr::CudaMemoryUsage CudaMgr::getCudaMemoryUsage() {
231  CudaMemoryUsage usage;
232  cuMemGetInfo(&usage.free, &usage.total);
233  return usage;
234 }
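
getCudaMemoryUsage reports free/total bytes for whatever CUDA context is current, so callers typically select a device first. A small usage sketch; the helper name is ours and the MiB formatting is illustrative:

#include <iostream>
#include "CudaMgr/CudaMgr.h"

void log_gpu_memory(CudaMgr_Namespace::CudaMgr& cuda_mgr, const int device_num) {
  cuda_mgr.setContext(device_num);  // free/total are reported for the current context
  auto const usage = cuda_mgr.getCudaMemoryUsage();
  std::cout << "GPU " << device_num << ": " << (usage.free >> 20) << " MiB free of "
            << (usage.total >> 20) << " MiB\n";
}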
235 
236 void CudaMgr::fillDeviceProperties() {
237  device_properties_.resize(device_count_);
238  cuDriverGetVersion(&gpu_driver_version_);
239  for (int device_num = 0; device_num < device_count_; ++device_num) {
240  checkError(
241  cuDeviceGet(&device_properties_[device_num].device, device_num + start_gpu_));
242  CUuuid cuda_uuid;
243  checkError(cuDeviceGetUuid(&cuda_uuid, device_properties_[device_num].device));
244  device_properties_[device_num].uuid = heavyai::UUID(cuda_uuid.bytes);
245  checkError(cuDeviceGetAttribute(&device_properties_[device_num].computeMajor,
246  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
247  device_properties_[device_num].device));
248  checkError(cuDeviceGetAttribute(&device_properties_[device_num].computeMinor,
249  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
250  device_properties_[device_num].device));
251  checkError(cuDeviceTotalMem(&device_properties_[device_num].globalMem,
252  device_properties_[device_num].device));
253  checkError(cuDeviceGetAttribute(&device_properties_[device_num].constantMem,
254  CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
255  device_properties_[device_num].device));
256  checkError(
257  cuDeviceGetAttribute(&device_properties_[device_num].sharedMemPerMP,
258  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
259  device_properties_[device_num].device));
260  checkError(cuDeviceGetAttribute(&device_properties_[device_num].sharedMemPerBlock,
261  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
262  device_properties_[device_num].device));
263  checkError(cuDeviceGetAttribute(&device_properties_[device_num].numMPs,
264  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
265  device_properties_[device_num].device));
266  checkError(cuDeviceGetAttribute(&device_properties_[device_num].warpSize,
267  CU_DEVICE_ATTRIBUTE_WARP_SIZE,
268  device_properties_[device_num].device));
269  checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxThreadsPerBlock,
270  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
271  device_properties_[device_num].device));
272  checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxRegistersPerBlock,
273  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
274  device_properties_[device_num].device));
275  checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxRegistersPerMP,
276  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
277  device_properties_[device_num].device));
278  checkError(cuDeviceGetAttribute(&device_properties_[device_num].pciBusId,
279  CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,
280  device_properties_[device_num].device));
281  checkError(cuDeviceGetAttribute(&device_properties_[device_num].pciDeviceId,
282  CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
283  device_properties_[device_num].device));
284  checkError(cuDeviceGetAttribute(&device_properties_[device_num].clockKhz,
285  CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
286  device_properties_[device_num].device));
287  checkError(cuDeviceGetAttribute(&device_properties_[device_num].memoryClockKhz,
288  CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
289  device_properties_[device_num].device));
290  checkError(cuDeviceGetAttribute(&device_properties_[device_num].memoryBusWidth,
291  CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
292  device_properties_[device_num].device));
293  device_properties_[device_num].memoryBandwidthGBs =
294  device_properties_[device_num].memoryClockKhz / 1000000.0 / 8.0 *
295  device_properties_[device_num].memoryBusWidth;
296 
297  // capture memory allocation granularity
298  device_properties_[device_num].allocationGranularity = getGranularity(device_num);
299  }
300  min_shared_memory_per_block_for_all_devices =
301  computeMinSharedMemoryPerBlockForAllDevices();
302  min_num_mps_for_all_devices = computeMinNumMPsForAllDevices();
303 }
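
As a sanity check on the memoryBandwidthGBs expression above: dividing the kHz clock by 1,000,000 converts it to GHz, dividing the bus width by 8 converts bits to bytes, and the product is GB/s; the expression does not add a DDR multiplier on top of the reported clock. With illustrative numbers, a 1,000,000 kHz (1 GHz) memory clock on a 256-bit bus works out to 32 GB/s:

static_assert(1'000'000 / 1'000'000.0 / 8.0 * 256 == 32.0,
              "1 GHz memory clock on a 256-bit bus -> 32 GB/s by this formula");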
304 
305 int8_t* CudaMgr::allocatePinnedHostMem(const size_t num_bytes) {
306  setContext(0);
307  void* host_ptr;
308  checkError(cuMemHostAlloc(&host_ptr, num_bytes, CU_MEMHOSTALLOC_PORTABLE));
309  return reinterpret_cast<int8_t*>(host_ptr);
310 }
311 
312 int8_t* CudaMgr::allocateDeviceMem(const size_t num_bytes, const int device_num) {
313  std::lock_guard<std::mutex> map_lock(device_mutex_);
314  setContext(device_num);
315 
316  CUdeviceptr device_ptr{};
317  CUmemGenericAllocationHandle handle{};
318  auto granularity = getGranularity(device_num);
319  // reserve the actual memory
320  auto padded_num_bytes = computePaddedBufferSize(num_bytes, granularity);
321  auto status = cuMemAddressReserve(&device_ptr, padded_num_bytes, granularity, 0, 0);
322 
323  if (status == CUDA_SUCCESS) {
324  // create a handle for the allocation
325  CUmemAllocationProp allocation_prop{};
326  allocation_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
327  allocation_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
328  allocation_prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
329  allocation_prop.location.id = device_num + start_gpu_;
330  status = cuMemCreate(&handle, padded_num_bytes, &allocation_prop, 0);
331 
332  if (status == CUDA_SUCCESS) {
333  // map the memory
334  status = cuMemMap(device_ptr, padded_num_bytes, 0, handle, 0);
335 
336  if (status == CUDA_SUCCESS) {
337  // set the memory access
338  CUmemAccessDesc access_desc{};
339  access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
340  access_desc.location.id = device_num + start_gpu_;
341  access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
342  status = cuMemSetAccess(device_ptr, padded_num_bytes, &access_desc, 1);
343  }
344  }
345  }
346 
347  if (status != CUDA_SUCCESS) {
348  // clean up in reverse order
349  if (device_ptr && handle) {
350  cuMemUnmap(device_ptr, padded_num_bytes);
351  }
352  if (handle) {
353  cuMemRelease(handle);
354  }
355  if (device_ptr) {
356  cuMemAddressFree(device_ptr, padded_num_bytes);
357  }
358  throw CudaErrorException(status);
359  }
360  DeviceMemoryMetadata device_ptr_metadata{
361  padded_num_bytes, handle, getDeviceProperties(device_num)->uuid, device_num};
362  CHECK(
363  device_memory_allocation_map_.try_emplace(device_ptr, device_ptr_metadata).second);
364  return reinterpret_cast<int8_t*>(device_ptr);
365 }
366 
367 void CudaMgr::freeDeviceMem(int8_t* device_ptr) {
368  // take lock
369  std::lock_guard<std::mutex> map_lock(device_mutex_);
370  // find in map
371  auto const cu_device_ptr = reinterpret_cast<CUdeviceptr>(device_ptr);
372  auto const itr = device_memory_allocation_map_.find(cu_device_ptr);
373  CHECK(itr != device_memory_allocation_map_.end());
374  // get attributes
375  auto const size = itr->second.size;
376  auto const handle = itr->second.handle;
377  // attempt to unmap, release, free
378  auto status_unmap = cuMemUnmap(cu_device_ptr, size);
379  auto status_release = cuMemRelease(handle);
380  auto status_free = cuMemAddressFree(cu_device_ptr, size);
381  // remove from map
382  device_memory_allocation_map_.erase(itr);
383  // check for errors
384  checkError(status_unmap);
385  checkError(status_release);
386  checkError(status_free);
387 }
388 
389 void CudaMgr::zeroDeviceMem(int8_t* device_ptr,
390  const size_t num_bytes,
391  const int device_num,
392  CUstream cuda_stream) {
393  setDeviceMem(device_ptr, 0, num_bytes, device_num, cuda_stream);
394 }
395 
396 void CudaMgr::setDeviceMem(int8_t* device_ptr,
397  const unsigned char uc,
398  const size_t num_bytes,
399  const int device_num,
400  CUstream cuda_stream) {
401  setContext(device_num);
402  if (!cuda_stream) {
403  checkError(cuMemsetD8(reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes));
404  } else {
405  checkError(cuMemsetD8Async(
406  reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes, cuda_stream));
407  checkError(cuStreamSynchronize(cuda_stream));
408  }
409 }
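
Taken together, the allocation and transfer helpers above support a simple host-to-device round trip. A hedged usage sketch, assuming a single visible GPU and that these members are callable from client code; the buffer size and fill value are illustrative:

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>
#include "CudaMgr/CudaMgr.h"

void round_trip() {
  CudaMgr_Namespace::CudaMgr cuda_mgr(/*num_gpus=*/1);  // first GPU only
  constexpr size_t num_bytes = size_t(1) << 20;         // 1 MiB, illustrative
  std::vector<int8_t> host_in(num_bytes, 42), host_out(num_bytes, 0);

  int8_t* dev_ptr = cuda_mgr.allocateDeviceMem(num_bytes, /*device_num=*/0);
  cuda_mgr.zeroDeviceMem(dev_ptr, num_bytes, /*device_num=*/0);
  cuda_mgr.copyHostToDevice(dev_ptr, host_in.data(), num_bytes, /*device_num=*/0);
  cuda_mgr.copyDeviceToHost(host_out.data(), dev_ptr, num_bytes);
  assert(std::memcmp(host_in.data(), host_out.data(), num_bytes) == 0);
  cuda_mgr.freeDeviceMem(dev_ptr);
}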
410 
415 bool CudaMgr::isArchMaxwellOrLaterForAll() const {
416  for (int i = 0; i < device_count_; i++) {
417  if (device_properties_[i].computeMajor < 5) {
418  return false;
419  }
420  }
421  return true;
422 }
423 
428 bool CudaMgr::isArchVoltaOrGreaterForAll() const {
429  for (int i = 0; i < device_count_; i++) {
430  if (device_properties_[i].computeMajor < 7) {
431  return false;
432  }
433  }
434  return true;
435 }
436 
441 size_t CudaMgr::computeMinSharedMemoryPerBlockForAllDevices() const {
442  int shared_mem_size =
443  device_count_ > 0 ? device_properties_.front().sharedMemPerBlock : 0;
444  for (int d = 1; d < device_count_; d++) {
445  shared_mem_size = std::min(shared_mem_size, device_properties_[d].sharedMemPerBlock);
446  }
447  return shared_mem_size;
448 }
449 
454 size_t CudaMgr::computeMinNumMPsForAllDevices() const {
455  int num_mps = device_count_ > 0 ? device_properties_.front().numMPs : 0;
456  for (int d = 1; d < device_count_; d++) {
457  num_mps = std::min(num_mps, device_properties_[d].numMPs);
458  }
459  return num_mps;
460 }
461 
462 void CudaMgr::createDeviceContexts() {
463  CHECK_EQ(device_contexts_.size(), size_t(0));
464  device_contexts_.resize(device_count_);
465  for (int d = 0; d < device_count_; ++d) {
466  CUresult status = cuCtxCreate(&device_contexts_[d], 0, device_properties_[d].device);
467  if (status != CUDA_SUCCESS) {
468  // an error here leaves the CudaMgr partially constructed, so
469  // clean up before rethrowing:
470  // destroy all contexts created up to this point
471  for (int destroy_id = 0; destroy_id <= d; ++destroy_id) {
472  try {
473  checkError(cuCtxDestroy(device_contexts_[destroy_id]));
474  } catch (const CudaErrorException& e) {
475  LOG(ERROR) << "Failed to destroy CUDA context for device ID " << destroy_id
476  << " with " << e.what()
477  << ". CUDA contexts were being destroyed due to an error creating "
478  "CUDA context for device ID "
479  << d << " out of " << device_count_ << " (" << errorMessage(status)
480  << ").";
481  }
482  }
483  // checkError will translate the message and throw
484  checkError(status);
485  }
486  }
487 }
488 
489 void CudaMgr::setContext(const int device_num) const {
490  // deviceNum is the device number relative to startGpu (realDeviceNum - startGpu_)
491  CHECK_LT(device_num, device_count_);
492  cuCtxSetCurrent(device_contexts_[device_num]);
493 }
494 
495 int CudaMgr::getContext() const {
496  CUcontext cnow;
497  checkError(cuCtxGetCurrent(&cnow));
498  if (cnow == NULL) {
499  throw std::runtime_error("no cuda device context");
500  }
501  int device_num{0};
502  for (auto& c : device_contexts_) {
503  if (c == cnow) {
504  return device_num;
505  }
506  ++device_num;
507  }
508  // TODO(sy): Change device_contexts_ to have O(1) lookup? (Or maybe not worth it.)
509  throw std::runtime_error("invalid cuda device context");
510 }
511 
512 void CudaMgr::logDeviceProperties() const {
513  LOG(INFO) << "Using " << device_count_ << " GPUs.";
514  for (int d = 0; d < device_count_; ++d) {
515  VLOG(1) << "Device: " << device_properties_[d].device;
516  VLOG(1) << "UUID: " << device_properties_[d].uuid;
517  VLOG(1) << "Clock (khz): " << device_properties_[d].clockKhz;
518  VLOG(1) << "Compute Major: " << device_properties_[d].computeMajor;
519  VLOG(1) << "Compute Minor: " << device_properties_[d].computeMinor;
520  VLOG(1) << "PCI bus id: " << device_properties_[d].pciBusId;
521  VLOG(1) << "PCI device id: " << device_properties_[d].pciDeviceId;
522  VLOG(1) << "Per device global memory: "
523  << device_properties_[d].globalMem / 1073741824.0 << " GB";
524  VLOG(1) << "Memory clock (khz): " << device_properties_[d].memoryClockKhz;
525  VLOG(1) << "Memory bandwidth: " << device_properties_[d].memoryBandwidthGBs
526  << " GB/sec";
527 
528  VLOG(1) << "Constant Memory: " << device_properties_[d].constantMem;
529  VLOG(1) << "Shared memory per multiprocessor: "
530  << device_properties_[d].sharedMemPerMP;
531  VLOG(1) << "Shared memory per block: " << device_properties_[d].sharedMemPerBlock;
532  VLOG(1) << "Number of MPs: " << device_properties_[d].numMPs;
533  VLOG(1) << "Warp Size: " << device_properties_[d].warpSize;
534  VLOG(1) << "Max threads per block: " << device_properties_[d].maxThreadsPerBlock;
535  VLOG(1) << "Max registers per block: " << device_properties_[d].maxRegistersPerBlock;
536  VLOG(1) << "Max register per MP: " << device_properties_[d].maxRegistersPerMP;
537  VLOG(1) << "Memory bus width in bits: " << device_properties_[d].memoryBusWidth;
538  }
539 }
540 
541 void CudaMgr::checkError(CUresult status) const {
542  if (status != CUDA_SUCCESS) {
543  throw CudaErrorException(status);
544  }
545 }
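
checkError is the single funnel through which driver-API return codes become exceptions; callers that must not fail during shutdown catch CudaErrorException and filter on getStatus(), as the destructor above does. A standalone sketch of that pattern around an arbitrary driver call (the function name is ours, and it assumes cuInit has already been called, e.g. by a live CudaMgr):

#include <iostream>
#include <cuda.h>
#include "CudaMgr/CudaMgr.h"

void probe_device_count() {
  try {
    int count{0};
    CUresult status = cuDeviceGetCount(&count);  // any CUresult-returning driver call
    if (status != CUDA_SUCCESS) {
      throw CudaMgr_Namespace::CudaErrorException(status);
    }
    std::cout << count << " CUDA device(s) visible\n";
  } catch (const CudaMgr_Namespace::CudaErrorException& e) {
    if (e.getStatus() != CUDA_ERROR_DEINITIALIZED) {  // ignore shutdown races, as above
      std::cerr << "CUDA Error: " << e.what() << '\n';
    }
  }
}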
546 
547 } // namespace CudaMgr_Namespace
548 
549 std::string get_cuda_home(void) {
550  static const char* CUDA_DEFAULT_PATH = "/usr/local/cuda";
551  const char* env = nullptr;
552 
553  if (!(env = getenv("CUDA_HOME")) && !(env = getenv("CUDA_DIR"))) {
554  // check if the default CUDA directory exists: /usr/local/cuda
555  if (boost::filesystem::exists(boost::filesystem::path(CUDA_DEFAULT_PATH))) {
556  env = CUDA_DEFAULT_PATH;
557  }
558  }
559 
560  if (env == nullptr) {
561  LOG(WARNING) << "Could not find CUDA installation path: environment variables "
562  "CUDA_HOME or CUDA_DIR are not defined";
563  return "";
564  }
565 
566  // check if the CUDA directory is sensible:
567  auto cuda_include_dir = env + std::string("/include");
568  auto cuda_h_file = cuda_include_dir + "/cuda.h";
569  if (!boost::filesystem::exists(boost::filesystem::path(cuda_h_file))) {
570  LOG(WARNING) << "cuda.h does not exist in `" << cuda_include_dir << "`. Discarding `"
571  << env << "` as CUDA installation path.";
572  return "";
573  }
574 
575  return std::string(env);
576 }
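
A usage sketch for get_cuda_home: derive a sibling path from the detected installation root. The nvdisasm lookup and the helper name are purely illustrative; the empty-string return signals that no usable installation was found:

#include <string>
#include <boost/filesystem.hpp>

std::string find_nvdisasm() {
  std::string const cuda_home = get_cuda_home();
  if (cuda_home.empty()) {
    return "";
  }
  auto const candidate = boost::filesystem::path(cuda_home) / "bin" / "nvdisasm";
  return boost::filesystem::exists(candidate) ? candidate.string() : std::string{};
}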
577 
578 std::string get_cuda_libdevice_dir(void) {
579  static const char* CUDA_DEFAULT_PATH = "/usr/local/cuda";
580  const char* env = nullptr;
581 
582  if (!(env = getenv("CUDA_HOME")) && !(env = getenv("CUDA_DIR"))) {
583  // check if the default CUDA directory exists: /usr/local/cuda
584  if (boost::filesystem::exists(boost::filesystem::path(CUDA_DEFAULT_PATH))) {
585  env = CUDA_DEFAULT_PATH;
586  }
587  }
588 
589  if (env == nullptr) {
590  LOG(WARNING) << "Could not find CUDA installation path: environment variables "
591  "CUDA_HOME or CUDA_DIR are not defined";
592  return "";
593  }
594 
595  // check if the CUDA directory is sensible:
596  auto libdevice_dir = env + std::string("/nvvm/libdevice");
597  auto libdevice_bc_file = libdevice_dir + "/libdevice.10.bc";
598  if (!boost::filesystem::exists(boost::filesystem::path(libdevice_bc_file))) {
599  LOG(WARNING) << "`" << libdevice_bc_file << "` does not exist. Discarding `" << env
600  << "` as CUDA installation path with libdevice.";
601  return "";
602  }
603 
604  return libdevice_dir;
605 }