OmniSciDB  72c90bc290
CudaMgr.cpp
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CudaMgr/CudaMgr.h"
20 
21 #include <algorithm>
22 #include <iostream>
23 #include <stdexcept>
24 
25 #include <boost/filesystem.hpp>
26 #include "Logger/Logger.h"
27 
28 namespace CudaMgr_Namespace {
29 
30 CudaErrorException::CudaErrorException(CUresult status)
31  : std::runtime_error(errorMessage(status)), status_(status) {
32  // cuda already de-initialized can occur during system shutdown. avoid making calls to
33  // the logger to prevent failing during a standard teardown.
34  if (status != CUDA_ERROR_DEINITIALIZED) {
35  VLOG(1) << errorMessage(status);
36  VLOG(1) << boost::stacktrace::stacktrace();
37  }
38 }
39 
40 std::string errorMessage(CUresult const status) {
41  const char* errorString{nullptr};
42  cuGetErrorString(status, &errorString);
43  return errorString
44  ? "CUDA Error (" + std::to_string(status) + "): " + std::string(errorString)
45  : "CUDA Driver API error code " + std::to_string(status);
46 }
47 
48 CudaMgr::CudaMgr(const int num_gpus, const int start_gpu)
49  : start_gpu_(start_gpu)
50  , min_shared_memory_per_block_for_all_devices(0)
51  , min_num_mps_for_all_devices(0)
52  , device_memory_allocation_map_{std::make_unique<DeviceMemoryAllocationMap>()} {
53  checkError(cuInit(0));
54  checkError(cuDeviceGetCount(&device_count_));
55 
56  if (num_gpus > 0) { // numGpus <= 0 will just use number of gpus found
57  device_count_ = std::min(device_count_, num_gpus);
58  } else {
59  // if we are using all gpus we cannot start on a gpu other than 0
60  CHECK_EQ(start_gpu_, 0);
61  }
62  fillDeviceProperties();
63  initDeviceGroup();
64  createDeviceContexts();
65  logDeviceProperties();
66 
67  // warm up the GPU JIT
68  LOG(INFO) << "Warming up the GPU JIT Compiler... (this may take several seconds)";
69  setContext(0);
70  nvidia_jit_warmup();
71  LOG(INFO) << "GPU JIT Compiler initialized.";
72 }
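
// Illustrative usage (not part of the original source): a CudaMgr is normally
// constructed once at startup. Passing num_gpus <= 0 uses every GPU detected,
// in which case start_gpu must remain 0. A minimal sketch:
//
//   auto cuda_mgr = std::make_unique<CudaMgr_Namespace::CudaMgr>(
//       /*num_gpus=*/-1, /*start_gpu=*/0);
//   cuda_mgr->setContext(0);  // make device 0's context current on this thread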
73 
74 void CudaMgr::initDeviceGroup() {
75  for (int device_id = 0; device_id < device_count_; device_id++) {
76  device_group_.push_back(
77  {device_id, device_id + start_gpu_, device_properties_[device_id].uuid});
78  }
79 }
80 
81 CudaMgr::~CudaMgr() {
82  try {
83  // We don't want to remove the cudaMgr before all other processes have cleaned up.
84  // This should be enforced by the lifetime policies, but take this lock to be safe.
85  std::lock_guard<std::mutex> device_lock(device_mutex_);
86  synchronizeDevices();
87 
88  CHECK(getDeviceMemoryAllocationMap().mapEmpty());
89  device_memory_allocation_map_ = nullptr;
90 
91  for (int d = 0; d < device_count_; ++d) {
92  checkError(cuCtxDestroy(device_contexts_[d]));
93  }
94  } catch (const CudaErrorException& e) {
95  if (e.getStatus() == CUDA_ERROR_DEINITIALIZED) {
96  // TODO(adb / asuhan): Verify cuModuleUnload removes the context
97  return;
98  }
99  LOG(ERROR) << "CUDA Error: " << e.what();
100  } catch (const std::runtime_error& e) {
101  LOG(ERROR) << "CUDA Error: " << e.what();
102  }
103 }
104 
105 size_t CudaMgr::computePaddedBufferSize(size_t buf_size, size_t granularity) const {
106  return (((buf_size + (granularity - 1)) / granularity) * granularity);
107 }
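
// Illustrative arithmetic (not part of the original source): the padded size is
// the requested size rounded up to the next multiple of the granularity. With a
// hypothetical granularity of 2 MiB (2097152 bytes):
//
//   computePaddedBufferSize(1,       2097152) == 2097152
//   computePaddedBufferSize(2097152, 2097152) == 2097152
//   computePaddedBufferSize(2097153, 2097152) == 4194304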
108 
109 size_t CudaMgr::getGranularity(const int device_num) const {
110  CUmemAllocationProp allocation_prop{};
111  allocation_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
112  allocation_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
113  allocation_prop.location.id = device_num;
114  size_t granularity{};
115  checkError(cuMemGetAllocationGranularity(
116  &granularity, &allocation_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
117  return granularity;
118 }
119 
120 void CudaMgr::synchronizeDevices() const {
121  for (int d = 0; d < device_count_; ++d) {
122  setContext(d);
123  checkError(cuCtxSynchronize());
124  }
125 }
126 
127 void CudaMgr::copyHostToDevice(int8_t* device_ptr,
128  const int8_t* host_ptr,
129  const size_t num_bytes,
130  const int device_num,
131  CUstream cuda_stream) {
132  setContext(device_num);
133  if (!cuda_stream) {
134  checkError(
135  cuMemcpyHtoD(reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes));
136  } else {
137  checkError(cuMemcpyHtoDAsync(
138  reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes, cuda_stream));
139  checkError(cuStreamSynchronize(cuda_stream));
140  }
141 }
142 
143 void CudaMgr::copyDeviceToHost(int8_t* host_ptr,
144  const int8_t* device_ptr,
145  const size_t num_bytes,
146  CUstream cuda_stream) {
147  // set device_num based on device_ptr
148  auto const cu_device_ptr = reinterpret_cast<CUdeviceptr>(device_ptr);
149  {
150  std::lock_guard<std::mutex> device_lock(device_mutex_);
151  auto const [allocation_base, allocation] =
152  getDeviceMemoryAllocationMap().getAllocation(cu_device_ptr);
153  CHECK_LE(cu_device_ptr + num_bytes, allocation_base + allocation.size);
154  setContext(allocation.device_num);
155  }
156  if (!cuda_stream) {
157  checkError(cuMemcpyDtoH(host_ptr, cu_device_ptr, num_bytes));
158  } else {
159  checkError(cuMemcpyDtoHAsync(host_ptr, cu_device_ptr, num_bytes, cuda_stream));
160  checkError(cuStreamSynchronize(cuda_stream));
161  }
162 }
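
// Illustrative round trip (not part of the original source), assuming a CudaMgr
// instance named cuda_mgr and at least one GPU. Passing a null stream selects
// the synchronous cuMemcpy* paths above.
//
//   std::vector<int8_t> host_buf(1024, 42);
//   int8_t* dev_buf = cuda_mgr->allocateDeviceMem(host_buf.size(), /*device_num=*/0);
//   cuda_mgr->copyHostToDevice(dev_buf, host_buf.data(), host_buf.size(),
//                              /*device_num=*/0, /*cuda_stream=*/0);
//   std::vector<int8_t> readback(host_buf.size());
//   cuda_mgr->copyDeviceToHost(readback.data(), dev_buf, readback.size(),
//                              /*cuda_stream=*/0);
//   cuda_mgr->freeDeviceMem(dev_buf);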
163 
164 void CudaMgr::copyDeviceToDevice(int8_t* dest_ptr,
165  int8_t* src_ptr,
166  const size_t num_bytes,
167  const int dest_device_num,
168  const int src_device_num,
169  CUstream cuda_stream) {
170  // dest_device_num and src_device_num are the device numbers relative to start_gpu_
171  // (real_device_num - start_gpu_)
172  if (src_device_num == dest_device_num) {
173  setContext(src_device_num);
174  if (!cuda_stream) {
175  checkError(cuMemcpy(reinterpret_cast<CUdeviceptr>(dest_ptr),
176  reinterpret_cast<CUdeviceptr>(src_ptr),
177  num_bytes));
178  } else {
179  checkError(cuMemcpyAsync(reinterpret_cast<CUdeviceptr>(dest_ptr),
180  reinterpret_cast<CUdeviceptr>(src_ptr),
181  num_bytes,
182  cuda_stream));
183  checkError(cuStreamSynchronize(cuda_stream));
184  }
185  } else {
186  if (!cuda_stream) {
187  checkError(cuMemcpyPeer(reinterpret_cast<CUdeviceptr>(dest_ptr),
188  device_contexts_[dest_device_num],
189  reinterpret_cast<CUdeviceptr>(src_ptr),
190  device_contexts_[src_device_num],
191  num_bytes)); // will we always have peer?
192  } else {
193  checkError(cuMemcpyPeerAsync(reinterpret_cast<CUdeviceptr>(dest_ptr),
194  device_contexts_[dest_device_num],
195  reinterpret_cast<CUdeviceptr>(src_ptr),
196  device_contexts_[src_device_num],
197  num_bytes,
198  cuda_stream)); // will we always have peer?
199  checkError(cuStreamSynchronize(cuda_stream));
200  }
201  }
202 }
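
// Illustrative note (not part of the original source): cuMemcpyPeer copies
// between devices even when direct peer access is unavailable (the driver can
// stage through host memory), but direct GPU-to-GPU transfers require peer
// access. A hedged sketch of checking for it with the driver API, assuming
// CUdevice handles src_dev and dest_dev and a CUcontext src_ctx for the source:
//
//   int can_access_peer = 0;
//   cuDeviceCanAccessPeer(&can_access_peer, dest_dev, src_dev);
//   if (can_access_peer) {
//     // with dest_dev's context current, allow it to access src_ctx's memory
//     cuCtxEnablePeerAccess(src_ctx, 0);  // flags must be 0
//   }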
203 
204 void CudaMgr::loadGpuModuleData(CUmodule* module,
205  const void* image,
206  unsigned int num_options,
207  CUjit_option* options,
208  void** option_vals,
209  const int device_id) const {
210  setContext(device_id);
211  checkError(cuModuleLoadDataEx(module, image, num_options, options, option_vals));
212 }
213 
214 void CudaMgr::unloadGpuModuleData(CUmodule* module, const int device_id) const {
215  std::lock_guard<std::mutex> device_lock(device_mutex_);
216  CHECK(module);
217  setContext(device_id);
218  try {
219  auto code = cuModuleUnload(*module);
220  // If the Cuda driver has already shut down, ignore the resulting errors.
221  if (code != CUDA_ERROR_DEINITIALIZED) {
222  checkError(code);
223  }
224  } catch (const std::runtime_error& e) {
225  LOG(ERROR) << "CUDA Error: " << e.what();
226  }
227 }
228 
229 std::vector<CudaMgr::CudaMemoryUsage> CudaMgr::getCudaMemoryUsage() {
230  std::vector<CudaMgr::CudaMemoryUsage> m;
231  std::lock_guard<std::mutex> map_lock(device_mutex_);
232  CUcontext cnow;
233  checkError(cuCtxGetCurrent(&cnow));
234  for (int device_num = 0; device_num < device_count_; ++device_num) {
235  setContext(device_num);
236  CudaMemoryUsage usage;
237  cuMemGetInfo(&usage.free, &usage.total);
238  m.push_back(usage);
239  }
240  cuCtxSetCurrent(cnow);
241  return m;
242 }
243 
244 std::string CudaMgr::getCudaMemoryUsageInString() {
245  auto const device_mem_status = getCudaMemoryUsage();
246  std::ostringstream oss;
247  int device_id = 0;
248  oss << "{ \"name\": \"GPU Memory Info\", ";
249  for (auto& info : device_mem_status) {
250  oss << "{\"device_id\": " << device_id++ << ", \"freeMB\": " << info.free / 1048576.0
251  << ", \"totalMB\": " << info.total / 1048576.0 << "} ";
252  }
253  oss << "}";
254  return oss.str();
255 }
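
// Illustrative output (not part of the original source), for a hypothetical
// single GPU with 32 GB of memory of which 30 GB is free. Note the result is a
// human-readable summary rather than strict JSON:
//
//   { "name": "GPU Memory Info", {"device_id": 0, "freeMB": 30720, "totalMB": 32768} }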
256 
257 void CudaMgr::fillDeviceProperties() {
258  device_properties_.resize(device_count_);
259  cuDriverGetVersion(&gpu_driver_version_);
260  for (int device_num = 0; device_num < device_count_; ++device_num) {
261  checkError(
262  cuDeviceGet(&device_properties_[device_num].device, device_num + start_gpu_));
263  CUuuid cuda_uuid;
264  checkError(cuDeviceGetUuid(&cuda_uuid, device_properties_[device_num].device));
265  device_properties_[device_num].uuid = heavyai::UUID(cuda_uuid.bytes);
266  checkError(cuDeviceGetAttribute(&device_properties_[device_num].computeMajor,
267  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
268  device_properties_[device_num].device));
269  checkError(cuDeviceGetAttribute(&device_properties_[device_num].computeMinor,
270  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
271  device_properties_[device_num].device));
272  checkError(cuDeviceTotalMem(&device_properties_[device_num].globalMem,
273  device_properties_[device_num].device));
274  checkError(cuDeviceGetAttribute(&device_properties_[device_num].constantMem,
275  CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
276  device_properties_[device_num].device));
277  checkError(
278  cuDeviceGetAttribute(&device_properties_[device_num].sharedMemPerMP,
279  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
280  device_properties_[device_num].device));
281  checkError(cuDeviceGetAttribute(&device_properties_[device_num].sharedMemPerBlock,
282  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
283  device_properties_[device_num].device));
284  checkError(cuDeviceGetAttribute(&device_properties_[device_num].numMPs,
285  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
286  device_properties_[device_num].device));
287  checkError(cuDeviceGetAttribute(&device_properties_[device_num].warpSize,
288  CU_DEVICE_ATTRIBUTE_WARP_SIZE,
289  device_properties_[device_num].device));
290  checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxThreadsPerBlock,
291  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
292  device_properties_[device_num].device));
293  checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxRegistersPerBlock,
294  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
295  device_properties_[device_num].device));
296  checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxRegistersPerMP,
297  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
298  device_properties_[device_num].device));
299  checkError(cuDeviceGetAttribute(&device_properties_[device_num].pciBusId,
300  CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,
301  device_properties_[device_num].device));
302  checkError(cuDeviceGetAttribute(&device_properties_[device_num].pciDeviceId,
303  CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
304  device_properties_[device_num].device));
305  checkError(cuDeviceGetAttribute(&device_properties_[device_num].clockKhz,
306  CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
307  device_properties_[device_num].device));
308  checkError(cuDeviceGetAttribute(&device_properties_[device_num].memoryClockKhz,
309  CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
310  device_properties_[device_num].device));
311  checkError(cuDeviceGetAttribute(&device_properties_[device_num].memoryBusWidth,
312  CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
313  device_properties_[device_num].device));
314  device_properties_[device_num].memoryBandwidthGBs =
315  device_properties_[device_num].memoryClockKhz / 1000000.0 / 8.0 *
316  device_properties_[device_num].memoryBusWidth;
317 
318  // capture memory allocation granularity
319  device_properties_[device_num].allocationGranularity = getGranularity(device_num);
320  }
321  min_shared_memory_per_block_for_all_devices =
322  computeMinSharedMemoryPerBlockForAllDevices();
323  min_num_mps_for_all_devices = computeMinNumMPsForAllDevices();
324 }
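
// Illustrative arithmetic (not part of the original source) for the bandwidth
// estimate computed in the loop above: a hypothetical device reporting
// memoryClockKhz = 877000 (877 MHz) and memoryBusWidth = 4096 bits gives
//
//   877000 / 1000000.0 / 8.0 * 4096 = ~449 memoryBandwidthGBs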
325 
326 int8_t* CudaMgr::allocatePinnedHostMem(const size_t num_bytes) {
327  setContext(0);
328  void* host_ptr;
329  checkError(cuMemHostAlloc(&host_ptr, num_bytes, CU_MEMHOSTALLOC_PORTABLE));
330  return reinterpret_cast<int8_t*>(host_ptr);
331 }
332 
333 int8_t* CudaMgr::allocateDeviceMem(const size_t num_bytes,
334  const int device_num,
335  const bool is_slab) {
336  std::lock_guard<std::mutex> map_lock(device_mutex_);
337  setContext(device_num);
338 
339  CUdeviceptr device_ptr{};
340  CUmemGenericAllocationHandle handle{};
341  auto granularity = getGranularity(device_num);
342  // reserve the actual memory
343  auto padded_num_bytes = computePaddedBufferSize(num_bytes, granularity);
344  auto status = cuMemAddressReserve(&device_ptr, padded_num_bytes, granularity, 0, 0);
345 
346  if (status == CUDA_SUCCESS) {
347  // create a handle for the allocation
348  CUmemAllocationProp allocation_prop{};
349  allocation_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
350  allocation_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
351  allocation_prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
352  allocation_prop.location.id = device_num + start_gpu_;
353  status = cuMemCreate(&handle, padded_num_bytes, &allocation_prop, 0);
354 
355  if (status == CUDA_SUCCESS) {
356  // map the memory
357  status = cuMemMap(device_ptr, padded_num_bytes, 0, handle, 0);
358 
359  if (status == CUDA_SUCCESS) {
360  // set the memory access
361  CUmemAccessDesc access_desc{};
362  access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
363  access_desc.location.id = device_num + start_gpu_;
364  access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
365  status = cuMemSetAccess(device_ptr, padded_num_bytes, &access_desc, 1);
366  }
367  }
368  }
369 
370  if (status != CUDA_SUCCESS) {
371  // clean up in reverse order
372  if (device_ptr && handle) {
373  cuMemUnmap(device_ptr, padded_num_bytes);
374  }
375  if (handle) {
376  cuMemRelease(handle);
377  }
378  if (device_ptr) {
379  cuMemAddressFree(device_ptr, padded_num_bytes);
380  }
381  throw CudaErrorException(status);
382  }
383  // emplace in the map
384  auto const& device_uuid = getDeviceProperties(device_num)->uuid;
385  getDeviceMemoryAllocationMap().addAllocation(
386  device_ptr, padded_num_bytes, handle, device_uuid, device_num, is_slab);
387  // notify
388  getDeviceMemoryAllocationMap().notifyMapChanged(device_uuid, is_slab);
389  return reinterpret_cast<int8_t*>(device_ptr);
390 }
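
// Illustrative sketch (not part of the original source): the allocation above
// uses the CUDA virtual memory management driver API rather than cuMemAlloc.
// The minimal lifecycle, assuming cuInit() has run and a context is current,
// with error handling elided:
//
//   CUmemAllocationProp prop{};
//   prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
//   prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
//   prop.location.id = 0;  // physical device ordinal
//   size_t granularity{};
//   cuMemGetAllocationGranularity(&granularity, &prop,
//                                 CU_MEM_ALLOC_GRANULARITY_RECOMMENDED);
//   size_t padded = ((num_bytes + granularity - 1) / granularity) * granularity;
//
//   CUdeviceptr va{};                           // 1. reserve a virtual address range
//   cuMemAddressReserve(&va, padded, granularity, 0, 0);
//   CUmemGenericAllocationHandle handle{};      // 2. create the physical backing
//   cuMemCreate(&handle, padded, &prop, 0);
//   cuMemMap(va, padded, 0, handle, 0);         // 3. map physical onto virtual
//   CUmemAccessDesc access{};                   // 4. grant read/write access
//   access.location = prop.location;
//   access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
//   cuMemSetAccess(va, padded, &access, 1);
//   ...
//   cuMemUnmap(va, padded);                     // teardown in reverse order
//   cuMemRelease(handle);
//   cuMemAddressFree(va, padded);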
391 
392 void CudaMgr::freeDeviceMem(int8_t* device_ptr) {
393  // take lock
394  std::lock_guard<std::mutex> map_lock(device_mutex_);
395  // fetch and remove from map
396  auto const cu_device_ptr = reinterpret_cast<CUdeviceptr>(device_ptr);
397  auto allocation = getDeviceMemoryAllocationMap().removeAllocation(cu_device_ptr);
398  // attempt to unmap, release, free
399  auto status_unmap = cuMemUnmap(cu_device_ptr, allocation.size);
400  auto status_release = cuMemRelease(allocation.handle);
401  auto status_free = cuMemAddressFree(cu_device_ptr, allocation.size);
402  // check for errors
403  checkError(status_unmap);
404  checkError(status_release);
405  checkError(status_free);
406  // notify
407  getDeviceMemoryAllocationMap().notifyMapChanged(allocation.device_uuid,
408  allocation.is_slab);
409 }
410 
411 void CudaMgr::zeroDeviceMem(int8_t* device_ptr,
412  const size_t num_bytes,
413  const int device_num,
414  CUstream cuda_stream) {
415  setDeviceMem(device_ptr, 0, num_bytes, device_num, cuda_stream);
416 }
417 
418 void CudaMgr::setDeviceMem(int8_t* device_ptr,
419  const unsigned char uc,
420  const size_t num_bytes,
421  const int device_num,
422  CUstream cuda_stream) {
423  setContext(device_num);
424  if (!cuda_stream) {
425  checkError(cuMemsetD8(reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes));
426  } else {
427  checkError(cuMemsetD8Async(
428  reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes, cuda_stream));
429  checkError(cuStreamSynchronize(cuda_stream));
430  }
431 }
432 
437 bool CudaMgr::isArchMaxwellOrLaterForAll() const {
438  for (int i = 0; i < device_count_; i++) {
439  if (device_properties_[i].computeMajor < 5) {
440  return false;
441  }
442  }
443  return true;
444 }
445 
450 bool CudaMgr::isArchVoltaOrGreaterForAll() const {
451  for (int i = 0; i < device_count_; i++) {
452  if (device_properties_[i].computeMajor < 7) {
453  return false;
454  }
455  }
456  return true;
457 }
458 
463 size_t CudaMgr::computeMinSharedMemoryPerBlockForAllDevices() const {
464  int shared_mem_size =
465  device_count_ > 0 ? device_properties_.front().sharedMemPerBlock : 0;
466  for (int d = 1; d < device_count_; d++) {
467  shared_mem_size = std::min(shared_mem_size, device_properties_[d].sharedMemPerBlock);
468  }
469  return shared_mem_size;
470 }
471 
476 size_t CudaMgr::computeMinNumMPsForAllDevices() const {
477  int num_mps = device_count_ > 0 ? device_properties_.front().numMPs : 0;
478  for (int d = 1; d < device_count_; d++) {
479  num_mps = std::min(num_mps, device_properties_[d].numMPs);
480  }
481  return num_mps;
482 }
483 
484 void CudaMgr::createDeviceContexts() {
485  CHECK_EQ(device_contexts_.size(), size_t(0));
486  device_contexts_.resize(device_count_);
487  for (int d = 0; d < device_count_; ++d) {
488  CUresult status = cuCtxCreate(&device_contexts_[d], 0, device_properties_[d].device);
489  if (status != CUDA_SUCCESS) {
490  // this is called from the constructor; if it fails the destructor never
491  // runs, so clean up here by
492  // destroying all contexts created up to this point
493  for (int destroy_id = 0; destroy_id <= d; ++destroy_id) {
494  try {
495  checkError(cuCtxDestroy(device_contexts_[destroy_id]));
496  } catch (const CudaErrorException& e) {
497  LOG(ERROR) << "Failed to destroy CUDA context for device ID " << destroy_id
498  << " with " << e.what()
499  << ". CUDA contexts were being destroyed due to an error creating "
500  "CUDA context for device ID "
501  << d << " out of " << device_count_ << " (" << errorMessage(status)
502  << ").";
503  }
504  }
505  // checkError will translate the message and throw
506  checkError(status);
507  }
508  }
509 }
510 
511 void CudaMgr::setContext(const int device_num) const {
512  // deviceNum is the device number relative to startGpu (realDeviceNum - startGpu_)
513  CHECK_LT(device_num, device_count_);
514  cuCtxSetCurrent(device_contexts_[device_num]);
515 }
516 
517 int CudaMgr::getContext() const {
518  CUcontext cnow;
519  checkError(cuCtxGetCurrent(&cnow));
520  if (cnow == NULL) {
521  throw std::runtime_error("no cuda device context");
522  }
523  int device_num{0};
524  for (auto& c : device_contexts_) {
525  if (c == cnow) {
526  return device_num;
527  }
528  ++device_num;
529  }
530  // TODO(sy): Change device_contexts_ to have O(1) lookup? (Or maybe not worth it.)
531  throw std::runtime_error("invalid cuda device context");
532 }
533 
534 void CudaMgr::logDeviceProperties() const {
535  LOG(INFO) << "Using " << device_count_ << " Gpus.";
536  for (int d = 0; d < device_count_; ++d) {
537  VLOG(1) << "Device: " << device_properties_[d].device;
538  VLOG(1) << "UUID: " << device_properties_[d].uuid;
539  VLOG(1) << "Clock (khz): " << device_properties_[d].clockKhz;
540  VLOG(1) << "Compute Major: " << device_properties_[d].computeMajor;
541  VLOG(1) << "Compute Minor: " << device_properties_[d].computeMinor;
542  VLOG(1) << "PCI bus id: " << device_properties_[d].pciBusId;
543  VLOG(1) << "PCI device id: " << device_properties_[d].pciDeviceId;
544  VLOG(1) << "Per device global memory: "
545  << device_properties_[d].globalMem / 1073741824.0 << " GB";
546  VLOG(1) << "Memory clock (khz): " << device_properties_[d].memoryClockKhz;
547  VLOG(1) << "Memory bandwidth: " << device_properties_[d].memoryBandwidthGBs
548  << " GB/sec";
549 
550  VLOG(1) << "Constant Memory: " << device_properties_[d].constantMem;
551  VLOG(1) << "Shared memory per multiprocessor: "
552  << device_properties_[d].sharedMemPerMP;
553  VLOG(1) << "Shared memory per block: " << device_properties_[d].sharedMemPerBlock;
554  VLOG(1) << "Number of MPs: " << device_properties_[d].numMPs;
555  VLOG(1) << "Warp Size: " << device_properties_[d].warpSize;
556  VLOG(1) << "Max threads per block: " << device_properties_[d].maxThreadsPerBlock;
557  VLOG(1) << "Max registers per block: " << device_properties_[d].maxRegistersPerBlock;
558  VLOG(1) << "Max register per MP: " << device_properties_[d].maxRegistersPerMP;
559  VLOG(1) << "Memory bus width in bits: " << device_properties_[d].memoryBusWidth;
560  }
561 }
562 
563 void CudaMgr::checkError(CUresult status) const {
564  if (status != CUDA_SUCCESS) {
565  throw CudaErrorException(status);
566  }
567 }
568 
569 DeviceMemoryAllocationMap& CudaMgr::getDeviceMemoryAllocationMap() {
570  CHECK(device_memory_allocation_map_);
571  return *device_memory_allocation_map_;
572 }
573 
574 int CudaMgr::exportHandle(const uint64_t handle) const {
575  int fd{-1};
576  checkError(cuMemExportToShareableHandle(
577  &fd, handle, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0));
578  return fd;
579 }
580 
581 } // namespace CudaMgr_Namespace
582 
583 std::string get_cuda_home(void) {
584  static const char* CUDA_DEFAULT_PATH = "/usr/local/cuda";
585  const char* env = nullptr;
586 
587  if (!(env = getenv("CUDA_HOME")) && !(env = getenv("CUDA_DIR"))) {
588  // check if the default CUDA directory exists: /usr/local/cuda
589  if (boost::filesystem::exists(boost::filesystem::path(CUDA_DEFAULT_PATH))) {
590  env = CUDA_DEFAULT_PATH;
591  }
592  }
593 
594  if (env == nullptr) {
595  LOG(WARNING) << "Could not find CUDA installation path: environment variables "
596  "CUDA_HOME or CUDA_DIR are not defined";
597  return "";
598  }
599 
600  // check if the CUDA directory is sensible:
601  auto cuda_include_dir = env + std::string("/include");
602  auto cuda_h_file = cuda_include_dir + "/cuda.h";
603  if (!boost::filesystem::exists(boost::filesystem::path(cuda_h_file))) {
604  LOG(WARNING) << "cuda.h does not exist in `" << cuda_include_dir << "`. Discarding `"
605  << env << "` as CUDA installation path.";
606  return "";
607  }
608 
609  return std::string(env);
610 }
611 
612 std::string get_cuda_libdevice_dir(void) {
613  static const char* CUDA_DEFAULT_PATH = "/usr/local/cuda";
614  const char* env = nullptr;
615 
616  if (!(env = getenv("CUDA_HOME")) && !(env = getenv("CUDA_DIR"))) {
617  // check if the default CUDA directory exists: /usr/local/cuda
618  if (boost::filesystem::exists(boost::filesystem::path(CUDA_DEFAULT_PATH))) {
619  env = CUDA_DEFAULT_PATH;
620  }
621  }
622 
623  if (env == nullptr) {
624  LOG(WARNING) << "Could not find CUDA installation path: environment variables "
625  "CUDA_HOME or CUDA_DIR are not defined";
626  return "";
627  }
628 
629  // check if the CUDA directory is sensible:
630  auto libdevice_dir = env + std::string("/nvvm/libdevice");
631  auto libdevice_bc_file = libdevice_dir + "/libdevice.10.bc";
632  if (!boost::filesystem::exists(boost::filesystem::path(libdevice_bc_file))) {
633  LOG(WARNING) << "`" << libdevice_bc_file << "` does not exist. Discarding `" << env
634  << "` as CUDA installation path with libdevice.";
635  return "";
636  }
637 
638  return libdevice_dir;
639 }
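
// Illustrative usage (not part of the original source): both helpers above
// return an empty string when no usable CUDA installation is found, so callers
// should check before building paths from the result.
//
//   if (auto cuda_home = get_cuda_home(); !cuda_home.empty()) {
//     auto cuda_include_dir = cuda_home + "/include";  // hypothetical use
//   }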