#include "CudaMgr/CudaMgr.h"

#include <algorithm>
#include <cstdlib>
#include <string>

#include <boost/filesystem.hpp>
#include <boost/stacktrace.hpp>

#include "Logger/Logger.h"
namespace CudaMgr_Namespace {
CudaErrorException::CudaErrorException(CUresult status)
    : std::runtime_error(errorMessage(status)), status_(status) {
  // CUDA may already be de-initialized during system shutdown; avoid calling
  // the logger in that case so a standard teardown does not fail.
  if (status != CUDA_ERROR_DEINITIALIZED) {
    VLOG(1) << errorMessage(status);
    VLOG(1) << boost::stacktrace::stacktrace();
  }
}
std::string errorMessage(CUresult const status) {
  const char* errorString{nullptr};
  cuGetErrorString(status, &errorString);
  // Fall back to the bare numeric status when the driver cannot translate it.
  return errorString
             ? "CUDA Error (" + std::to_string(status) + "): " + std::string(errorString)
             : "CUDA Error (" + std::to_string(status) + ")";
}
CudaMgr::CudaMgr(const int num_gpus, const int start_gpu)
    : start_gpu_(start_gpu)
    , min_shared_memory_per_block_for_all_devices(0)
    , min_num_mps_for_all_devices(0) {
  checkError(cuInit(0));
  checkError(cuDeviceGetCount(&device_count_));
  // ... (validation of num_gpus / start_gpu against the detected device count)
  fillDeviceProperties();
  initDeviceGroup();
  createDeviceContexts();
  printDeviceProperties();

  LOG(INFO) << "Warming up the GPU JIT Compiler... (this may take several seconds)";
  setContext(0);
  // ... (warm-up compilation)
  LOG(INFO) << "GPU JIT Compiler initialized.";
}
void CudaMgr::initDeviceGroup() {
  for (int device_id = 0; device_id < device_count_; device_id++) {
    // Register each device (logical id, physical id offset by start_gpu_, and
    // its UUID) in device_group_.
    device_group_.push_back(
        {device_id, device_id + start_gpu_, device_properties_[device_id].uuid});
  }
}
CudaMgr::~CudaMgr() {
  try {
    // Ensure no other thread is mid-cleanup, then drain and destroy all
    // device contexts.
    std::lock_guard<std::mutex> device_lock(device_cleanup_mutex_);
    synchronizeDevices();
    for (int d = 0; d < device_count_; ++d) {
      checkError(cuCtxDestroy(device_contexts_[d]));
    }
  } catch (const CudaErrorException& e) {
    if (e.getStatus() == CUDA_ERROR_DEINITIALIZED) {
      // The driver was already torn down; nothing left to release.
      return;
    }
    LOG(ERROR) << "CUDA Error: " << e.what();
  } catch (const std::runtime_error& e) {
    LOG(ERROR) << "CUDA Error: " << e.what();
  }
}
void CudaMgr::synchronizeDevices() const {
  for (int d = 0; d < device_count_; ++d) {
    setContext(d);
    checkError(cuCtxSynchronize());
  }
}
void CudaMgr::copyHostToDevice(int8_t* device_ptr,
                               const int8_t* host_ptr,
                               const size_t num_bytes,
                               const int device_num,
                               CUstream cuda_stream) {
  setContext(device_num);
  if (!cuda_stream) {
    checkError(
        cuMemcpyHtoD(reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes));
  } else {
    checkError(cuMemcpyHtoDAsync(
        reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes, cuda_stream));
    checkError(cuStreamSynchronize(cuda_stream));
  }
}
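// Usage sketch (hypothetical buffer and device number; assumes a constructed
// CudaMgr `mgr`). Passing cuda_stream = 0 selects the synchronous path above:
//   std::vector<int8_t> host(1024, 0);
//   int8_t* dev = mgr.allocateDeviceMem(host.size(), /*device_num=*/0);
//   mgr.copyHostToDevice(dev, host.data(), host.size(), /*device_num=*/0);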
void CudaMgr::copyDeviceToHost(int8_t* host_ptr,
                               const int8_t* device_ptr,
                               const size_t num_bytes,
                               const int device_num,
                               CUstream cuda_stream) {
  setContext(device_num);
  if (!cuda_stream) {
    checkError(cuMemcpyDtoH(
        host_ptr, reinterpret_cast<const CUdeviceptr>(device_ptr), num_bytes));
  } else {
    checkError(cuMemcpyDtoHAsync(host_ptr,
                                 reinterpret_cast<const CUdeviceptr>(device_ptr),
                                 num_bytes,
                                 cuda_stream));
    checkError(cuStreamSynchronize(cuda_stream));
  }
}
void CudaMgr::copyDeviceToDevice(int8_t* dest_ptr,
                                 int8_t* src_ptr,
                                 const size_t num_bytes,
                                 const int dest_device_num,
                                 const int src_device_num,
                                 CUstream cuda_stream) {
  if (src_device_num == dest_device_num) {
    // Same device: a plain (possibly async) copy within one context.
    setContext(src_device_num);
    if (!cuda_stream) {
      checkError(cuMemcpy(reinterpret_cast<CUdeviceptr>(dest_ptr),
                          reinterpret_cast<CUdeviceptr>(src_ptr), num_bytes));
    } else {
      checkError(cuMemcpyAsync(reinterpret_cast<CUdeviceptr>(dest_ptr),
                               reinterpret_cast<CUdeviceptr>(src_ptr), num_bytes,
                               cuda_stream));
      checkError(cuStreamSynchronize(cuda_stream));
    }
  } else {
    // Cross-device: peer copy between the source and destination contexts.
    if (!cuda_stream) {
      checkError(cuMemcpyPeer(reinterpret_cast<CUdeviceptr>(dest_ptr),
                              device_contexts_[dest_device_num],
                              reinterpret_cast<CUdeviceptr>(src_ptr),
                              device_contexts_[src_device_num], num_bytes));
    } else {
      checkError(cuMemcpyPeerAsync(reinterpret_cast<CUdeviceptr>(dest_ptr),
                                   device_contexts_[dest_device_num],
                                   reinterpret_cast<CUdeviceptr>(src_ptr),
                                   device_contexts_[src_device_num], num_bytes,
                                   cuda_stream));
      checkError(cuStreamSynchronize(cuda_stream));
    }
  }
}
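// Design note: cuMemcpyPeer/cuMemcpyPeerAsync take the destination and source
// contexts explicitly, which is why the cross-device branch indexes
// device_contexts_ by both device numbers, while the same-device branch only
// needs the current context set via setContext().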
void CudaMgr::loadGpuModuleData(CUmodule* module,
                                const void* image,
                                unsigned int num_options,
                                CUjit_option* options,
                                void** option_vals,
                                const int device_id) const {
  setContext(device_id);
  checkError(cuModuleLoadDataEx(module, image, num_options, options, option_vals));
}
void CudaMgr::unloadGpuModuleData(CUmodule* module, const int device_id) const {
  std::lock_guard<std::mutex> device_lock(device_cleanup_mutex_);
  setContext(device_id);
  try {
    auto code = cuModuleUnload(*module);
    // Unloading during process shutdown can race driver de-initialization;
    // ignore that specific status and report anything else.
    if (code != CUDA_ERROR_DEINITIALIZED) {
      checkError(code);
    }
  } catch (const std::runtime_error& e) {
    LOG(ERROR) << "CUDA Error: " << e.what();
  }
}
CudaMgr::CudaMemoryUsage CudaMgr::getCudaMemoryUsage() {
  CudaMemoryUsage usage;
  cuMemGetInfo(&usage.free, &usage.total);
  return usage;
}
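// Usage sketch (hypothetical `mgr`; assumes the calling thread has a current
// CUDA context, since cuMemGetInfo reports on the context's device):
//   auto usage = mgr.getCudaMemoryUsage();
//   VLOG(1) << usage.free << " of " << usage.total << " bytes free";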
void CudaMgr::fillDeviceProperties() {
  device_properties_.resize(device_count_);
  cuDriverGetVersion(&gpu_driver_version_);
  for (int device_num = 0; device_num < device_count_; ++device_num) {
    auto& props = device_properties_[device_num];
    checkError(cuDeviceGet(&props.device, device_num + start_gpu_));
    CUuuid cuda_uuid;
    checkError(cuDeviceGetUuid(&cuda_uuid, props.device));
    props.uuid = heavyai::UUID(cuda_uuid.bytes);
    checkError(cuDeviceTotalMem(&props.globalMem, props.device));
    // One cuDeviceGetAttribute query per property; the DeviceProperties field
    // names and the heavyai::UUID helper above are assumed from CudaMgr.h.
    checkError(cuDeviceGetAttribute(&props.computeMajor,
                                    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                                    props.device));
    checkError(cuDeviceGetAttribute(&props.computeMinor,
                                    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                                    props.device));
    checkError(cuDeviceGetAttribute(
        &props.constantMem, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, props.device));
    checkError(cuDeviceGetAttribute(
        &props.sharedMemPerMP,
        CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
        props.device));
    checkError(cuDeviceGetAttribute(&props.sharedMemPerBlock,
                                    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                                    props.device));
    checkError(cuDeviceGetAttribute(
        &props.numMPs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, props.device));
    checkError(cuDeviceGetAttribute(
        &props.warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, props.device));
    checkError(cuDeviceGetAttribute(&props.maxThreadsPerBlock,
                                    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                    props.device));
    checkError(cuDeviceGetAttribute(&props.maxRegistersPerBlock,
                                    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
                                    props.device));
    checkError(cuDeviceGetAttribute(&props.maxRegistersPerMP,
                                    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
                                    props.device));
    checkError(cuDeviceGetAttribute(
        &props.pciBusId, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, props.device));
    checkError(cuDeviceGetAttribute(
        &props.pciDeviceId, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, props.device));
    checkError(cuDeviceGetAttribute(
        &props.clockKhz, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, props.device));
    checkError(cuDeviceGetAttribute(
        &props.memoryClockKhz, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, props.device));
    checkError(cuDeviceGetAttribute(
        &props.memoryBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, props.device));
  }
  min_shared_memory_per_block_for_all_devices =
      computeMinSharedMemoryPerBlockForAllDevices();
  min_num_mps_for_all_devices = computeMinNumMPsForAllDevices();
}
int8_t* CudaMgr::allocatePinnedHostMem(const size_t num_bytes) {
  setContext(0);
  void* host_ptr;
  checkError(cuMemHostAlloc(&host_ptr, num_bytes, CU_MEMHOSTALLOC_PORTABLE));
  return reinterpret_cast<int8_t*>(host_ptr);
}
int8_t* CudaMgr::allocateDeviceMem(const size_t num_bytes, const int device_num) {
  setContext(device_num);
  CUdeviceptr device_ptr;
  checkError(cuMemAlloc(&device_ptr, num_bytes));
  return reinterpret_cast<int8_t*>(device_ptr);
}
void CudaMgr::freePinnedHostMem(int8_t* host_ptr) {
  checkError(cuMemFreeHost(reinterpret_cast<void*>(host_ptr)));
}
void CudaMgr::freeDeviceMem(int8_t* device_ptr) {
  std::lock_guard<std::mutex> device_lock(device_cleanup_mutex_);
  checkError(cuMemFree(reinterpret_cast<CUdeviceptr>(device_ptr)));
}
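// Usage sketch for the allocator pairs above (hypothetical size and device):
//   int8_t* pinned = mgr.allocatePinnedHostMem(1 << 20);  // page-locked host buffer
//   int8_t* dev = mgr.allocateDeviceMem(1 << 20, /*device_num=*/0);
//   // ... stage data in `pinned`, copyHostToDevice(), compute, copyDeviceToHost() ...
//   mgr.freeDeviceMem(dev);
//   mgr.freePinnedHostMem(pinned);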
void CudaMgr::zeroDeviceMem(int8_t* device_ptr,
                            const size_t num_bytes,
                            const int device_num,
                            CUstream cuda_stream) {
  setDeviceMem(device_ptr, 0, num_bytes, device_num, cuda_stream);
}
void CudaMgr::setDeviceMem(int8_t* device_ptr,
                           const unsigned char uc,
                           const size_t num_bytes,
                           const int device_num,
                           CUstream cuda_stream) {
  setContext(device_num);
  if (!cuda_stream) {
    checkError(cuMemsetD8(reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes));
  } else {
    checkError(cuMemsetD8Async(
        reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes, cuda_stream));
    checkError(cuStreamSynchronize(cuda_stream));
  }
}
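// Design note: zeroDeviceMem() is just setDeviceMem() with uc = 0, and both
// rely on cuMemsetD8, which repeats a single byte; wider fill patterns would
// need cuMemsetD16/cuMemsetD32 or a kernel.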
size_t CudaMgr::computeMinSharedMemoryPerBlockForAllDevices() const {
  int shared_mem_size =
      device_count_ > 0 ? device_properties_.front().sharedMemPerBlock : 0;
  for (int device_num = 0; device_num < device_count_; ++device_num) {
    shared_mem_size =
        std::min(shared_mem_size, device_properties_[device_num].sharedMemPerBlock);
  }
  return shared_mem_size;
}
size_t CudaMgr::computeMinNumMPsForAllDevices() const {
  int num_mps = device_count_ > 0 ? device_properties_.front().numMPs : 0;
  for (int device_num = 0; device_num < device_count_; ++device_num) {
    num_mps = std::min(num_mps, device_properties_[device_num].numMPs);
  }
  return num_mps;
}
void CudaMgr::createDeviceContexts() {
  CHECK_EQ(device_contexts_.size(), size_t(0));
  device_contexts_.resize(device_count_);
  for (int d = 0; d < device_count_; ++d) {
    CUresult status =
        cuCtxCreate(&device_contexts_[d], 0, device_properties_[d].device);
    if (status != CUDA_SUCCESS) {
      // Creation failed part-way: tear down every context created so far
      // before propagating the error.
      for (int destroy_id = 0; destroy_id <= d; ++destroy_id) {
        try {
          checkError(cuCtxDestroy(device_contexts_[destroy_id]));
        } catch (const CudaErrorException& e) {
          LOG(ERROR) << "Failed to destroy CUDA context for device ID " << destroy_id
                     << " with " << e.what()
                     << ". CUDA contexts were being destroyed due to an error creating "
                        "CUDA context for device ID "
                     << d << " out of " << device_count_ << " (" << errorMessage(status)
                     << ")";
        }
      }
      checkError(status);
    }
  }
}
void CudaMgr::setContext(const int device_num) const {
  // device_num is relative to start_gpu_.
  CHECK_LT(device_num, device_count_);
  cuCtxSetCurrent(device_contexts_[device_num]);
}

int CudaMgr::getContext() const {
  CUcontext cnow;
  checkError(cuCtxGetCurrent(&cnow));
  if (cnow == NULL) {
    throw std::runtime_error("no cuda device context");
  }
  // Map the current context back to its device number.
  int device_num{0};
  for (auto& c : device_contexts_) {
    if (c == cnow) {
      return device_num;
    }
    ++device_num;
  }
  throw std::runtime_error("invalid cuda device context");
}
void CudaMgr::printDeviceProperties() const {
  LOG(INFO) << "Using " << device_count_ << " Gpus.";
  for (int d = 0; d < device_count_; ++d) {
    // ...
    VLOG(1) << "Per device global memory: "
            << device_properties_[d].globalMem / 1073741824.0 << " GB";
    // ...
    VLOG(1) << "Shared memory per multiprocessor: "
            << device_properties_[d].sharedMemPerMP << " bytes";
    // ...
  }
}
void CudaMgr::checkError(CUresult status) const {
  if (status != CUDA_SUCCESS) {
    throw CudaErrorException(status);
  }
}
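// Because every driver call above funnels through checkError(), callers deal
// with a single exception type instead of raw CUresult codes, e.g.:
//   try {
//     mgr.synchronizeDevices();
//   } catch (const CudaErrorException& e) {
//     LOG(ERROR) << e.what();
//   }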
}  // namespace CudaMgr_Namespace

std::string get_cuda_home(void) {
  static const char* CUDA_DEFAULT_PATH = "/usr/local/cuda";
  const char* env = nullptr;

  if (!(env = getenv("CUDA_HOME")) && !(env = getenv("CUDA_DIR"))) {
    // No environment override; fall back to the default install location.
    if (boost::filesystem::exists(boost::filesystem::path(CUDA_DEFAULT_PATH))) {
      env = CUDA_DEFAULT_PATH;
    }
  }

  if (env == nullptr) {
    LOG(WARNING) << "Could not find CUDA installation path: environment variables "
                    "CUDA_HOME or CUDA_DIR are not defined";
    return "";
  }

  // Sanity-check the candidate directory before trusting it.
  auto cuda_include_dir = env + std::string("/include");
  auto cuda_h_file = cuda_include_dir + "/cuda.h";
  if (!boost::filesystem::exists(boost::filesystem::path(cuda_h_file))) {
    LOG(WARNING) << "cuda.h does not exist in `" << cuda_include_dir << "`. Discarding `"
                 << env << "` as CUDA installation path.";
    return "";
  }

  return std::string(env);
}
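// Usage sketch: resolution order is CUDA_HOME, then CUDA_DIR, then
// /usr/local/cuda, with an empty string meaning "not found":
//   auto cuda_home = get_cuda_home();
//   if (!cuda_home.empty()) {
//     auto include_dir = cuda_home + "/include";  // verified to hold cuda.h above
//   }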
std::string get_cuda_libdevice_dir(void) {
  static const char* CUDA_DEFAULT_PATH = "/usr/local/cuda";
  const char* env = nullptr;

  if (!(env = getenv("CUDA_HOME")) && !(env = getenv("CUDA_DIR"))) {
    // No environment override; fall back to the default install location.
    if (boost::filesystem::exists(boost::filesystem::path(CUDA_DEFAULT_PATH))) {
      env = CUDA_DEFAULT_PATH;
    }
  }

  if (env == nullptr) {
    LOG(WARNING) << "Could not find CUDA installation path: environment variables "
                    "CUDA_HOME or CUDA_DIR are not defined";
    return "";
  }

  // Sanity-check that the candidate directory actually ships libdevice.
  auto libdevice_dir = env + std::string("/nvvm/libdevice");
  auto libdevice_bc_file = libdevice_dir + "/libdevice.10.bc";
  if (!boost::filesystem::exists(boost::filesystem::path(libdevice_bc_file))) {
    LOG(WARNING) << "`" << libdevice_bc_file << "` does not exist. Discarding `" << env
                 << "` as CUDA installation path with libdevice.";
    return "";
  }

  return libdevice_dir;
}
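// Usage sketch: the returned directory is where NVVM's libdevice bitcode
// lives, which a JIT can link against generated GPU code (assumed workflow):
//   auto libdevice_dir = get_cuda_libdevice_dir();
//   auto bc_file = libdevice_dir.empty() ? "" : libdevice_dir + "/libdevice.10.bc";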