#include <boost/filesystem.hpp>

namespace CudaMgr_Namespace {
CudaErrorException::CudaErrorException(CUresult status)
    : std::runtime_error(errorMessage(status)), status_(status) {
  // CUDA may already be de-initialized during a normal process shutdown; avoid
  // calling into the logger in that case so teardown does not fail.
  if (status != CUDA_ERROR_DEINITIALIZED) {
    VLOG(1) << boost::stacktrace::stacktrace();
  }
}
std::string errorMessage(CUresult const status) {
  const char* errorString{nullptr};
  cuGetErrorString(status, &errorString);
  return errorString
             ? "CUDA Error (" + std::to_string(status) + "): " + std::string(errorString)
             : "CUDA Driver API error code " + std::to_string(status);
}
CudaMgr::CudaMgr(const int num_gpus, const int start_gpu)
    : start_gpu_(start_gpu)
    , min_shared_memory_per_block_for_all_devices(0)
    , min_num_mps_for_all_devices(0) {
  checkError(cuInit(0));
  checkError(cuDeviceGetCount(&device_count_));
  if (num_gpus > 0) {
    // A positive num_gpus caps how many of the detected devices are used.
    device_count_ = std::min(device_count_, num_gpus);
  }

  fillDeviceProperties();
  initDeviceGroup();
  createDeviceContexts();
  logDeviceProperties();

  // Warm up the GPU JIT so the first query does not pay the compilation cost.
  LOG(INFO) << "Warming up the GPU JIT Compiler... (this may take several seconds)";
  setContext(0);
  nvidia_jit_warmup();
  LOG(INFO) << "GPU JIT Compiler initialized.";
}
void CudaMgr::initDeviceGroup() {
  for (int device_id = 0; device_id < device_count_; device_id++) {
    device_group_.push_back(
        {device_id, device_id + start_gpu_, device_properties_[device_id].uuid});
  }
}
CudaMgr::~CudaMgr() {
  try {
    for (int d = 0; d < device_count_; ++d) {
      setContext(d);
      checkError(cuCtxDestroy(device_contexts_[d]));
    }
  } catch (const CudaErrorException& e) {
    if (e.getStatus() == CUDA_ERROR_DEINITIALIZED) {
      // The driver is already torn down; there is nothing left to clean up.
      return;
    }
    LOG(ERROR) << "CUDA Error: " << e.what();
  } catch (const std::runtime_error& e) {
    LOG(ERROR) << "CUDA Error: " << e.what();
  }
}
size_t CudaMgr::computePaddedBufferSize(size_t buf_size, size_t granularity) const {
  return (((buf_size + (granularity - 1)) / granularity) * granularity);
}
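// Example: with the common 2 MiB granularity, a request for 5'000'000 bytes is
// padded to 6'291'456 (3 * 2'097'152), the smallest granularity multiple that
// covers the request.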
size_t CudaMgr::getGranularity(const int device_num) const {
  CUmemAllocationProp allocation_prop{};
  allocation_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  allocation_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  allocation_prop.location.id = device_num;
  size_t granularity{};
  checkError(cuMemGetAllocationGranularity(
      &granularity, &allocation_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
  return granularity;
}
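// CU_MEM_ALLOC_GRANULARITY_RECOMMENDED may report a larger value than the
// minimum granularity when that improves performance; the reservation,
// creation, and mapping calls below all use sizes padded to this value.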
void CudaMgr::synchronizeDevices() const {
  for (int d = 0; d < device_count_; ++d) {
    setContext(d);
    checkError(cuCtxSynchronize());
  }
}
void CudaMgr::copyHostToDevice(int8_t* device_ptr,
                               const int8_t* host_ptr,
                               const size_t num_bytes,
                               const int device_num,
                               CUstream cuda_stream) {
  setContext(device_num);
  if (!cuda_stream) {
    checkError(
        cuMemcpyHtoD(reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes));
  } else {
    checkError(cuMemcpyHtoDAsync(
        reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes, cuda_stream));
    checkError(cuStreamSynchronize(cuda_stream));
  }
}
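// Both branches block until the copy completes: the stream variant only orders
// the transfer with other work queued on cuda_stream before synchronizing.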
void CudaMgr::copyDeviceToHost(int8_t* host_ptr,
                               const int8_t* device_ptr,
                               const size_t num_bytes,
                               CUstream cuda_stream) {
  auto const cu_device_ptr = reinterpret_cast<CUdeviceptr>(device_ptr);
  // The allocation map is keyed by allocation base address, so the owning
  // allocation is the last entry at or below the pointer being copied from.
  auto const itr = std::prev(device_memory_allocation_map_.upper_bound(cu_device_ptr));
  auto const& allocation_base = itr->first;
  auto const& allocation_size = itr->second.size;
  CHECK_LE(cu_device_ptr + num_bytes, allocation_base + allocation_size);
  auto const& allocation_device_num = itr->second.device_num;
  setContext(allocation_device_num);
  if (!cuda_stream) {
    checkError(cuMemcpyDtoH(host_ptr, cu_device_ptr, num_bytes));
  } else {
    checkError(cuMemcpyDtoHAsync(host_ptr, cu_device_ptr, num_bytes, cuda_stream));
    checkError(cuStreamSynchronize(cuda_stream));
  }
}
void CudaMgr::copyDeviceToDevice(int8_t* dest_ptr,
                                 int8_t* src_ptr,
                                 const size_t num_bytes,
                                 const int dest_device_num,
                                 const int src_device_num,
                                 CUstream cuda_stream) {
  // Device numbers are relative to start_gpu_.
  auto const dest = reinterpret_cast<CUdeviceptr>(dest_ptr);
  auto const src = reinterpret_cast<CUdeviceptr>(src_ptr);
  if (src_device_num == dest_device_num) {
    setContext(src_device_num);
    if (!cuda_stream) {
      checkError(cuMemcpy(dest, src, num_bytes));
    } else {
      checkError(cuMemcpyAsync(dest, src, num_bytes, cuda_stream));
      checkError(cuStreamSynchronize(cuda_stream));
    }
  } else {
    if (!cuda_stream) {
      checkError(cuMemcpyPeer(dest, device_contexts_[dest_device_num],
                              src, device_contexts_[src_device_num], num_bytes));
    } else {
      checkError(cuMemcpyPeerAsync(dest, device_contexts_[dest_device_num],
                                   src, device_contexts_[src_device_num],
                                   num_bytes, cuda_stream));
      checkError(cuStreamSynchronize(cuda_stream));
    }
  }
}
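// cuMemcpyPeer and cuMemcpyPeerAsync take the destination and source contexts
// explicitly, so cross-device copies work regardless of which context is
// current on the calling thread.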
void CudaMgr::loadGpuModuleData(CUmodule* module,
                                const void* image,
                                unsigned int num_options,
                                CUjit_option* options,
                                void** option_vals,
                                const int device_id) const {
  setContext(device_id);
  checkError(cuModuleLoadDataEx(module, image, num_options, options, option_vals));
}
void CudaMgr::unloadGpuModuleData(CUmodule* module, const int device_id) const {
  CHECK(module);
  setContext(device_id);
  try {
    auto code = cuModuleUnload(*module);
    // During process exit the driver may already be de-initialized before all
    // module unload handlers run; treat that case as a no-op, not an error.
    if (code != CUDA_ERROR_DEINITIALIZED) {
      checkError(code);
    }
  } catch (const std::runtime_error& e) {
    LOG(ERROR) << "CUDA Error: " << e.what();
  }
}
CudaMgr::CudaMemoryUsage CudaMgr::getCudaMemoryUsage() {
  CudaMemoryUsage usage;
  cuMemGetInfo(&usage.free, &usage.total);
  return usage;
}
void CudaMgr::fillDeviceProperties() {
  device_properties_.resize(device_count_);
  cuDriverGetVersion(&gpu_driver_version_);
  for (int device_num = 0; device_num < device_count_; ++device_num) {
    auto& prop = device_properties_[device_num];
    checkError(cuDeviceGet(&prop.device, device_num + start_gpu_));
    checkError(cuDeviceTotalMem(&prop.globalMem, prop.device));
    checkError(cuDeviceGetAttribute(
        &prop.computeMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, prop.device));
    checkError(cuDeviceGetAttribute(
        &prop.computeMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, prop.device));
    checkError(cuDeviceGetAttribute(
        &prop.constantMem, CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, prop.device));
    checkError(cuDeviceGetAttribute(
        &prop.sharedMemPerMP,
        CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
        prop.device));
    checkError(cuDeviceGetAttribute(
        &prop.sharedMemPerBlock, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, prop.device));
    checkError(cuDeviceGetAttribute(
        &prop.numMPs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, prop.device));
    checkError(cuDeviceGetAttribute(&prop.warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, prop.device));
    checkError(cuDeviceGetAttribute(
        &prop.maxThreadsPerBlock, CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, prop.device));
    checkError(cuDeviceGetAttribute(
        &prop.maxRegistersPerBlock, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, prop.device));
    checkError(cuDeviceGetAttribute(
        &prop.maxRegistersPerMP,
        CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
        prop.device));
    checkError(cuDeviceGetAttribute(&prop.pciBusId, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, prop.device));
    checkError(cuDeviceGetAttribute(
        &prop.pciDeviceId, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, prop.device));
    checkError(cuDeviceGetAttribute(&prop.clockKhz, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, prop.device));
    checkError(cuDeviceGetAttribute(
        &prop.memoryClockKhz, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, prop.device));
    checkError(cuDeviceGetAttribute(
        &prop.memoryBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, prop.device));
  }
  min_shared_memory_per_block_for_all_devices =
      computeMinSharedMemoryPerBlockForAllDevices();
  min_num_mps_for_all_devices = computeMinNumMPsForAllDevices();
}
int8_t* CudaMgr::allocatePinnedHostMem(const size_t num_bytes) {
  setContext(0);
  void* host_ptr;
  checkError(cuMemHostAlloc(&host_ptr, num_bytes, CU_MEMHOSTALLOC_PORTABLE));
  return reinterpret_cast<int8_t*>(host_ptr);
}
int8_t* CudaMgr::allocateDeviceMem(const size_t num_bytes, const int device_num) {
  setContext(device_num);
  CUdeviceptr device_ptr{};
  CUmemGenericAllocationHandle handle{};
  auto const granularity = getGranularity(device_num);
  auto const padded_num_bytes = computePaddedBufferSize(num_bytes, granularity);
  // Step 1: reserve a virtual address range for the allocation.
  auto status = cuMemAddressReserve(&device_ptr, padded_num_bytes, granularity, 0, 0);
  if (status == CUDA_SUCCESS) {
    // Step 2: create the physical allocation on the target device.
    CUmemAllocationProp allocation_prop{};
    allocation_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    allocation_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    allocation_prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
    allocation_prop.location.id = device_num + start_gpu_;
    status = cuMemCreate(&handle, padded_num_bytes, &allocation_prop, 0);
    if (status == CUDA_SUCCESS) {
      // Step 3: map the physical allocation into the reserved range.
      status = cuMemMap(device_ptr, padded_num_bytes, 0, handle, 0);
      if (status == CUDA_SUCCESS) {
        // Step 4: enable read/write access to the mapping for the device.
        CUmemAccessDesc access_desc{};
        access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
        access_desc.location.id = device_num + start_gpu_;
        access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
        status = cuMemSetAccess(device_ptr, padded_num_bytes, &access_desc, 1);
      }
    }
  }
  if (status != CUDA_SUCCESS) {
    // Unwind whichever of the steps above succeeded, then rethrow.
    if (device_ptr && handle) {
      cuMemUnmap(device_ptr, padded_num_bytes);
    }
    if (handle) {
      cuMemRelease(handle);
    }
    if (device_ptr) {
      cuMemAddressFree(device_ptr, padded_num_bytes);
    }
    throw CudaErrorException(status);
  }
  // Remember the allocation so freeDeviceMem()/copyDeviceToHost() can find it.
  DeviceMemoryAllocation allocation{};
  allocation.handle = handle;
  allocation.size = padded_num_bytes;
  allocation.device_num = device_num;
  device_memory_allocation_map_.emplace(device_ptr, allocation);
  return reinterpret_cast<int8_t*>(device_ptr);
}
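// The reserve/create/map/set-access sequence above is the standard CUDA virtual
// memory management (VMM) idiom: the virtual address range is reserved
// separately from the physical backing, which is why teardown must undo the
// mapping, the physical allocation, and the reservation independently.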
void CudaMgr::freeDeviceMem(int8_t* device_ptr) {
  auto const cu_device_ptr = reinterpret_cast<CUdeviceptr>(device_ptr);
  auto const itr = device_memory_allocation_map_.find(cu_device_ptr);
  CHECK(itr != device_memory_allocation_map_.end());
  auto const size = itr->second.size;
  auto const handle = itr->second.handle;
  device_memory_allocation_map_.erase(itr);
  // Tear down in reverse order of allocation, collecting the statuses so that
  // all three calls run even if an earlier one fails.
  auto status_unmap = cuMemUnmap(cu_device_ptr, size);
  auto status_release = cuMemRelease(handle);
  auto status_free = cuMemAddressFree(cu_device_ptr, size);
  checkError(status_unmap);
  checkError(status_release);
  checkError(status_free);
}
void CudaMgr::zeroDeviceMem(int8_t* device_ptr,
                            const size_t num_bytes,
                            const int device_num,
                            CUstream cuda_stream) {
  setDeviceMem(device_ptr, 0, num_bytes, device_num, cuda_stream);
}
void CudaMgr::setDeviceMem(int8_t* device_ptr,
                           const unsigned char uc,
                           const size_t num_bytes,
                           const int device_num,
                           CUstream cuda_stream) {
  setContext(device_num);
  if (!cuda_stream) {
    checkError(cuMemsetD8(reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes));
  } else {
    checkError(cuMemsetD8Async(
        reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes, cuda_stream));
    checkError(cuStreamSynchronize(cuda_stream));
  }
}
size_t CudaMgr::computeMinSharedMemoryPerBlockForAllDevices() const {
  int shared_mem_size =
      device_count_ > 0 ? device_properties_.front().sharedMemPerBlock : 0;
  for (int device_num = 1; device_num < device_count_; ++device_num) {
    shared_mem_size =
        std::min(shared_mem_size, device_properties_[device_num].sharedMemPerBlock);
  }
  return shared_mem_size;
}
size_t CudaMgr::computeMinNumMPsForAllDevices() const {
  int num_mps = device_count_ > 0 ? device_properties_.front().numMPs : 0;
  for (int device_num = 1; device_num < device_count_; ++device_num) {
    num_mps = std::min(num_mps, device_properties_[device_num].numMPs);
  }
  return num_mps;
}
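// Both minima are conservative: a kernel compiled once must be launchable on
// every device, so the most constrained device sets the limit for all of them.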
void CudaMgr::createDeviceContexts() {
  CHECK_EQ(device_contexts_.size(), size_t(0));
  device_contexts_.resize(device_count_);
  for (int d = 0; d < device_count_; ++d) {
    CUresult status = cuCtxCreate(&device_contexts_[d], 0, device_properties_[d].device);
    if (status != CUDA_SUCCESS) {
      // Creation failed: destroy every context created so far before throwing.
      for (int destroy_id = 0; destroy_id <= d; ++destroy_id) {
        try {
          checkError(cuCtxDestroy(device_contexts_[destroy_id]));
        } catch (const CudaErrorException& e) {
          LOG(ERROR) << "Failed to destroy CUDA context for device ID " << destroy_id
                     << " with " << e.what()
                     << ". CUDA contexts were being destroyed due to an error creating "
                        "CUDA context for device ID "
                     << d << " out of " << device_count_ << " (" << errorMessage(status)
                     << ")";
        }
      }
      // checkError() translates the failed status into a CudaErrorException.
      checkError(status);
    }
  }
}
void CudaMgr::setContext(const int device_num) const {
  // device_num is zero-based and relative to start_gpu_.
  CHECK_LT(device_num, device_count_);
  cuCtxSetCurrent(device_contexts_[device_num]);
}

int CudaMgr::getContext() const {
  CUcontext cnow;
  checkError(cuCtxGetCurrent(&cnow));
  if (cnow == NULL) {
    throw std::runtime_error("no cuda device context");
  }
  // Map the current context back to its device index.
  int device_num{0};
  for (auto& c : device_contexts_) {
    if (c == cnow) {
      return device_num;
    }
    ++device_num;
  }
  throw std::runtime_error("invalid cuda device context");
}
void CudaMgr::logDeviceProperties() const {
  LOG(INFO) << "Using " << device_count_ << " Gpus.";
  for (int d = 0; d < device_count_; ++d) {
    VLOG(1) << "Per device global memory: "
            << device_properties_[d].globalMem / 1073741824.0 << " GB";
    VLOG(1) << "Shared memory per multiprocessor: "
            << device_properties_[d].sharedMemPerMP / 1024.0 << " Kb";
  }
}
void CudaMgr::checkError(CUresult status) const {
  if (status != CUDA_SUCCESS) {
    throw CudaErrorException(status);
  }
}
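// Driver calls in this file generally funnel through checkError(), so CUDA
// failures surface as C++ exceptions carrying the human-readable message built
// by errorMessage() above.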
std::string get_cuda_home(void) {
  static const char* CUDA_DEFAULT_PATH = "/usr/local/cuda";
  const char* env = nullptr;

  if (!(env = getenv("CUDA_HOME")) && !(env = getenv("CUDA_DIR"))) {
    // Neither variable is set; fall back to the default install location.
    if (boost::filesystem::exists(boost::filesystem::path(CUDA_DEFAULT_PATH))) {
      env = CUDA_DEFAULT_PATH;
    }
  }

  if (env == nullptr) {
    LOG(WARNING) << "Could not find CUDA installation path: environment variables "
                    "CUDA_HOME or CUDA_DIR are not defined";
    return "";
  }

  // Sanity-check the directory before accepting it.
  auto cuda_include_dir = env + std::string("/include");
  auto cuda_h_file = cuda_include_dir + "/cuda.h";
  if (!boost::filesystem::exists(boost::filesystem::path(cuda_h_file))) {
    LOG(WARNING) << "cuda.h does not exist in `" << cuda_include_dir << "`. Discarding `"
                 << env << "` as CUDA installation path.";
    return "";
  }

  return std::string(env);
}
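// Example: with CUDA_HOME=/opt/cuda (a hypothetical install prefix), this
// returns "/opt/cuda" only if /opt/cuda/include/cuda.h exists; otherwise it
// logs a warning and returns an empty string.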
std::string get_cuda_libdevice_dir(void) {
  static const char* CUDA_DEFAULT_PATH = "/usr/local/cuda";
  const char* env = nullptr;

  if (!(env = getenv("CUDA_HOME")) && !(env = getenv("CUDA_DIR"))) {
    // Neither variable is set; fall back to the default install location.
    if (boost::filesystem::exists(boost::filesystem::path(CUDA_DEFAULT_PATH))) {
      env = CUDA_DEFAULT_PATH;
    }
  }

  if (env == nullptr) {
    LOG(WARNING) << "Could not find CUDA installation path: environment variables "
                    "CUDA_HOME or CUDA_DIR are not defined";
    return "";
  }

  // Accept the path only if it actually ships libdevice.
  auto libdevice_dir = env + std::string("/nvvm/libdevice");
  auto libdevice_bc_file = libdevice_dir + "/libdevice.10.bc";
  if (!boost::filesystem::exists(boost::filesystem::path(libdevice_bc_file))) {
    LOG(WARNING) << "`" << libdevice_bc_file << "` does not exist. Discarding `" << env
                 << "` as CUDA installation path with libdevice.";
    return "";
  }

  return libdevice_dir;
}

}  // namespace CudaMgr_Namespace