// NOTE(review): this file appears to be a garbled extraction of a CUDA
// manager header — the leading integers ("32", "46", ...) are original
// source line numbers fused into the text, and declarations are split
// mid-statement. Comments below annotate visible intent only; do not
// treat this chunk as compilable C++.
32 namespace CudaMgr_Namespace {
// Exception type carrying a CUDA driver API status code (CUresult),
// derived from std::runtime_error so callers can catch it generically.
46 class CudaErrorException :
public std::runtime_error {
// Returns the stored CUresult. status_ is presumably set by a
// constructor that is outside this view — TODO confirm.
50 CUresult getStatus()
const {
return status_; }
// Constructor fragment: manages num_gpus devices starting at ordinal
// start_gpu (defaults to 0).
82 CudaMgr(
const int num_gpus,
const int start_gpu = 0);
// NOTE(review): the parameter runs below are orphaned by the extraction —
// their owning function names are missing. Judging by the full signatures
// listed at the bottom of this file, they presumably belong to
// copyHostToDevice (host_ptr/num_bytes), copyDeviceToHost
// (device_ptr/num_bytes), copyDeviceToDevice (dest/src device nums),
// zeroDeviceMem, and setDeviceMem (uc fill byte) — TODO confirm against
// the original header.
91 const int8_t* host_ptr,
92 const size_t num_bytes,
96 const int8_t* device_ptr,
97 const size_t num_bytes,
102 const size_t num_bytes,
103 const int dest_device_num,
104 const int src_device_num,
112 const size_t num_bytes,
113 const int device_num,
116 const unsigned char uc,
117 const size_t num_bytes,
118 const int device_num,
// Device-ordinal validation fragment: rejects a device_num outside the
// managed range. The tail of the message (device count + closing paren)
// is cut off by the extraction.
136 throw std::runtime_error(
"Specified device number " +
std::to_string(device_num) +
137 " is out of range of number of devices (" +
// Architecture-detection fragment: unknown compute capabilities fall back
// to a Kepler-compatible code path rather than failing hard.
171 LOG(
WARNING) <<
"Unrecognized Nvidia device architecture, falling back to "
172 "Kepler-compatibility.";
// Dispatch on the device's major compute capability; the minor version is
// consulted within at least one case (branch bodies are outside this view).
182 switch (device_properties.computeMajor) {
190 if (device_properties.computeMinor == 0) {
// --- Member declaration fragments (split mid-statement by extraction) ---
// Dumps per-device properties; const — no manager state is mutated.
211 void printDeviceProperties()
const;
// Accessor for the per-device CUDA driver contexts held by this manager.
213 const std::vector<CUcontext>& getDeviceContexts()
const {
return device_contexts_; }
// Accessor for the cached driver version (top-level `const int` return is
// a no-op qualifier — an oddity of the original header, preserved as-is).
214 const int getGpuDriverVersion()
const {
return gpu_driver_version_; }
// Loads compiled module data into a CUmodule on the given device.
// NOTE(review): extraction dropped some parameters between num_options
// and option_values (presumably the image pointer and option keys) —
// TODO confirm against the original header.
216 void loadGpuModuleData(
CUmodule* module,
218 unsigned int num_options,
220 void** option_values,
221 const int device_id)
const;
// Unloads a previously loaded module from the given device.
222 void unloadGpuModuleData(
CUmodule* module,
const int device_id)
const;
// Aggregate for free/total device memory; fields are outside this view.
224 struct CudaMemoryUsage {
229 static CudaMemoryUsage getCudaMemoryUsage();
// --- Private initialization helpers (presumably ctor-time setup) ---
234 void fillDeviceProperties();
235 void initDeviceGroup();
236 void createDeviceContexts();
// Cross-device minima used to pick kernel launch limits valid everywhere.
237 size_t computeMinSharedMemoryPerBlockForAllDevices()
const;
238 size_t computeMinNumMPsForAllDevices()
const;
// Translates a failing CUresult into an error (presumably throws
// CudaErrorException — TODO confirm; the definition is outside this view).
239 void checkError(
CUresult cu_result)
const;
// Cached CUDA driver version, returned by getGpuDriverVersion().
241 int gpu_driver_version_;
// NOTE(review): the lines below are a flat, unordered member list (no
// terminating semicolons) — apparently a documentation-index dump of the
// same class as above, not declaration order. Grouped by purpose:
// -- Host<->device and device<->device transfers (optional CUstream; 0 =
//    default stream) --
void copyHostToDevice(int8_t *device_ptr, const int8_t *host_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
size_t min_num_mps_for_all_devices
heavyai::DeviceGroup device_group_
std::vector< DeviceIdentifier > DeviceGroup
std::mutex device_cleanup_mutex_
// -- Device memory fill / allocation / release --
void setDeviceMem(int8_t *device_ptr, const unsigned char uc, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
int8_t * allocatePinnedHostMem(const size_t num_bytes)
void setContext(const int device_num) const
bool isArchPascalOrLater() const
size_t min_shared_memory_per_block_for_all_devices
void copyDeviceToHost(int8_t *host_ptr, const int8_t *device_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
size_t getMinNumMPsForAllDevices() const
std::vector< CUcontext > device_contexts_
std::string get_cuda_home(void)
void freeDeviceMem(int8_t *device_ptr)
std::string errorMessage(CUresult const status)
int getDeviceCount() const
void copyDeviceToDevice(int8_t *dest_ptr, int8_t *src_ptr, const size_t num_bytes, const int dest_device_num, const int src_device_num, CUstream cuda_stream=0)
size_t getMinSharedMemoryPerBlockForAllDevices() const
// -- Architecture queries (compute-capability predicates and SM naming) --
static std::string deviceArchToSM(const NvidiaDeviceArch arch)
bool isArchMaxwellOrLaterForAll() const
std::vector< DeviceProperties > device_properties_
void freePinnedHostMem(int8_t *host_ptr)
void synchronizeDevices() const
void zeroDeviceMem(int8_t *device_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
const DeviceProperties * getDeviceProperties(const size_t device_num) const
bool isArchMaxwell() const
const heavyai::DeviceGroup & getDeviceGroup() const
bool isArchPascal() const
CudaMgr(const int num_gpus, const int start_gpu=0)
const std::vector< DeviceProperties > & getAllDeviceProperties() const
bool isArchVoltaOrGreaterForAll() const
NvidiaDeviceArch getDeviceArch() const
bool isArchMaxwellOrLater() const
int8_t * allocateDeviceMem(const size_t num_bytes, const int device_num)