33 namespace CudaMgr_Namespace {
47 class CudaErrorException :
public std::runtime_error {
51 CUresult getStatus()
const {
return status_; }
// Constructor declaration; the definition is not visible in this view.
//   num_gpus  - integer GPU count argument (no default).
//   start_gpu - integer argument defaulting to 0; presumably the index of the
//               first device to use — TODO confirm against the definition.
// NOTE(review): as declared here the constructor is callable with a single
// int and is not marked explicit, so an int converts implicitly to CudaMgr.
// Confirm that is intended.
83 CudaMgr(
const int num_gpus,
const int start_gpu = 0);
92 const int8_t* host_ptr,
93 const size_t num_bytes,
97 const int8_t* device_ptr,
98 const size_t num_bytes,
103 const size_t num_bytes,
104 const int dest_device_num,
105 const int src_device_num,
113 const size_t num_bytes,
114 const int device_num,
117 const unsigned char uc,
118 const size_t num_bytes,
119 const int device_num,
137 throw std::runtime_error(
"Specified device number " +
std::to_string(device_num) +
138 " is out of range of number of devices (" +
172 LOG(
WARNING) <<
"Unrecognized Nvidia device architecture, falling back to "
173 "Kepler-compatibility.";
183 switch (device_properties.computeMajor) {
191 if (device_properties.computeMinor < 5) {
// Const member taking no arguments and returning nothing; declaration only.
// NOTE(review): presumably reports the per-device properties; the output
// destination is not visible here — confirm in the definition.
212 void printDeviceProperties()
const;
214 const std::vector<CUcontext>& getDeviceContexts()
const {
return device_contexts_; }
215 const int getGpuDriverVersion()
const {
return gpu_driver_version_; }
217 void loadGpuModuleData(
CUmodule* module,
219 unsigned int num_options,
221 void** option_values,
222 const int device_id)
const;
// Const member declaration; the definition is not visible in this view.
//   module    - pointer to a CUmodule handle.
//   device_id - integer device identifier.
// NOTE(review): by name this is the counterpart of loadGpuModuleData;
// whether *module is reset after unloading cannot be told from here — confirm.
223 void unloadGpuModuleData(
CUmodule* module,
const int device_id)
const;
225 struct CudaMemoryUsage {
// Static member: callable without a CudaMgr instance. Returns a
// CudaMemoryUsage by value.
// NOTE(review): which device/context the figures refer to is not visible
// here — confirm in the definition.
230 static CudaMemoryUsage getCudaMemoryUsage();
// Helper declarations; none of the definitions are visible in this view.
// The three void helpers below are non-const, so they may modify the object.
// NOTE(review): presumably set-up steps run during construction, and the
// call order may matter — confirm against the constructor.
235 void fillDeviceProperties();
236 void initDeviceGroup();
237 void createDeviceContexts();
// Const helpers returning a size_t.
// NOTE(review): by name, each is a minimum taken across all devices — confirm.
238 size_t computeMinSharedMemoryPerBlockForAllDevices()
const;
239 size_t computeMinNumMPsForAllDevices()
const;
// Const helper taking a CUresult and returning nothing.
// NOTE(review): presumably raises on a non-success result — the failure
// behaviour is not visible here, confirm.
240 void checkError(
CUresult cu_result)
const;
// Integer value handed out by getGpuDriverVersion().
// NOTE(review): where and how it is populated is not visible here.
242 int gpu_driver_version_;
void copyHostToDevice(int8_t *device_ptr, const int8_t *host_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
size_t min_num_mps_for_all_devices
std::string get_cuda_libdevice_dir(void)
heavyai::DeviceGroup device_group_
std::vector< DeviceIdentifier > DeviceGroup
std::mutex device_cleanup_mutex_
void setDeviceMem(int8_t *device_ptr, const unsigned char uc, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
int8_t * allocatePinnedHostMem(const size_t num_bytes)
void setContext(const int device_num) const
bool isArchPascalOrLater() const
size_t min_shared_memory_per_block_for_all_devices
void copyDeviceToHost(int8_t *host_ptr, const int8_t *device_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
size_t getMinNumMPsForAllDevices() const
std::vector< CUcontext > device_contexts_
std::string get_cuda_home(void)
void freeDeviceMem(int8_t *device_ptr)
std::string errorMessage(CUresult const status)
int getDeviceCount() const
void copyDeviceToDevice(int8_t *dest_ptr, int8_t *src_ptr, const size_t num_bytes, const int dest_device_num, const int src_device_num, CUstream cuda_stream=0)
size_t getMinSharedMemoryPerBlockForAllDevices() const
static std::string deviceArchToSM(const NvidiaDeviceArch arch)
bool isArchMaxwellOrLaterForAll() const
std::vector< DeviceProperties > device_properties_
void freePinnedHostMem(int8_t *host_ptr)
void synchronizeDevices() const
void zeroDeviceMem(int8_t *device_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
const DeviceProperties * getDeviceProperties(const size_t device_num) const
bool isArchMaxwell() const
const heavyai::DeviceGroup & getDeviceGroup() const
bool isArchPascal() const
CudaMgr(const int num_gpus, const int start_gpu=0)
const std::vector< DeviceProperties > & getAllDeviceProperties() const
bool isArchVoltaOrGreaterForAll() const
NvidiaDeviceArch getDeviceArch() const
bool isArchMaxwellOrLater() const
int8_t * allocateDeviceMem(const size_t num_bytes, const int device_num)