34 namespace CudaMgr_Namespace {
58 class CudaErrorException :
public std::runtime_error {
// Accessor: returns the value held in status_ as a CUresult.
// Const-qualified, so calling it does not modify the object.
// NOTE(review): presumably a member of the CudaErrorException class declared
// just above; the class body is not fully visible here — confirm.
62 CUresult getStatus()
const {
return status_; }
95 CudaMgr(
const int num_gpus,
const int start_gpu = 0);
106 const int8_t* host_ptr,
107 const size_t num_bytes,
108 const int device_num,
111 const int8_t* device_ptr,
112 const size_t num_bytes,
116 const size_t num_bytes,
117 const int dest_device_num,
118 const int src_device_num,
126 const size_t num_bytes,
127 const int device_num,
130 const unsigned char uc,
131 const size_t num_bytes,
132 const int device_num,
150 throw std::runtime_error(
"Specified device number " +
std::to_string(device_num) +
151 " is out of range of number of devices (" +
185 LOG(
WARNING) <<
"Unrecognized Nvidia device architecture, falling back to "
186 "Kepler-compatibility.";
196 switch (device_properties.computeMajor) {
204 if (device_properties.computeMinor < 5) {
225 void logDeviceProperties()
const;
227 const std::vector<CUcontext>& getDeviceContexts()
const {
230 const int getGpuDriverVersion()
const {
231 return gpu_driver_version_;
234 void loadGpuModuleData(
CUmodule* module,
236 unsigned int num_options,
238 void** option_values,
239 const int device_id)
const;
240 void unloadGpuModuleData(
CUmodule* module,
const int device_id)
const;
242 struct CudaMemoryUsage {
247 static CudaMemoryUsage getCudaMemoryUsage();
252 void fillDeviceProperties();
253 void initDeviceGroup();
254 void createDeviceContexts();
255 size_t computeMinSharedMemoryPerBlockForAllDevices()
const;
256 size_t computeMinNumMPsForAllDevices()
const;
257 void checkError(
CUresult cu_result)
const;
259 int gpu_driver_version_;
void copyHostToDevice(int8_t *device_ptr, const int8_t *host_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
size_t min_num_mps_for_all_devices
std::string get_cuda_libdevice_dir(void)
heavyai::DeviceGroup device_group_
std::vector< DeviceIdentifier > DeviceGroup
size_t getGranularity(const int device_num) const
void setDeviceMem(int8_t *device_ptr, const unsigned char uc, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
int8_t * allocatePinnedHostMem(const size_t num_bytes)
DeviceMemoryAllocationMap device_memory_allocation_map_
void setContext(const int device_num) const
bool isArchPascalOrLater() const
size_t min_shared_memory_per_block_for_all_devices
uint64_t DeviceMemoryPtrConstant
size_t getMinNumMPsForAllDevices() const
std::vector< CUcontext > device_contexts_
std::string get_cuda_home(void)
void freeDeviceMem(int8_t *device_ptr)
std::string errorMessage(CUresult const status)
int getDeviceCount() const
void copyDeviceToDevice(int8_t *dest_ptr, int8_t *src_ptr, const size_t num_bytes, const int dest_device_num, const int src_device_num, CUstream cuda_stream=0)
size_t getMinSharedMemoryPerBlockForAllDevices() const
size_t computePaddedBufferSize(size_t buf_size, size_t granularity) const
static std::string deviceArchToSM(const NvidiaDeviceArch arch)
size_t allocationGranularity
bool isArchMaxwellOrLaterForAll() const
void copyDeviceToHost(int8_t *host_ptr, const int8_t *device_ptr, const size_t num_bytes, CUstream cuda_stream=0)
std::vector< DeviceProperties > device_properties_
std::map< DeviceMemoryPtrConstant, DeviceMemoryMetadata > DeviceMemoryAllocationMap
void freePinnedHostMem(int8_t *host_ptr)
void synchronizeDevices() const
void zeroDeviceMem(int8_t *device_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
const DeviceProperties * getDeviceProperties(const size_t device_num) const
bool isArchMaxwell() const
const heavyai::DeviceGroup & getDeviceGroup() const
bool isArchPascal() const
CudaMgr(const int num_gpus, const int start_gpu=0)
const std::vector< DeviceProperties > & getAllDeviceProperties() const
bool isArchVoltaOrGreaterForAll() const
NvidiaDeviceArch getDeviceArch() const
bool isArchMaxwellOrLater() const
virtual int8_t * allocateDeviceMem(const size_t num_bytes, const int device_num)