OmniSciDB  6686921089
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CudaMgr_Namespace::CudaMgr Class Reference

#include <CudaMgr.h>

Public Member Functions

 CudaMgr (const int num_gpus, const int start_gpu=0)
 
 ~CudaMgr ()
 
void synchronizeDevices () const
 
int getDeviceCount () const
 
int getStartGpu () const
 
const omnisci::DeviceGroup & getDeviceGroup () const
 
void copyHostToDevice (int8_t *device_ptr, const int8_t *host_ptr, const size_t num_bytes, const int device_num)
 
void copyDeviceToHost (int8_t *host_ptr, const int8_t *device_ptr, const size_t num_bytes, const int device_num)
 
void copyDeviceToDevice (int8_t *dest_ptr, int8_t *src_ptr, const size_t num_bytes, const int dest_device_num, const int src_device_num)
 
int8_t * allocatePinnedHostMem (const size_t num_bytes)
 
int8_t * allocateDeviceMem (const size_t num_bytes, const int device_num)
 
void freePinnedHostMem (int8_t *host_ptr)
 
void freeDeviceMem (int8_t *device_ptr)
 
void zeroDeviceMem (int8_t *device_ptr, const size_t num_bytes, const int device_num)
 
void setDeviceMem (int8_t *device_ptr, const unsigned char uc, const size_t num_bytes, const int device_num)
 
size_t getMinSharedMemoryPerBlockForAllDevices () const
 
size_t getMinNumMPsForAllDevices () const
 
const std::vector< DeviceProperties > & getAllDeviceProperties () const
 
const DeviceProperties * getDeviceProperties (const size_t device_num) const
 
bool isArchMaxwell () const
 
bool isArchMaxwellOrLater () const
 
bool isArchPascal () const
 
bool isArchPascalOrLater () const
 
bool isArchMaxwellOrLaterForAll () const
 
bool isArchVoltaOrGreaterForAll () const
 
NvidiaDeviceArch getDeviceArch () const
 
void setContext (const int device_num) const
 

Static Public Member Functions

static std::string deviceArchToSM (const NvidiaDeviceArch arch)
 

Private Attributes

int device_count_
 
int start_gpu_
 
size_t min_shared_memory_per_block_for_all_devices
 
size_t min_num_mps_for_all_devices
 
std::vector< DeviceProperties > device_properties_
 
omnisci::DeviceGroup device_group_
 
std::vector< CUcontext > device_contexts_
 
std::mutex device_cleanup_mutex_
 

Detailed Description

Definition at line 80 of file CudaMgr.h.

Constructor & Destructor Documentation

CudaMgr_Namespace::CudaMgr::CudaMgr ( const int  num_gpus,
const int  start_gpu = 0 
)

Definition at line 48 of file CudaMgr.cpp.

References CHECK_EQ, device_count_, logger::INFO, LOG, nvidia_jit_warmup(), setContext(), and start_gpu_.

49  : start_gpu_(start_gpu)
52  checkError(cuInit(0));
53  checkError(cuDeviceGetCount(&device_count_));
54 
55  if (num_gpus > 0) { // numGpus <= 0 will just use number of gpus found
56  device_count_ = std::min(device_count_, num_gpus);
57  } else {
58  // if we are using all gpus we cannot start on a gpu other than 0
59  CHECK_EQ(start_gpu_, 0);
60  }
61  fillDeviceProperties();
62  initDeviceGroup();
63  createDeviceContexts();
64  printDeviceProperties();
65 
66  // warm up the GPU JIT
67  LOG(INFO) << "Warming up the GPU JIT Compiler... (this may take several seconds)";
68  setContext(0);
70  LOG(INFO) << "GPU JIT Compiler initialized.";
71 }
#define CHECK_EQ(x, y)
Definition: Logger.h:217
size_t min_num_mps_for_all_devices
Definition: CudaMgr.h:239
#define LOG(tag)
Definition: Logger.h:203
void nvidia_jit_warmup()
void setContext(const int device_num) const
Definition: CudaMgr.cpp:361
size_t min_shared_memory_per_block_for_all_devices
Definition: CudaMgr.h:238

+ Here is the call graph for this function:

CudaMgr_Namespace::CudaMgr::~CudaMgr ( )

Definition at line 80 of file CudaMgr.cpp.

References device_cleanup_mutex_, device_contexts_, device_count_, logger::ERROR, LOG, and synchronizeDevices().

80  {
81  try {
82  // We don't want to remove the cudaMgr before all other processes have cleaned up.
83  // This should be enforced by the lifetime policies, but take this lock to be safe.
84  std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);
85 
87  for (int d = 0; d < device_count_; ++d) {
88  checkError(cuCtxDestroy(device_contexts_[d]));
89  }
90  } catch (const CudaErrorException& e) {
91  if (e.getStatus() == CUDA_ERROR_DEINITIALIZED) {
92  // TODO(adb / asuhan): Verify cuModuleUnload removes the context
93  return;
94  }
95  LOG(ERROR) << "CUDA Error: " << e.what();
96  } catch (const std::runtime_error& e) {
97  LOG(ERROR) << "CUDA Error: " << e.what();
98  }
99 }
std::mutex device_cleanup_mutex_
Definition: CudaMgr.h:244
#define LOG(tag)
Definition: Logger.h:203
std::vector< CUcontext > device_contexts_
Definition: CudaMgr.h:242
void synchronizeDevices() const
Definition: CudaMgr.cpp:101

+ Here is the call graph for this function:

Member Function Documentation

int8_t * CudaMgr_Namespace::CudaMgr::allocateDeviceMem ( const size_t  num_bytes,
const int  device_num 
)

Definition at line 252 of file CudaMgr.cpp.

References setContext().

Referenced by Buffer_Namespace::GpuCudaBufferMgr::addSlab().

252  {
253  setContext(device_num);
254  CUdeviceptr device_ptr;
255  checkError(cuMemAlloc(&device_ptr, num_bytes));
256  return reinterpret_cast<int8_t*>(device_ptr);
257 }
unsigned long long CUdeviceptr
Definition: nocuda.h:27
void setContext(const int device_num) const
Definition: CudaMgr.cpp:361

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int8_t * CudaMgr_Namespace::CudaMgr::allocatePinnedHostMem ( const size_t  num_bytes)

Definition at line 245 of file CudaMgr.cpp.

References setContext().

245  {
246  setContext(0);
247  void* host_ptr;
248  checkError(cuMemHostAlloc(&host_ptr, num_bytes, CU_MEMHOSTALLOC_PORTABLE));
249  return reinterpret_cast<int8_t*>(host_ptr);
250 }
void setContext(const int device_num) const
Definition: CudaMgr.cpp:361

+ Here is the call graph for this function:

void CudaMgr_Namespace::CudaMgr::copyDeviceToDevice ( int8_t *  dest_ptr,
int8_t *  src_ptr,
const size_t  num_bytes,
const int  dest_device_num,
const int  src_device_num 
)

Definition at line 126 of file CudaMgr.cpp.

References device_contexts_, and setContext().

Referenced by Buffer_Namespace::GpuCudaBuffer::readData(), and Buffer_Namespace::GpuCudaBuffer::writeData().

130  {
131  // dest_device_num and src_device_num are the device numbers relative to start_gpu_
132  // (real_device_num - start_gpu_)
133  if (src_device_num == dest_device_num) {
134  setContext(src_device_num);
135  checkError(cuMemcpy(reinterpret_cast<CUdeviceptr>(dest_ptr),
136  reinterpret_cast<CUdeviceptr>(src_ptr),
137  num_bytes));
138  } else {
139  checkError(cuMemcpyPeer(reinterpret_cast<CUdeviceptr>(dest_ptr),
140  device_contexts_[dest_device_num],
141  reinterpret_cast<CUdeviceptr>(src_ptr),
142  device_contexts_[src_device_num],
143  num_bytes)); // will we always have peer?
144  }
145 }
void setContext(const int device_num) const
Definition: CudaMgr.cpp:361
std::vector< CUcontext > device_contexts_
Definition: CudaMgr.h:242

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void CudaMgr_Namespace::CudaMgr::copyDeviceToHost ( int8_t *  host_ptr,
const int8_t *  device_ptr,
const size_t  num_bytes,
const int  device_num 
)

Definition at line 117 of file CudaMgr.cpp.

References setContext().

Referenced by Buffer_Namespace::GpuCudaBuffer::readData(), and Buffer_Namespace::CpuBuffer::writeData().

120  {
121  setContext(device_num);
122  checkError(
123  cuMemcpyDtoH(host_ptr, reinterpret_cast<const CUdeviceptr>(device_ptr), num_bytes));
124 }
void setContext(const int device_num) const
Definition: CudaMgr.cpp:361

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void CudaMgr_Namespace::CudaMgr::copyHostToDevice ( int8_t *  device_ptr,
const int8_t *  host_ptr,
const size_t  num_bytes,
const int  device_num 
)

Definition at line 108 of file CudaMgr.cpp.

References setContext().

Referenced by Buffer_Namespace::CpuBuffer::readData(), and Buffer_Namespace::GpuCudaBuffer::writeData().

111  {
112  setContext(device_num);
113  checkError(
114  cuMemcpyHtoD(reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes));
115 }
void setContext(const int device_num) const
Definition: CudaMgr.cpp:361

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

static std::string CudaMgr_Namespace::CudaMgr::deviceArchToSM ( const NvidiaDeviceArch  arch)
inline static

Definition at line 148 of file CudaMgr.h.

References CudaMgr_Namespace::Ampere, CudaMgr_Namespace::Kepler, LOG, CudaMgr_Namespace::Maxwell, CudaMgr_Namespace::Pascal, CudaMgr_Namespace::Turing, UNREACHABLE, CudaMgr_Namespace::Volta, and logger::WARNING.

148  {
149  // Must match ${CUDA_COMPILATION_ARCH} CMAKE flag
150  switch (arch) {
152  return "sm_35";
154  return "sm_50";
156  return "sm_60";
158  return "sm_70";
160  return "sm_75";
162  return "sm_75";
163  default:
164  LOG(WARNING) << "Unrecognized Nvidia device architecture, falling back to "
165  "Kepler-compatibility.";
166  return "sm_35";
167  }
168  UNREACHABLE();
169  return "";
170  }
#define LOG(tag)
Definition: Logger.h:203
#define UNREACHABLE()
Definition: Logger.h:253
void CudaMgr_Namespace::CudaMgr::freeDeviceMem ( int8_t *  device_ptr)

Definition at line 263 of file CudaMgr.cpp.

References device_cleanup_mutex_.

Referenced by Buffer_Namespace::GpuCudaBufferMgr::freeAllMem().

263  {
264  std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);
265 
266  checkError(cuMemFree(reinterpret_cast<CUdeviceptr>(device_ptr)));
267 }
std::mutex device_cleanup_mutex_
Definition: CudaMgr.h:244

+ Here is the caller graph for this function:

void CudaMgr_Namespace::CudaMgr::freePinnedHostMem ( int8_t *  host_ptr)

Definition at line 259 of file CudaMgr.cpp.

259  {
260  checkError(cuMemFreeHost(reinterpret_cast<void*>(host_ptr)));
261 }
const std::vector<DeviceProperties>& CudaMgr_Namespace::CudaMgr::getAllDeviceProperties ( ) const
inline

Definition at line 120 of file CudaMgr.h.

References device_properties_.

120  {
121  return device_properties_;
122  }
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:240
NvidiaDeviceArch CudaMgr_Namespace::CudaMgr::getDeviceArch ( ) const
inline

Definition at line 172 of file CudaMgr.h.

References CudaMgr_Namespace::Ampere, device_properties_, CudaMgr_Namespace::Kepler, CudaMgr_Namespace::Maxwell, CudaMgr_Namespace::Pascal, CudaMgr_Namespace::Turing, and CudaMgr_Namespace::Volta.

172  {
173  if (device_properties_.size() > 0) {
174  const auto& device_properties = device_properties_.front();
175  switch (device_properties.computeMajor) {
176  case 3:
178  case 5:
180  case 6:
182  case 7:
183  if (device_properties.computeMinor == 0) {
185  } else {
187  }
188  case 8:
190  default:
192  }
193  } else {
194  // always fallback to Kepler if an architecture cannot be detected
196  }
197  }
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:240
int CudaMgr_Namespace::CudaMgr::getDeviceCount ( ) const
inline

Definition at line 86 of file CudaMgr.h.

References device_count_.

Referenced by get_available_gpus(), isArchMaxwell(), isArchMaxwellOrLater(), isArchPascal(), and isArchPascalOrLater().

86 { return device_count_; }

+ Here is the caller graph for this function:

const omnisci::DeviceGroup& CudaMgr_Namespace::CudaMgr::getDeviceGroup ( ) const
inline

Definition at line 88 of file CudaMgr.h.

References device_group_.

88 { return device_group_; }
omnisci::DeviceGroup device_group_
Definition: CudaMgr.h:241
const DeviceProperties* CudaMgr_Namespace::CudaMgr::getDeviceProperties ( const size_t  device_num) const
inline

Definition at line 123 of file CudaMgr.h.

References device_properties_, and to_string().

123  {
124  // device_num is the device number relative to start_gpu_ (real_device_num -
125  // start_gpu_)
126  if (device_num < device_properties_.size()) {
127  return &device_properties_[device_num];
128  }
129  throw std::runtime_error("Specified device number " + std::to_string(device_num) +
130  " is out of range of number of devices (" +
131  std::to_string(device_properties_.size()) + ")");
132  }
std::string to_string(char const *&&v)
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:240

+ Here is the call graph for this function:

size_t CudaMgr_Namespace::CudaMgr::getMinNumMPsForAllDevices ( ) const
inline

Definition at line 118 of file CudaMgr.h.

References min_num_mps_for_all_devices.

118 { return min_num_mps_for_all_devices; }
size_t min_num_mps_for_all_devices
Definition: CudaMgr.h:239
size_t CudaMgr_Namespace::CudaMgr::getMinSharedMemoryPerBlockForAllDevices ( ) const
inline

Definition at line 114 of file CudaMgr.h.

References min_shared_memory_per_block_for_all_devices.

114  {
116  }
size_t min_shared_memory_per_block_for_all_devices
Definition: CudaMgr.h:238
int CudaMgr_Namespace::CudaMgr::getStartGpu ( ) const
inline

Definition at line 87 of file CudaMgr.h.

References start_gpu_.

87 { return start_gpu_; }
bool CudaMgr_Namespace::CudaMgr::isArchMaxwell ( ) const
inline

Definition at line 133 of file CudaMgr.h.

References device_properties_, and getDeviceCount().

133  {
134  return (getDeviceCount() > 0 && device_properties_[0].computeMajor == 5);
135  }
int getDeviceCount() const
Definition: CudaMgr.h:86
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:240

+ Here is the call graph for this function:

bool CudaMgr_Namespace::CudaMgr::isArchMaxwellOrLater ( ) const
inline

Definition at line 136 of file CudaMgr.h.

References device_properties_, and getDeviceCount().

136  {
137  return (getDeviceCount() > 0 && device_properties_[0].computeMajor >= 5);
138  }
int getDeviceCount() const
Definition: CudaMgr.h:86
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:240

+ Here is the call graph for this function:

bool CudaMgr_Namespace::CudaMgr::isArchMaxwellOrLaterForAll ( ) const

Returns true if all devices have the Maxwell micro-architecture or later. Returns false if there is any device with a compute capability of < 5.0.

Definition at line 287 of file CudaMgr.cpp.

References device_count_, device_properties_, and i.

287  {
288  for (int i = 0; i < device_count_; i++) {
289  if (device_properties_[i].computeMajor < 5) {
290  return false;
291  }
292  }
293  return true;
294 }
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:240
bool CudaMgr_Namespace::CudaMgr::isArchPascal ( ) const
inline

Definition at line 139 of file CudaMgr.h.

References device_properties_, and getDeviceCount().

139  {
140  return (getDeviceCount() > 0 && device_properties_[0].computeMajor == 6);
141  }
int getDeviceCount() const
Definition: CudaMgr.h:86
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:240

+ Here is the call graph for this function:

bool CudaMgr_Namespace::CudaMgr::isArchPascalOrLater ( ) const
inline

Definition at line 142 of file CudaMgr.h.

References device_properties_, and getDeviceCount().

Referenced by Executor::isArchPascalOrLater().

142  {
143  return (getDeviceCount() > 0 && device_properties_[0].computeMajor >= 6);
144  }
int getDeviceCount() const
Definition: CudaMgr.h:86
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:240

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

bool CudaMgr_Namespace::CudaMgr::isArchVoltaOrGreaterForAll ( ) const

Returns true if all devices have the Volta micro-architecture or later (compute capability major version >= 7). Returns false if there is any device with a compute capability of < 7.0.

Definition at line 300 of file CudaMgr.cpp.

References device_count_, device_properties_, and i.

300  {
301  for (int i = 0; i < device_count_; i++) {
302  if (device_properties_[i].computeMajor < 7) {
303  return false;
304  }
305  }
306  return true;
307 }
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:240
void CudaMgr_Namespace::CudaMgr::setContext ( const int  device_num) const

Definition at line 361 of file CudaMgr.cpp.

References CHECK_LT, and device_contexts_.

Referenced by allocateDeviceMem(), allocatePinnedHostMem(), copyDeviceToDevice(), copyDeviceToHost(), copyHostToDevice(), CudaMgr(), setDeviceMem(), and synchronizeDevices().

361  {
362  // deviceNum is the device number relative to startGpu (realDeviceNum - startGpu_)
363  CHECK_LT(device_num, device_count_);
364  cuCtxSetCurrent(device_contexts_[device_num]);
365 }
std::vector< CUcontext > device_contexts_
Definition: CudaMgr.h:242
#define CHECK_LT(x, y)
Definition: Logger.h:219

+ Here is the caller graph for this function:

void CudaMgr_Namespace::CudaMgr::setDeviceMem ( int8_t *  device_ptr,
const unsigned char  uc,
const size_t  num_bytes,
const int  device_num 
)

Definition at line 275 of file CudaMgr.cpp.

References setContext().

Referenced by zeroDeviceMem().

278  {
279  setContext(device_num);
280  checkError(cuMemsetD8(reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes));
281 }
void setContext(const int device_num) const
Definition: CudaMgr.cpp:361

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void CudaMgr_Namespace::CudaMgr::synchronizeDevices ( ) const

Definition at line 101 of file CudaMgr.cpp.

References device_count_, and setContext().

Referenced by ~CudaMgr(), and Buffer_Namespace::GpuCudaBufferMgr::~GpuCudaBufferMgr().

101  {
102  for (int d = 0; d < device_count_; ++d) {
103  setContext(d);
104  checkError(cuCtxSynchronize());
105  }
106 }
void setContext(const int device_num) const
Definition: CudaMgr.cpp:361

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void CudaMgr_Namespace::CudaMgr::zeroDeviceMem ( int8_t *  device_ptr,
const size_t  num_bytes,
const int  device_num 
)

Definition at line 269 of file CudaMgr.cpp.

References setDeviceMem().

271  {
272  setDeviceMem(device_ptr, 0, num_bytes, device_num);
273 }
void setDeviceMem(int8_t *device_ptr, const unsigned char uc, const size_t num_bytes, const int device_num)
Definition: CudaMgr.cpp:275

+ Here is the call graph for this function:

Member Data Documentation

std::mutex CudaMgr_Namespace::CudaMgr::device_cleanup_mutex_
mutable private

Definition at line 244 of file CudaMgr.h.

Referenced by freeDeviceMem(), and ~CudaMgr().

std::vector<CUcontext> CudaMgr_Namespace::CudaMgr::device_contexts_
private

Definition at line 242 of file CudaMgr.h.

Referenced by copyDeviceToDevice(), setContext(), and ~CudaMgr().

int CudaMgr_Namespace::CudaMgr::device_count_
private
omnisci::DeviceGroup CudaMgr_Namespace::CudaMgr::device_group_
private

Definition at line 241 of file CudaMgr.h.

Referenced by getDeviceGroup().

size_t CudaMgr_Namespace::CudaMgr::min_num_mps_for_all_devices
private

Definition at line 239 of file CudaMgr.h.

Referenced by getMinNumMPsForAllDevices().

size_t CudaMgr_Namespace::CudaMgr::min_shared_memory_per_block_for_all_devices
private

Definition at line 238 of file CudaMgr.h.

Referenced by getMinSharedMemoryPerBlockForAllDevices().

int CudaMgr_Namespace::CudaMgr::start_gpu_
private

Definition at line 237 of file CudaMgr.h.

Referenced by CudaMgr(), and getStartGpu().


The documentation for this class was generated from the following files: