OmniSciDB  467d548b97
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
CudaMgr_Namespace::CudaMgr Class Reference

#include <CudaMgr.h>

Public Member Functions

 CudaMgr (const int num_gpus, const int start_gpu=0)
 
 ~CudaMgr ()
 
void synchronizeDevices () const
 
int getDeviceCount () const
 
int getStartGpu () const
 
const omnisci::DeviceGroup & getDeviceGroup () const
 
void copyHostToDevice (int8_t *device_ptr, const int8_t *host_ptr, const size_t num_bytes, const int device_num)
 
void copyDeviceToHost (int8_t *host_ptr, const int8_t *device_ptr, const size_t num_bytes, const int device_num)
 
void copyDeviceToDevice (int8_t *dest_ptr, int8_t *src_ptr, const size_t num_bytes, const int dest_device_num, const int src_device_num)
 
int8_t * allocatePinnedHostMem (const size_t num_bytes)
 
int8_t * allocateDeviceMem (const size_t num_bytes, const int device_num)
 
void freePinnedHostMem (int8_t *host_ptr)
 
void freeDeviceMem (int8_t *device_ptr)
 
void zeroDeviceMem (int8_t *device_ptr, const size_t num_bytes, const int device_num)
 
void setDeviceMem (int8_t *device_ptr, const unsigned char uc, const size_t num_bytes, const int device_num)
 
size_t getMinSharedMemoryPerBlockForAllDevices () const
 
size_t getMinNumMPsForAllDevices () const
 
const std::vector< DeviceProperties > & getAllDeviceProperties () const
 
const DeviceProperties * getDeviceProperties (const size_t device_num) const
 
bool isArchMaxwell () const
 
bool isArchMaxwellOrLater () const
 
bool isArchPascal () const
 
bool isArchPascalOrLater () const
 
bool isArchMaxwellOrLaterForAll () const
 
bool isArchVoltaOrGreaterForAll () const
 
NvidiaDeviceArch getDeviceArch () const
 
void setContext (const int device_num) const
 

Static Public Member Functions

static std::string deviceArchToSM (const NvidiaDeviceArch arch)
 

Private Attributes

int device_count_
 
int start_gpu_
 
size_t min_shared_memory_per_block_for_all_devices
 
size_t min_num_mps_for_all_devices
 
std::vector< DeviceProperties > device_properties_
 
omnisci::DeviceGroup device_group_
 
std::vector< CUcontext > device_contexts_
 
std::mutex device_cleanup_mutex_
 

Detailed Description

Definition at line 80 of file CudaMgr.h.

Constructor & Destructor Documentation

CudaMgr_Namespace::CudaMgr::CudaMgr ( const int  num_gpus,
const int  start_gpu = 0 
)

Definition at line 48 of file CudaMgr.cpp.

References CHECK_EQ, CHECK_LE, device_count_, logger::INFO, LOG, nvidia_jit_warmup(), setContext(), and start_gpu_.

49  : start_gpu_(start_gpu)
52  checkError(cuInit(0));
53  checkError(cuDeviceGetCount(&device_count_));
54 
55  if (num_gpus > 0) { // numGpus <= 0 will just use number of gpus found
56  CHECK_LE(num_gpus + start_gpu_, device_count_);
57  device_count_ = std::min(device_count_, num_gpus);
58  } else {
59  // if we are using all gpus we cannot start on a gpu other than 0
60  CHECK_EQ(start_gpu_, 0);
61  }
62  fillDeviceProperties();
63  initDeviceGroup();
64  createDeviceContexts();
65  printDeviceProperties();
66 
67  // warm up the GPU JIT
68  LOG(INFO) << "Warming up the GPU JIT Compiler... (this may take several seconds)";
69  setContext(0);
71  LOG(INFO) << "GPU JIT Compiler initialized.";
72 }
#define CHECK_EQ(x, y)
Definition: Logger.h:205
size_t min_num_mps_for_all_devices
Definition: CudaMgr.h:239
#define LOG(tag)
Definition: Logger.h:188
void nvidia_jit_warmup()
void setContext(const int device_num) const
Definition: CudaMgr.cpp:362
size_t min_shared_memory_per_block_for_all_devices
Definition: CudaMgr.h:238
#define CHECK_LE(x, y)
Definition: Logger.h:208

+ Here is the call graph for this function:

CudaMgr_Namespace::CudaMgr::~CudaMgr ( )

Definition at line 81 of file CudaMgr.cpp.

References device_cleanup_mutex_, device_contexts_, device_count_, logger::ERROR, LOG, and synchronizeDevices().

81  {
82  try {
83  // We don't want to remove the cudaMgr before all other processes have cleaned up.
84  // This should be enforced by the lifetime policies, but take this lock to be safe.
85  std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);
86 
88  for (int d = 0; d < device_count_; ++d) {
89  checkError(cuCtxDestroy(device_contexts_[d]));
90  }
91  } catch (const CudaErrorException& e) {
92  if (e.getStatus() == CUDA_ERROR_DEINITIALIZED) {
93  // TODO(adb / asuhan): Verify cuModuleUnload removes the context
94  return;
95  }
96  LOG(ERROR) << "CUDA Error: " << e.what();
97  } catch (const std::runtime_error& e) {
98  LOG(ERROR) << "CUDA Error: " << e.what();
99  }
100 }
std::mutex device_cleanup_mutex_
Definition: CudaMgr.h:244
#define LOG(tag)
Definition: Logger.h:188
std::vector< CUcontext > device_contexts_
Definition: CudaMgr.h:242
void synchronizeDevices() const
Definition: CudaMgr.cpp:102

+ Here is the call graph for this function:

Member Function Documentation

int8_t * CudaMgr_Namespace::CudaMgr::allocateDeviceMem ( const size_t  num_bytes,
const int  device_num 
)

Definition at line 253 of file CudaMgr.cpp.

References setContext().

Referenced by Buffer_Namespace::GpuCudaBufferMgr::addSlab().

253  {
254  setContext(device_num);
255  CUdeviceptr device_ptr;
256  checkError(cuMemAlloc(&device_ptr, num_bytes));
257  return reinterpret_cast<int8_t*>(device_ptr);
258 }
unsigned long long CUdeviceptr
Definition: nocuda.h:27
void setContext(const int device_num) const
Definition: CudaMgr.cpp:362

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

int8_t * CudaMgr_Namespace::CudaMgr::allocatePinnedHostMem ( const size_t  num_bytes)

Definition at line 246 of file CudaMgr.cpp.

References setContext().

246  {
247  setContext(0);
248  void* host_ptr;
249  checkError(cuMemHostAlloc(&host_ptr, num_bytes, CU_MEMHOSTALLOC_PORTABLE));
250  return reinterpret_cast<int8_t*>(host_ptr);
251 }
void setContext(const int device_num) const
Definition: CudaMgr.cpp:362

+ Here is the call graph for this function:

void CudaMgr_Namespace::CudaMgr::copyDeviceToDevice ( int8_t *  dest_ptr,
int8_t *  src_ptr,
const size_t  num_bytes,
const int  dest_device_num,
const int  src_device_num 
)

Definition at line 127 of file CudaMgr.cpp.

References device_contexts_, and setContext().

Referenced by Buffer_Namespace::GpuCudaBuffer::readData(), and Buffer_Namespace::GpuCudaBuffer::writeData().

131  {
132  // dest_device_num and src_device_num are the device numbers relative to start_gpu_
133  // (real_device_num - start_gpu_)
134  if (src_device_num == dest_device_num) {
135  setContext(src_device_num);
136  checkError(cuMemcpy(reinterpret_cast<CUdeviceptr>(dest_ptr),
137  reinterpret_cast<CUdeviceptr>(src_ptr),
138  num_bytes));
139  } else {
140  checkError(cuMemcpyPeer(reinterpret_cast<CUdeviceptr>(dest_ptr),
141  device_contexts_[dest_device_num],
142  reinterpret_cast<CUdeviceptr>(src_ptr),
143  device_contexts_[src_device_num],
144  num_bytes)); // will we always have peer?
145  }
146 }
void setContext(const int device_num) const
Definition: CudaMgr.cpp:362
std::vector< CUcontext > device_contexts_
Definition: CudaMgr.h:242

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void CudaMgr_Namespace::CudaMgr::copyDeviceToHost ( int8_t *  host_ptr,
const int8_t *  device_ptr,
const size_t  num_bytes,
const int  device_num 
)

Definition at line 118 of file CudaMgr.cpp.

References setContext().

Referenced by Buffer_Namespace::GpuCudaBuffer::readData(), and Buffer_Namespace::CpuBuffer::writeData().

121  {
122  setContext(device_num);
123  checkError(
124  cuMemcpyDtoH(host_ptr, reinterpret_cast<const CUdeviceptr>(device_ptr), num_bytes));
125 }
void setContext(const int device_num) const
Definition: CudaMgr.cpp:362

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void CudaMgr_Namespace::CudaMgr::copyHostToDevice ( int8_t *  device_ptr,
const int8_t *  host_ptr,
const size_t  num_bytes,
const int  device_num 
)

Definition at line 109 of file CudaMgr.cpp.

References setContext().

Referenced by Buffer_Namespace::CpuBuffer::readData(), and Buffer_Namespace::GpuCudaBuffer::writeData().

112  {
113  setContext(device_num);
114  checkError(
115  cuMemcpyHtoD(reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes));
116 }
void setContext(const int device_num) const
Definition: CudaMgr.cpp:362

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

static std::string CudaMgr_Namespace::CudaMgr::deviceArchToSM ( const NvidiaDeviceArch  arch)
inlinestatic

Definition at line 148 of file CudaMgr.h.

References CudaMgr_Namespace::Ampere, CudaMgr_Namespace::Kepler, LOG, CudaMgr_Namespace::Maxwell, CudaMgr_Namespace::Pascal, CudaMgr_Namespace::Turing, UNREACHABLE, CudaMgr_Namespace::Volta, and logger::WARNING.

Referenced by UdfCompiler::compileToGpuByteCode().

148  {
149  // Must match ${CUDA_COMPILATION_ARCH} CMAKE flag
150  switch (arch) {
152  return "sm_35";
154  return "sm_50";
156  return "sm_60";
158  return "sm_70";
160  return "sm_75";
162  return "sm_75";
163  default:
164  LOG(WARNING) << "Unrecognized Nvidia device architecture, falling back to "
165  "Kepler-compatibility.";
166  return "sm_35";
167  }
168  UNREACHABLE();
169  return "";
170  }
#define LOG(tag)
Definition: Logger.h:188
#define UNREACHABLE()
Definition: Logger.h:241

+ Here is the caller graph for this function:

void CudaMgr_Namespace::CudaMgr::freeDeviceMem ( int8_t *  device_ptr)

Definition at line 264 of file CudaMgr.cpp.

References device_cleanup_mutex_.

Referenced by Buffer_Namespace::GpuCudaBufferMgr::freeAllMem().

264  {
265  std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);
266 
267  checkError(cuMemFree(reinterpret_cast<CUdeviceptr>(device_ptr)));
268 }
std::mutex device_cleanup_mutex_
Definition: CudaMgr.h:244

+ Here is the caller graph for this function:

void CudaMgr_Namespace::CudaMgr::freePinnedHostMem ( int8_t *  host_ptr)

Definition at line 260 of file CudaMgr.cpp.

260  {
261  checkError(cuMemFreeHost(reinterpret_cast<void*>(host_ptr)));
262 }
const std::vector<DeviceProperties>& CudaMgr_Namespace::CudaMgr::getAllDeviceProperties ( ) const
inline

Definition at line 120 of file CudaMgr.h.

References device_properties_.

120  {
121  return device_properties_;
122  }
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:240
NvidiaDeviceArch CudaMgr_Namespace::CudaMgr::getDeviceArch ( ) const
inline

Definition at line 172 of file CudaMgr.h.

References CudaMgr_Namespace::Ampere, device_properties_, CudaMgr_Namespace::Kepler, CudaMgr_Namespace::Maxwell, CudaMgr_Namespace::Pascal, CudaMgr_Namespace::Turing, and CudaMgr_Namespace::Volta.

172  {
173  if (device_properties_.size() > 0) {
174  const auto& device_properties = device_properties_.front();
175  switch (device_properties.computeMajor) {
176  case 3:
178  case 5:
180  case 6:
182  case 7:
183  if (device_properties.computeMinor == 0) {
185  } else {
187  }
188  case 8:
190  default:
192  }
193  } else {
194  // always fallback to Kepler if an architecture cannot be detected
196  }
197  }
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:240
int CudaMgr_Namespace::CudaMgr::getDeviceCount ( ) const
inline

Definition at line 86 of file CudaMgr.h.

References device_count_.

Referenced by get_available_gpus(), isArchMaxwell(), isArchMaxwellOrLater(), isArchPascal(), and isArchPascalOrLater().

86 { return device_count_; }

+ Here is the caller graph for this function:

const omnisci::DeviceGroup& CudaMgr_Namespace::CudaMgr::getDeviceGroup ( ) const
inline

Definition at line 88 of file CudaMgr.h.

References device_group_.

88 { return device_group_; }
omnisci::DeviceGroup device_group_
Definition: CudaMgr.h:241
const DeviceProperties* CudaMgr_Namespace::CudaMgr::getDeviceProperties ( const size_t  device_num) const
inline

Definition at line 123 of file CudaMgr.h.

References device_properties_, and to_string().

123  {
124  // device_num is the device number relative to start_gpu_ (real_device_num -
125  // start_gpu_)
126  if (device_num < device_properties_.size()) {
127  return &device_properties_[device_num];
128  }
129  throw std::runtime_error("Specified device number " + std::to_string(device_num) +
130  " is out of range of number of devices (" +
131  std::to_string(device_properties_.size()) + ")");
132  }
std::string to_string(char const *&&v)
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:240

+ Here is the call graph for this function:

size_t CudaMgr_Namespace::CudaMgr::getMinNumMPsForAllDevices ( ) const
inline

Definition at line 118 of file CudaMgr.h.

References min_num_mps_for_all_devices.

118 { return min_num_mps_for_all_devices; }
size_t min_num_mps_for_all_devices
Definition: CudaMgr.h:239
size_t CudaMgr_Namespace::CudaMgr::getMinSharedMemoryPerBlockForAllDevices ( ) const
inline

Definition at line 114 of file CudaMgr.h.

References min_shared_memory_per_block_for_all_devices.

114  {
116  }
size_t min_shared_memory_per_block_for_all_devices
Definition: CudaMgr.h:238
int CudaMgr_Namespace::CudaMgr::getStartGpu ( ) const
inline

Definition at line 87 of file CudaMgr.h.

References start_gpu_.

87 { return start_gpu_; }
bool CudaMgr_Namespace::CudaMgr::isArchMaxwell ( ) const
inline

Definition at line 133 of file CudaMgr.h.

References device_properties_, and getDeviceCount().

133  {
134  return (getDeviceCount() > 0 && device_properties_[0].computeMajor == 5);
135  }
int getDeviceCount() const
Definition: CudaMgr.h:86
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:240

+ Here is the call graph for this function:

bool CudaMgr_Namespace::CudaMgr::isArchMaxwellOrLater ( ) const
inline

Definition at line 136 of file CudaMgr.h.

References device_properties_, and getDeviceCount().

136  {
137  return (getDeviceCount() > 0 && device_properties_[0].computeMajor >= 5);
138  }
int getDeviceCount() const
Definition: CudaMgr.h:86
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:240

+ Here is the call graph for this function:

bool CudaMgr_Namespace::CudaMgr::isArchMaxwellOrLaterForAll ( ) const

Returns true if all devices have the Maxwell micro-architecture or later. Returns false if any device has a compute capability below 5.0.

Definition at line 288 of file CudaMgr.cpp.

References device_count_, and device_properties_.

288  {
289  for (int i = 0; i < device_count_; i++) {
290  if (device_properties_[i].computeMajor < 5) {
291  return false;
292  }
293  }
294  return true;
295 }
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:240
bool CudaMgr_Namespace::CudaMgr::isArchPascal ( ) const
inline

Definition at line 139 of file CudaMgr.h.

References device_properties_, and getDeviceCount().

139  {
140  return (getDeviceCount() > 0 && device_properties_[0].computeMajor == 6);
141  }
int getDeviceCount() const
Definition: CudaMgr.h:86
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:240

+ Here is the call graph for this function:

bool CudaMgr_Namespace::CudaMgr::isArchPascalOrLater ( ) const
inline

Definition at line 142 of file CudaMgr.h.

References device_properties_, and getDeviceCount().

142  {
143  return (getDeviceCount() > 0 && device_properties_[0].computeMajor >= 6);
144  }
int getDeviceCount() const
Definition: CudaMgr.h:86
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:240

+ Here is the call graph for this function:

bool CudaMgr_Namespace::CudaMgr::isArchVoltaOrGreaterForAll ( ) const

Returns true if all devices have the Volta micro-architecture or later. Returns false if any device has a compute capability below 7.0.

Definition at line 301 of file CudaMgr.cpp.

References device_count_, and device_properties_.

301  {
302  for (int i = 0; i < device_count_; i++) {
303  if (device_properties_[i].computeMajor < 7) {
304  return false;
305  }
306  }
307  return true;
308 }
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:240
void CudaMgr_Namespace::CudaMgr::setContext ( const int  device_num) const

Definition at line 362 of file CudaMgr.cpp.

References CHECK_LT, and device_contexts_.

Referenced by allocateDeviceMem(), allocatePinnedHostMem(), copyDeviceToDevice(), copyDeviceToHost(), copyHostToDevice(), CudaMgr(), Executor::interrupt(), setDeviceMem(), and synchronizeDevices().

362  {
363  // deviceNum is the device number relative to startGpu (realDeviceNum - startGpu_)
364  CHECK_LT(device_num, device_count_);
365  cuCtxSetCurrent(device_contexts_[device_num]);
366 }
std::vector< CUcontext > device_contexts_
Definition: CudaMgr.h:242
#define CHECK_LT(x, y)
Definition: Logger.h:207

+ Here is the caller graph for this function:

void CudaMgr_Namespace::CudaMgr::setDeviceMem ( int8_t *  device_ptr,
const unsigned char  uc,
const size_t  num_bytes,
const int  device_num 
)

Definition at line 276 of file CudaMgr.cpp.

References setContext().

Referenced by zeroDeviceMem().

279  {
280  setContext(device_num);
281  checkError(cuMemsetD8(reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes));
282 }
void setContext(const int device_num) const
Definition: CudaMgr.cpp:362

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void CudaMgr_Namespace::CudaMgr::synchronizeDevices ( ) const

Definition at line 102 of file CudaMgr.cpp.

References device_count_, and setContext().

Referenced by ~CudaMgr(), and Buffer_Namespace::GpuCudaBufferMgr::~GpuCudaBufferMgr().

102  {
103  for (int d = 0; d < device_count_; ++d) {
104  setContext(d);
105  checkError(cuCtxSynchronize());
106  }
107 }
void setContext(const int device_num) const
Definition: CudaMgr.cpp:362

+ Here is the call graph for this function:

+ Here is the caller graph for this function:

void CudaMgr_Namespace::CudaMgr::zeroDeviceMem ( int8_t *  device_ptr,
const size_t  num_bytes,
const int  device_num 
)

Definition at line 270 of file CudaMgr.cpp.

References setDeviceMem().

272  {
273  setDeviceMem(device_ptr, 0, num_bytes, device_num);
274 }
void setDeviceMem(int8_t *device_ptr, const unsigned char uc, const size_t num_bytes, const int device_num)
Definition: CudaMgr.cpp:276

+ Here is the call graph for this function:

Member Data Documentation

std::mutex CudaMgr_Namespace::CudaMgr::device_cleanup_mutex_
mutableprivate

Definition at line 244 of file CudaMgr.h.

Referenced by freeDeviceMem(), and ~CudaMgr().

std::vector<CUcontext> CudaMgr_Namespace::CudaMgr::device_contexts_
private

Definition at line 242 of file CudaMgr.h.

Referenced by copyDeviceToDevice(), setContext(), and ~CudaMgr().

int CudaMgr_Namespace::CudaMgr::device_count_
private
omnisci::DeviceGroup CudaMgr_Namespace::CudaMgr::device_group_
private

Definition at line 241 of file CudaMgr.h.

Referenced by getDeviceGroup().

size_t CudaMgr_Namespace::CudaMgr::min_num_mps_for_all_devices
private

Definition at line 239 of file CudaMgr.h.

Referenced by getMinNumMPsForAllDevices().

size_t CudaMgr_Namespace::CudaMgr::min_shared_memory_per_block_for_all_devices
private

Definition at line 238 of file CudaMgr.h.

Referenced by getMinSharedMemoryPerBlockForAllDevices().

int CudaMgr_Namespace::CudaMgr::start_gpu_
private

Definition at line 237 of file CudaMgr.h.

Referenced by CudaMgr(), and getStartGpu().


The documentation for this class was generated from the following files: