OmniSciDB  72c90bc290
CudaMgr_Namespace::CudaMgr Class Reference

#include <CudaMgr.h>

Public Member Functions

 CudaMgr (const int num_gpus, const int start_gpu=0)
 
virtual ~CudaMgr ()
 
void synchronizeDevices () const
 
int getDeviceCount () const
 
int getStartGpu () const
 
const heavyai::DeviceGroup & getDeviceGroup () const
 
size_t computePaddedBufferSize (size_t buf_size, size_t granularity) const
 
size_t getGranularity (const int device_num) const
 
void copyHostToDevice (int8_t *device_ptr, const int8_t *host_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
 
void copyDeviceToHost (int8_t *host_ptr, const int8_t *device_ptr, const size_t num_bytes, CUstream cuda_stream=0)
 
void copyDeviceToDevice (int8_t *dest_ptr, int8_t *src_ptr, const size_t num_bytes, const int dest_device_num, const int src_device_num, CUstream cuda_stream=0)
 
int8_t * allocatePinnedHostMem (const size_t num_bytes)
 
virtual int8_t * allocateDeviceMem (const size_t num_bytes, const int device_num, const bool is_slab=false)
 
void freePinnedHostMem (int8_t *host_ptr)
 
void freeDeviceMem (int8_t *device_ptr)
 
void zeroDeviceMem (int8_t *device_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
 
void setDeviceMem (int8_t *device_ptr, const unsigned char uc, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
 
size_t getMinSharedMemoryPerBlockForAllDevices () const
 
size_t getMinNumMPsForAllDevices () const
 
const std::vector< DeviceProperties > & getAllDeviceProperties () const
 
const DeviceProperties * getDeviceProperties (const size_t device_num) const
 
bool isArchMaxwell () const
 
bool isArchMaxwellOrLater () const
 
bool isArchPascal () const
 
bool isArchPascalOrLater () const
 
bool isArchMaxwellOrLaterForAll () const
 
bool isArchVoltaOrGreaterForAll () const
 
NvidiaDeviceArch getDeviceArch () const
 
void setContext (const int device_num) const
 
int getContext () const
 

Static Public Member Functions

static std::string deviceArchToSM (const NvidiaDeviceArch arch)
 

Private Attributes

int device_count_
 
int start_gpu_
 
size_t min_shared_memory_per_block_for_all_devices
 
size_t min_num_mps_for_all_devices
 
std::vector< DeviceProperties > device_properties_
 
heavyai::DeviceGroup device_group_
 
std::vector< CUcontext > device_contexts_
 
std::mutex device_mutex_
 

Detailed Description

Definition at line 84 of file CudaMgr.h.

Constructor & Destructor Documentation

CudaMgr_Namespace::CudaMgr::CudaMgr ( const int  num_gpus,
const int  start_gpu = 0 
)

Definition at line 48 of file CudaMgr.cpp.

References CHECK_EQ, device_count_, device_group_, device_properties_, logger::INFO, LOG, nvidia_jit_warmup(), setContext(), and start_gpu_.

49  : start_gpu_(start_gpu)
50  , min_shared_memory_per_block_for_all_devices(0)
51  , min_num_mps_for_all_devices(0)
52  , device_memory_allocation_map_{std::make_unique<DeviceMemoryAllocationMap>()} {
53  checkError(cuInit(0));
54  checkError(cuDeviceGetCount(&device_count_));
55 
56  if (num_gpus > 0) { // numGpus <= 0 will just use number of gpus found
57  device_count_ = std::min(device_count_, num_gpus);
58  } else {
59  // if we are using all gpus we cannot start on a gpu other than 0
60  CHECK_EQ(start_gpu_, 0);
61  }
62  fillDeviceProperties();
63  initDeviceGroup();
64  createDeviceContexts();
65  logDeviceProperties();
66 
67  // warm up the GPU JIT
68  LOG(INFO) << "Warming up the GPU JIT Compiler... (this may take several seconds)";
69  setContext(0);
70  nvidia_jit_warmup();
71  LOG(INFO) << "GPU JIT Compiler initialized.";
72 }
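
The constructor initializes the CUDA driver (cuInit), clamps the detected device count to num_gpus when positive, builds one context per device, and warms up the GPU JIT compiler. A minimal construction sketch, assuming a CUDA-enabled build and that CudaMgr.h is on the include path:

#include <CudaMgr.h>

#include <memory>

int main() {
  // num_gpus <= 0 uses every GPU found; in that case start_gpu must be 0.
  auto cuda_mgr = std::make_unique<CudaMgr_Namespace::CudaMgr>(/*num_gpus=*/0);

  // Alternatively, manage two devices beginning at physical device 1:
  //   CudaMgr_Namespace::CudaMgr mgr(/*num_gpus=*/2, /*start_gpu=*/1);

  return cuda_mgr->getDeviceCount() > 0 ? 0 : 1;
}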

CudaMgr_Namespace::CudaMgr::~CudaMgr ( )
virtual

Definition at line 81 of file CudaMgr.cpp.

References CHECK, device_contexts_, device_count_, device_mutex_, logger::ERROR, LOG, and synchronizeDevices().

81  {
82  try {
83  // We don't want to remove the cudaMgr before all other processes have cleaned up.
84  // This should be enforced by the lifetime policies, but take this lock to be safe.
85  std::lock_guard<std::mutex> device_lock(device_mutex_);
86  synchronizeDevices();
87 
88  CHECK(getDeviceMemoryAllocationMap().mapEmpty());
89  device_memory_allocation_map_ = nullptr;
90 
91  for (int d = 0; d < device_count_; ++d) {
92  checkError(cuCtxDestroy(device_contexts_[d]));
93  }
94  } catch (const CudaErrorException& e) {
95  if (e.getStatus() == CUDA_ERROR_DEINITIALIZED) {
96  // TODO(adb / asuhan): Verify cuModuleUnload removes the context
97  return;
98  }
99  LOG(ERROR) << "CUDA Error: " << e.what();
100  } catch (const std::runtime_error& e) {
101  LOG(ERROR) << "CUDA Error: " << e.what();
102  }
103 }

Member Function Documentation

int8_t * CudaMgr_Namespace::CudaMgr::allocateDeviceMem ( const size_t  num_bytes,
const int  device_num,
const bool  is_slab = false 
)
virtual

Definition at line 333 of file CudaMgr.cpp.

References computePaddedBufferSize(), device_mutex_, getDeviceProperties(), getGranularity(), setContext(), start_gpu_, and CudaMgr_Namespace::DeviceProperties::uuid.

Referenced by Buffer_Namespace::GpuCudaBufferMgr::addSlab().

335  {
336  std::lock_guard<std::mutex> map_lock(device_mutex_);
337  setContext(device_num);
338 
339  CUdeviceptr device_ptr{};
340  CUmemGenericAllocationHandle handle{};
341  auto granularity = getGranularity(device_num);
342  // reserve the actual memory
343  auto padded_num_bytes = computePaddedBufferSize(num_bytes, granularity);
344  auto status = cuMemAddressReserve(&device_ptr, padded_num_bytes, granularity, 0, 0);
345 
346  if (status == CUDA_SUCCESS) {
347  // create a handle for the allocation
348  CUmemAllocationProp allocation_prop{};
349  allocation_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
350  allocation_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
351  allocation_prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
352  allocation_prop.location.id = device_num + start_gpu_;
353  status = cuMemCreate(&handle, padded_num_bytes, &allocation_prop, 0);
354 
355  if (status == CUDA_SUCCESS) {
356  // map the memory
357  status = cuMemMap(device_ptr, padded_num_bytes, 0, handle, 0);
358 
359  if (status == CUDA_SUCCESS) {
360  // set the memory access
361  CUmemAccessDesc access_desc{};
362  access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
363  access_desc.location.id = device_num + start_gpu_;
364  access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
365  status = cuMemSetAccess(device_ptr, padded_num_bytes, &access_desc, 1);
366  }
367  }
368  }
369 
370  if (status != CUDA_SUCCESS) {
371  // clean up in reverse order
372  if (device_ptr && handle) {
373  cuMemUnmap(device_ptr, padded_num_bytes);
374  }
375  if (handle) {
376  cuMemRelease(handle);
377  }
378  if (device_ptr) {
379  cuMemAddressFree(device_ptr, padded_num_bytes);
380  }
381  throw CudaErrorException(status);
382  }
383  // emplace in the map
384  auto const& device_uuid = getDeviceProperties(device_num)->uuid;
385  getDeviceMemoryAllocationMap().addAllocation(
386  device_ptr, padded_num_bytes, handle, device_uuid, device_num, is_slab);
387  // notify
388  getDeviceMemoryAllocationMap().notifyMapChanged(device_uuid, is_slab);
389  return reinterpret_cast<int8_t*>(device_ptr);
390 }
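
allocateDeviceMem drives the CUDA virtual memory management API: it reserves an address range (cuMemAddressReserve), creates a physical allocation (cuMemCreate), maps it (cuMemMap), and grants read/write access (cuMemSetAccess), rolling the steps back in reverse order if any of them fails. A hedged usage sketch pairing it with zeroDeviceMem and freeDeviceMem; mgr, the size, and the device index are illustrative:

#include <CudaMgr.h>

#include <cstddef>
#include <cstdint>

// Sketch: allocate, clear, and free device memory through CudaMgr.
void demo_allocation(CudaMgr_Namespace::CudaMgr& mgr) {
  const size_t num_bytes = 64 * 1024 * 1024;  // request; padded internally
  const int device_num = 0;                   // relative to start_gpu_

  int8_t* dev_buf = mgr.allocateDeviceMem(num_bytes, device_num);
  mgr.zeroDeviceMem(dev_buf, num_bytes, device_num);

  // ... use the buffer ...

  // freeDeviceMem looks the pointer up in the allocation map, then
  // unmaps, releases, and frees the reserved address range.
  mgr.freeDeviceMem(dev_buf);
}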

int8_t * CudaMgr_Namespace::CudaMgr::allocatePinnedHostMem ( const size_t  num_bytes)

Definition at line 326 of file CudaMgr.cpp.

References setContext().

326  {
327  setContext(0);
328  void* host_ptr;
329  checkError(cuMemHostAlloc(&host_ptr, num_bytes, CU_MEMHOSTALLOC_PORTABLE));
330  return reinterpret_cast<int8_t*>(host_ptr);
331 }

size_t CudaMgr_Namespace::CudaMgr::computePaddedBufferSize ( size_t  buf_size,
size_t  granularity 
) const

Definition at line 105 of file CudaMgr.cpp.

Referenced by allocateDeviceMem().

105  {
106  return (((buf_size + (granularity - 1)) / granularity) * granularity);
107 }

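The formula rounds buf_size up to the next multiple of granularity. A quick standalone check of the same arithmetic (values illustrative):

#include <cassert>
#include <cstddef>

// Same round-up-to-a-multiple formula used by computePaddedBufferSize.
size_t padded(size_t buf_size, size_t granularity) {
  return ((buf_size + (granularity - 1)) / granularity) * granularity;
}

int main() {
  const size_t two_mib = 2 * 1024 * 1024;
  assert(padded(5 * 1024 * 1024, two_mib) == 6 * 1024 * 1024);  // rounds up
  assert(padded(4 * 1024 * 1024, two_mib) == 4 * 1024 * 1024);  // already aligned
  assert(padded(1, two_mib) == two_mib);  // minimum of one granule
}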

void CudaMgr_Namespace::CudaMgr::copyDeviceToDevice ( int8_t *  dest_ptr,
int8_t *  src_ptr,
const size_t  num_bytes,
const int  dest_device_num,
const int  src_device_num,
CUstream  cuda_stream = 0 
)

Definition at line 164 of file CudaMgr.cpp.

References device_contexts_, and setContext().

Referenced by Buffer_Namespace::GpuCudaBuffer::readData(), and Buffer_Namespace::GpuCudaBuffer::writeData().

169  {
170  // dest_device_num and src_device_num are the device numbers relative to start_gpu_
171  // (real_device_num - start_gpu_)
172  if (src_device_num == dest_device_num) {
173  setContext(src_device_num);
174  if (!cuda_stream) {
175  checkError(cuMemcpy(reinterpret_cast<CUdeviceptr>(dest_ptr),
176  reinterpret_cast<CUdeviceptr>(src_ptr),
177  num_bytes));
178  } else {
179  checkError(cuMemcpyAsync(reinterpret_cast<CUdeviceptr>(dest_ptr),
180  reinterpret_cast<CUdeviceptr>(src_ptr),
181  num_bytes,
182  cuda_stream));
183  checkError(cuStreamSynchronize(cuda_stream));
184  }
185  } else {
186  if (!cuda_stream) {
187  checkError(cuMemcpyPeer(reinterpret_cast<CUdeviceptr>(dest_ptr),
188  device_contexts_[dest_device_num],
189  reinterpret_cast<CUdeviceptr>(src_ptr),
190  device_contexts_[src_device_num],
191  num_bytes)); // will we always have peer?
192  } else {
193  checkError(cuMemcpyPeerAsync(reinterpret_cast<CUdeviceptr>(dest_ptr),
194  device_contexts_[dest_device_num],
195  reinterpret_cast<CUdeviceptr>(src_ptr),
196  device_contexts_[src_device_num],
197  num_bytes,
198  cuda_stream)); // will we always have peer?
199  checkError(cuStreamSynchronize(cuda_stream));
200  }
201  }
202 }

void CudaMgr_Namespace::CudaMgr::copyDeviceToHost ( int8_t *  host_ptr,
const int8_t *  device_ptr,
const size_t  num_bytes,
CUstream  cuda_stream = 0 
)

Definition at line 143 of file CudaMgr.cpp.

References CHECK_LE, device_mutex_, and setContext().

Referenced by Buffer_Namespace::GpuCudaBuffer::readData(), and Buffer_Namespace::CpuBuffer::writeData().

146  {
147  // set device_num based on device_ptr
148  auto const cu_device_ptr = reinterpret_cast<CUdeviceptr>(device_ptr);
149  {
150  std::lock_guard<std::mutex> device_lock(device_mutex_);
151  auto const [allocation_base, allocation] =
152  getDeviceMemoryAllocationMap().getAllocation(cu_device_ptr);
153  CHECK_LE(cu_device_ptr + num_bytes, allocation_base + allocation.size);
154  setContext(allocation.device_num);
155  }
156  if (!cuda_stream) {
157  checkError(cuMemcpyDtoH(host_ptr, cu_device_ptr, num_bytes));
158  } else {
159  checkError(cuMemcpyDtoHAsync(host_ptr, cu_device_ptr, num_bytes, cuda_stream));
160  checkError(cuStreamSynchronize(cuda_stream));
161  }
162 }

void CudaMgr_Namespace::CudaMgr::copyHostToDevice ( int8_t *  device_ptr,
const int8_t *  host_ptr,
const size_t  num_bytes,
const int  device_num,
CUstream  cuda_stream = 0 
)

Definition at line 127 of file CudaMgr.cpp.

References setContext().

Referenced by Buffer_Namespace::CpuBuffer::readData(), and Buffer_Namespace::GpuCudaBuffer::writeData().

131  {
132  setContext(device_num);
133  if (!cuda_stream) {
134  checkError(
135  cuMemcpyHtoD(reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes));
136  } else {
137  checkError(cuMemcpyHtoDAsync(
138  reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes, cuda_stream));
139  checkError(cuStreamSynchronize(cuda_stream));
140  }
141 }
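
copyHostToDevice and copyDeviceToHost give a synchronous round trip: with no stream they use the blocking driver copies, and with a stream they issue the async variant and then synchronize that stream. A hedged round-trip sketch; mgr and the buffer size are illustrative:

#include <CudaMgr.h>

#include <cstdint>
#include <vector>

// Sketch: copy a host buffer to device 0 and read it back.
void round_trip(CudaMgr_Namespace::CudaMgr& mgr) {
  std::vector<int8_t> src(1024, 42);
  std::vector<int8_t> dst(1024, 0);

  int8_t* dev_buf = mgr.allocateDeviceMem(src.size(), /*device_num=*/0);
  mgr.copyHostToDevice(dev_buf, src.data(), src.size(), /*device_num=*/0);
  // copyDeviceToHost derives the device from the pointer via the
  // allocation map, so no device number is passed.
  mgr.copyDeviceToHost(dst.data(), dev_buf, dst.size());
  // dst now matches src.
  mgr.freeDeviceMem(dev_buf);
}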

static std::string CudaMgr_Namespace::CudaMgr::deviceArchToSM ( const NvidiaDeviceArch  arch)
inlinestatic

Definition at line 162 of file CudaMgr.h.

References CudaMgr_Namespace::Ampere, CudaMgr_Namespace::Kepler, LOG, CudaMgr_Namespace::Maxwell, CudaMgr_Namespace::Pascal, CudaMgr_Namespace::Turing, UNREACHABLE, CudaMgr_Namespace::Volta, and logger::WARNING.

162  {
163  // Must match ${CUDA_COMPILATION_ARCH} CMAKE flag
164  switch (arch) {
165  case NvidiaDeviceArch::Kepler:
166  return "sm_35";
167  case NvidiaDeviceArch::Maxwell:
168  return "sm_50";
169  case NvidiaDeviceArch::Pascal:
170  return "sm_60";
171  case NvidiaDeviceArch::Volta:
172  return "sm_70";
173  case NvidiaDeviceArch::Turing:
174  return "sm_75";
175  case NvidiaDeviceArch::Ampere:
176  return "sm_75";
177  default:
178  LOG(WARNING) << "Unrecognized Nvidia device architecture, falling back to "
179  "Kepler-compatibility.";
180  return "sm_35";
181  }
182  UNREACHABLE();
183  return "";
184  }
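
deviceArchToSM maps a detected architecture to the sm_* target string used when JIT-compiling device code. A small sketch combining it with getDeviceArch; mgr is assumed to be a constructed CudaMgr:

#include <CudaMgr.h>

#include <string>

// Sketch: derive the SM target string for the detected architecture.
std::string sm_target(const CudaMgr_Namespace::CudaMgr& mgr) {
  const auto arch = mgr.getDeviceArch();  // e.g. NvidiaDeviceArch::Volta
  return CudaMgr_Namespace::CudaMgr::deviceArchToSM(arch);  // e.g. "sm_70"
}
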
void CudaMgr_Namespace::CudaMgr::freeDeviceMem ( int8_t *  device_ptr)

Definition at line 392 of file CudaMgr.cpp.

References device_mutex_.

Referenced by Buffer_Namespace::GpuCudaBufferMgr::freeAllMem().

392  {
393  // take lock
394  std::lock_guard<std::mutex> map_lock(device_mutex_);
395  // fetch and remove from map
396  auto const cu_device_ptr = reinterpret_cast<CUdeviceptr>(device_ptr);
397  auto allocation = getDeviceMemoryAllocationMap().removeAllocation(cu_device_ptr);
398  // attempt to unmap, release, free
399  auto status_unmap = cuMemUnmap(cu_device_ptr, allocation.size);
400  auto status_release = cuMemRelease(allocation.handle);
401  auto status_free = cuMemAddressFree(cu_device_ptr, allocation.size);
402  // check for errors
403  checkError(status_unmap);
404  checkError(status_release);
405  checkError(status_free);
406  // notify
407  getDeviceMemoryAllocationMap().notifyMapChanged(allocation.device_uuid,
408  allocation.is_slab);
409 }

void CudaMgr_Namespace::CudaMgr::freePinnedHostMem ( int8_t *  host_ptr)

Definition at line 74 of file CudaMgrNoCuda.cpp.

References CHECK.

74  {
75  CHECK(false);
76 }
const std::vector<DeviceProperties>& CudaMgr_Namespace::CudaMgr::getAllDeviceProperties ( ) const
inline

Definition at line 134 of file CudaMgr.h.

References device_properties_.

Referenced by Executor::blockSize(), Executor::deviceCycles(), and Executor::warpSize().

134  {
135  return device_properties_;
136  }

int CudaMgr_Namespace::CudaMgr::getContext ( ) const

Definition at line 517 of file CudaMgr.cpp.

References device_contexts_.

Referenced by QueryEngine::getCudaStream(), and QueryEngine::QueryEngine().

517  {
518  CUcontext cnow;
519  checkError(cuCtxGetCurrent(&cnow));
520  if (cnow == NULL) {
521  throw std::runtime_error("no cuda device context");
522  }
523  int device_num{0};
524  for (auto& c : device_contexts_) {
525  if (c == cnow) {
526  return device_num;
527  }
528  ++device_num;
529  }
530  // TODO(sy): Change device_contexts_ to have O(1) lookup? (Or maybe not worth it.)
531  throw std::runtime_error("invalid cuda device context");
532 }
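
getContext reverse-maps the thread's current CUcontext to a logical device number by scanning device_contexts_, throwing if no context is bound or the bound context is not managed by this CudaMgr. A small sketch of the setContext/getContext round trip; mgr is assumed to manage at least one device:

#include <CudaMgr.h>

#include <cassert>

// Sketch: setContext binds a managed context; getContext recovers its index.
void context_round_trip(const CudaMgr_Namespace::CudaMgr& mgr) {
  mgr.setContext(0);              // bind device 0's context to this thread
  assert(mgr.getContext() == 0);  // reverse lookup via device_contexts_
}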

NvidiaDeviceArch CudaMgr_Namespace::CudaMgr::getDeviceArch ( ) const
inline

Definition at line 186 of file CudaMgr.h.

References CudaMgr_Namespace::Ampere, device_properties_, CudaMgr_Namespace::Kepler, CudaMgr_Namespace::Maxwell, CudaMgr_Namespace::Pascal, CudaMgr_Namespace::Turing, and CudaMgr_Namespace::Volta.

186  {
187  if (device_properties_.size() > 0) {
188  const auto& device_properties = device_properties_.front();
189  switch (device_properties.computeMajor) {
190  case 3:
191  return NvidiaDeviceArch::Kepler;
192  case 5:
193  return NvidiaDeviceArch::Maxwell;
194  case 6:
195  return NvidiaDeviceArch::Pascal;
196  case 7:
197  if (device_properties.computeMinor < 5) {
198  return NvidiaDeviceArch::Volta;
199  } else {
200  return NvidiaDeviceArch::Turing;
201  }
202  case 8:
203  return NvidiaDeviceArch::Ampere;
204  default:
205  return NvidiaDeviceArch::Kepler;
206  }
207  } else {
208  // always fallback to Kepler if an architecture cannot be detected
209  return NvidiaDeviceArch::Kepler;
210  }
211  }
int CudaMgr_Namespace::CudaMgr::getDeviceCount ( ) const
inline

Definition at line 90 of file CudaMgr.h.

References device_count_.

Referenced by Executor::deviceCount(), get_available_gpus(), isArchMaxwell(), isArchMaxwellOrLater(), isArchPascal(), isArchPascalOrLater(), and QueryEngine::QueryEngine().

90 { return device_count_; }


const heavyai::DeviceGroup& CudaMgr_Namespace::CudaMgr::getDeviceGroup ( ) const
inline

Definition at line 92 of file CudaMgr.h.

References device_group_.

92 { return device_group_; }
const DeviceProperties* CudaMgr_Namespace::CudaMgr::getDeviceProperties ( const size_t  device_num) const
inline

Definition at line 137 of file CudaMgr.h.

References device_properties_, and to_string().

Referenced by allocateDeviceMem().

137  {
138  // device_num is the device number relative to start_gpu_ (real_device_num -
139  // start_gpu_)
140  if (device_num < device_properties_.size()) {
141  return &device_properties_[device_num];
142  }
143  throw std::runtime_error("Specified device number " + std::to_string(device_num) +
144  " is out of range of number of devices (" +
145  std::to_string(device_properties_.size()) + ")");
146  }

size_t CudaMgr_Namespace::CudaMgr::getGranularity ( const int  device_num) const

Definition at line 109 of file CudaMgr.cpp.

Referenced by allocateDeviceMem().

109  {
110  CUmemAllocationProp allocation_prop{};
111  allocation_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
112  allocation_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
113  allocation_prop.location.id = device_num;
114  size_t granularity{};
115  checkError(cuMemGetAllocationGranularity(
116  &granularity, &allocation_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
117  return granularity;
118 }

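getGranularity returns the recommended allocation granularity for a device; allocateDeviceMem feeds it into computePaddedBufferSize before reserving address space. The same pairing can predict how much memory a request will actually reserve; a hedged sketch with an assumed constructed mgr:

#include <CudaMgr.h>

#include <cstddef>

// Sketch: predict the padded size allocateDeviceMem will reserve.
size_t predicted_reservation(const CudaMgr_Namespace::CudaMgr& mgr,
                             size_t requested_bytes,
                             int device_num) {
  const size_t granularity = mgr.getGranularity(device_num);
  return mgr.computePaddedBufferSize(requested_bytes, granularity);
}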

size_t CudaMgr_Namespace::CudaMgr::getMinNumMPsForAllDevices ( ) const
inline

Definition at line 132 of file CudaMgr.h.

References min_num_mps_for_all_devices.

132 { return min_num_mps_for_all_devices; }
size_t CudaMgr_Namespace::CudaMgr::getMinSharedMemoryPerBlockForAllDevices ( ) const
inline

Definition at line 128 of file CudaMgr.h.

References min_shared_memory_per_block_for_all_devices.

128  {
129  return min_shared_memory_per_block_for_all_devices;
130  }
int CudaMgr_Namespace::CudaMgr::getStartGpu ( ) const
inline

Definition at line 91 of file CudaMgr.h.

References start_gpu_.

91 { return start_gpu_; }
bool CudaMgr_Namespace::CudaMgr::isArchMaxwell ( ) const
inline

Definition at line 147 of file CudaMgr.h.

References device_properties_, and getDeviceCount().

147  {
148  return (getDeviceCount() > 0 && device_properties_[0].computeMajor == 5);
149  }

bool CudaMgr_Namespace::CudaMgr::isArchMaxwellOrLater ( ) const
inline

Definition at line 150 of file CudaMgr.h.

References device_properties_, and getDeviceCount().

150  {
151  return (getDeviceCount() > 0 && device_properties_[0].computeMajor >= 5);
152  }

bool CudaMgr_Namespace::CudaMgr::isArchMaxwellOrLaterForAll ( ) const

Returns true if all devices have the Maxwell microarchitecture or later; returns false if any device has a compute capability below 5.0.

Definition at line 437 of file CudaMgr.cpp.

References device_count_, and device_properties_.

437  {
438  for (int i = 0; i < device_count_; i++) {
439  if (device_properties_[i].computeMajor < 5) {
440  return false;
441  }
442  }
443  return true;
444 }
bool CudaMgr_Namespace::CudaMgr::isArchPascal ( ) const
inline

Definition at line 153 of file CudaMgr.h.

References device_properties_, and getDeviceCount().

153  {
154  return (getDeviceCount() > 0 && device_properties_[0].computeMajor == 6);
155  }

bool CudaMgr_Namespace::CudaMgr::isArchPascalOrLater ( ) const
inline

Definition at line 156 of file CudaMgr.h.

References device_properties_, and getDeviceCount().

Referenced by Executor::isArchPascalOrLater().

156  {
157  return (getDeviceCount() > 0 && device_properties_[0].computeMajor >= 6);
158  }

bool CudaMgr_Namespace::CudaMgr::isArchVoltaOrGreaterForAll ( ) const

Returns true if all devices have the Volta microarchitecture or later; returns false if any pre-Volta device is present.

Definition at line 450 of file CudaMgr.cpp.

References device_count_, and device_properties_.

450  {
451  for (int i = 0; i < device_count_; i++) {
452  if (device_properties_[i].computeMajor < 7) {
453  return false;
454  }
455  }
456  return true;
457 }
void CudaMgr_Namespace::CudaMgr::setContext ( const int  device_num) const

Definition at line 511 of file CudaMgr.cpp.

References CHECK_LT, and device_contexts_.

Referenced by allocateDeviceMem(), allocatePinnedHostMem(), copyDeviceToDevice(), copyDeviceToHost(), copyHostToDevice(), CudaMgr(), QueryEngine::QueryEngine(), setDeviceMem(), and synchronizeDevices().

511  {
512  // deviceNum is the device number relative to startGpu (realDeviceNum - startGpu_)
513  CHECK_LT(device_num, device_count_);
514  cuCtxSetCurrent(device_contexts_[device_num]);
515 }

void CudaMgr_Namespace::CudaMgr::setDeviceMem ( int8_t *  device_ptr,
const unsigned char  uc,
const size_t  num_bytes,
const int  device_num,
CUstream  cuda_stream = 0 
)

Definition at line 418 of file CudaMgr.cpp.

References setContext().

Referenced by zeroDeviceMem().

422  {
423  setContext(device_num);
424  if (!cuda_stream) {
425  checkError(cuMemsetD8(reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes));
426  } else {
427  checkError(cuMemsetD8Async(
428  reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes, cuda_stream));
429  checkError(cuStreamSynchronize(cuda_stream));
430  }
431 }

void CudaMgr_Namespace::CudaMgr::synchronizeDevices ( ) const

Definition at line 120 of file CudaMgr.cpp.

References device_count_, and setContext().

Referenced by ~CudaMgr(), and Buffer_Namespace::GpuCudaBufferMgr::~GpuCudaBufferMgr().

120  {
121  for (int d = 0; d < device_count_; ++d) {
122  setContext(d);
123  checkError(cuCtxSynchronize());
124  }
125 }

void CudaMgr_Namespace::CudaMgr::zeroDeviceMem ( int8_t *  device_ptr,
const size_t  num_bytes,
const int  device_num,
CUstream  cuda_stream = 0 
)

Definition at line 411 of file CudaMgr.cpp.

References setDeviceMem().

414  {
415  setDeviceMem(device_ptr, 0, num_bytes, device_num, cuda_stream);
416 }
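
zeroDeviceMem is a thin wrapper that forwards to setDeviceMem with a fill byte of 0. A short usage sketch; mgr and dev_buf are assumed to be a constructed CudaMgr and a pointer into one of its device allocations:

#include <CudaMgr.h>

#include <cstddef>
#include <cstdint>

// Sketch: fill a device buffer with 0xFF, then reset it to zeros.
void fill_then_zero(CudaMgr_Namespace::CudaMgr& mgr,
                    int8_t* dev_buf,
                    size_t num_bytes,
                    int device_num) {
  mgr.setDeviceMem(dev_buf, 0xFF, num_bytes, device_num);  // fill bytes
  mgr.zeroDeviceMem(dev_buf, num_bytes, device_num);       // back to zeros
}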

Member Data Documentation

std::vector<CUcontext> CudaMgr_Namespace::CudaMgr::device_contexts_
private

Definition at line 267 of file CudaMgr.h.

Referenced by copyDeviceToDevice(), getContext(), setContext(), and ~CudaMgr().

int CudaMgr_Namespace::CudaMgr::device_count_
private

Referenced by CudaMgr(), getDeviceCount(), isArchMaxwellOrLaterForAll(), isArchVoltaOrGreaterForAll(), synchronizeDevices(), and ~CudaMgr().

heavyai::DeviceGroup CudaMgr_Namespace::CudaMgr::device_group_
private

Definition at line 266 of file CudaMgr.h.

Referenced by CudaMgr(), and getDeviceGroup().

std::mutex CudaMgr_Namespace::CudaMgr::device_mutex_
mutableprivate

Definition at line 268 of file CudaMgr.h.

Referenced by allocateDeviceMem(), copyDeviceToHost(), freeDeviceMem(), and ~CudaMgr().

size_t CudaMgr_Namespace::CudaMgr::min_num_mps_for_all_devices
private

Definition at line 264 of file CudaMgr.h.

Referenced by getMinNumMPsForAllDevices().

size_t CudaMgr_Namespace::CudaMgr::min_shared_memory_per_block_for_all_devices
private

Definition at line 263 of file CudaMgr.h.

Referenced by getMinSharedMemoryPerBlockForAllDevices().

int CudaMgr_Namespace::CudaMgr::start_gpu_
private

Definition at line 262 of file CudaMgr.h.

Referenced by allocateDeviceMem(), CudaMgr(), and getStartGpu().


The documentation for this class was generated from the following files:

CudaMgr.h
CudaMgr.cpp
CudaMgrNoCuda.cpp