OmniSciDB  72c90bc290
CudaMgr_Namespace::CudaMgr Class Reference

#include <CudaMgr.h>

Public Member Functions

 CudaMgr (const int num_gpus, const int start_gpu=0)
 
virtual ~CudaMgr ()
 
void synchronizeDevices () const
 
int getDeviceCount () const
 
int getStartGpu () const
 
const heavyai::DeviceGroup & getDeviceGroup () const
 
size_t computePaddedBufferSize (size_t buf_size, size_t granularity) const
 
size_t getGranularity (const int device_num) const
 
void copyHostToDevice (int8_t *device_ptr, const int8_t *host_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
 
void copyDeviceToHost (int8_t *host_ptr, const int8_t *device_ptr, const size_t num_bytes, CUstream cuda_stream=0)
 
void copyDeviceToDevice (int8_t *dest_ptr, int8_t *src_ptr, const size_t num_bytes, const int dest_device_num, const int src_device_num, CUstream cuda_stream=0)
 
int8_t * allocatePinnedHostMem (const size_t num_bytes)
 
virtual int8_t * allocateDeviceMem (const size_t num_bytes, const int device_num, const bool is_slab=false)
 
void freePinnedHostMem (int8_t *host_ptr)
 
void freeDeviceMem (int8_t *device_ptr)
 
void zeroDeviceMem (int8_t *device_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
 
void setDeviceMem (int8_t *device_ptr, const unsigned char uc, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
 
size_t getMinSharedMemoryPerBlockForAllDevices () const
 
size_t getMinNumMPsForAllDevices () const
 
const std::vector< DeviceProperties > & getAllDeviceProperties () const
 
const DeviceProperties * getDeviceProperties (const size_t device_num) const
 
bool isArchMaxwell () const
 
bool isArchMaxwellOrLater () const
 
bool isArchPascal () const
 
bool isArchPascalOrLater () const
 
bool isArchMaxwellOrLaterForAll () const
 
bool isArchVoltaOrGreaterForAll () const
 
NvidiaDeviceArch getDeviceArch () const
 
void setContext (const int device_num) const
 
int getContext () const
 

Static Public Member Functions

static std::string deviceArchToSM (const NvidiaDeviceArch arch)
 

Private Attributes

int device_count_
 
int start_gpu_
 
size_t min_shared_memory_per_block_for_all_devices
 
size_t min_num_mps_for_all_devices
 
std::vector< DeviceProperties > device_properties_
 
heavyai::DeviceGroup device_group_
 
std::vector< CUcontext > device_contexts_
 
std::mutex device_mutex_
 

Detailed Description

Definition at line 84 of file CudaMgr.h.

Constructor & Destructor Documentation

CudaMgr_Namespace::CudaMgr::CudaMgr ( const int  num_gpus,
const int  start_gpu = 0 
)

Definition at line 48 of file CudaMgr.cpp.

References CHECK_EQ, device_count_, device_group_, device_properties_, logger::INFO, LOG, nvidia_jit_warmup(), setContext(), and start_gpu_.

49  : start_gpu_(start_gpu)
50  , min_shared_memory_per_block_for_all_devices(0)
51  , min_num_mps_for_all_devices(0)
52  , device_memory_allocation_map_{std::make_unique<DeviceMemoryAllocationMap>()} {
53  checkError(cuInit(0));
54  checkError(cuDeviceGetCount(&device_count_));
55 
56  if (num_gpus > 0) { // numGpus <= 0 will just use number of gpus found
57  device_count_ = std::min(device_count_, num_gpus);
58  } else {
59  // if we are using all gpus we cannot start on a gpu other than 0
60  CHECK_EQ(start_gpu_, 0);
61  }
62  fillDeviceProperties();
63  initDeviceGroup();
64  createDeviceContexts();
65  logDeviceProperties();
66 
67  // warm up the GPU JIT
68  LOG(INFO) << "Warming up the GPU JIT Compiler... (this may take several seconds)";
69  setContext(0);
70  nvidia_jit_warmup();
71  LOG(INFO) << "GPU JIT Compiler initialized.";
72 }
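
The constructor initializes the CUDA driver (cuInit), clamps the detected device count to num_gpus when positive, builds one context per device, and warms up the GPU JIT compiler. A minimal construction sketch, assuming a CUDA-enabled build and that CudaMgr.h is on the include path:

#include <CudaMgr.h>

#include <memory>

int main() {
  // num_gpus <= 0 uses every GPU found; in that case start_gpu must be 0.
  auto cuda_mgr = std::make_unique<CudaMgr_Namespace::CudaMgr>(/*num_gpus=*/0);

  // Alternatively, manage two devices beginning at physical device 1:
  //   CudaMgr_Namespace::CudaMgr mgr(/*num_gpus=*/2, /*start_gpu=*/1);

  return cuda_mgr->getDeviceCount() > 0 ? 0 : 1;
}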

CudaMgr_Namespace::CudaMgr::~CudaMgr ( )
virtual

Definition at line 81 of file CudaMgr.cpp.

References CHECK, device_contexts_, device_count_, device_mutex_, logger::ERROR, LOG, and synchronizeDevices().

81  {
82  try {
83  // We don't want to remove the cudaMgr before all other processes have cleaned up.
84  // This should be enforced by the lifetime policies, but take this lock to be safe.
85  std::lock_guard<std::mutex> device_lock(device_mutex_);
86  synchronizeDevices();
87 
88  CHECK(getDeviceMemoryAllocationMap().mapEmpty());
89  device_memory_allocation_map_ = nullptr;
90 
91  for (int d = 0; d < device_count_; ++d) {
92  checkError(cuCtxDestroy(device_contexts_[d]));
93  }
94  } catch (const CudaErrorException& e) {
95  if (e.getStatus() == CUDA_ERROR_DEINITIALIZED) {
96  // TODO(adb / asuhan): Verify cuModuleUnload removes the context
97  return;
98  }
99  LOG(ERROR) << "CUDA Error: " << e.what();
100  } catch (const std::runtime_error& e) {
101  LOG(ERROR) << "CUDA Error: " << e.what();
102  }
103 }

Member Function Documentation

int8_t * CudaMgr_Namespace::CudaMgr::allocateDeviceMem ( const size_t  num_bytes,
const int  device_num,
const bool  is_slab = false 
)
virtual

Definition at line 333 of file CudaMgr.cpp.

References computePaddedBufferSize(), device_mutex_, getDeviceProperties(), getGranularity(), setContext(), start_gpu_, and CudaMgr_Namespace::DeviceProperties::uuid.

Referenced by Buffer_Namespace::GpuCudaBufferMgr::addSlab().

335  {
336  std::lock_guard<std::mutex> map_lock(device_mutex_);
337  setContext(device_num);
338 
339  CUdeviceptr device_ptr{};
340  CUmemGenericAllocationHandle handle{};
341  auto granularity = getGranularity(device_num);
342  // reserve the actual memory
343  auto padded_num_bytes = computePaddedBufferSize(num_bytes, granularity);
344  auto status = cuMemAddressReserve(&device_ptr, padded_num_bytes, granularity, 0, 0);
345 
346  if (status == CUDA_SUCCESS) {
347  // create a handle for the allocation
348  CUmemAllocationProp allocation_prop{};
349  allocation_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
350  allocation_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
351  allocation_prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
352  allocation_prop.location.id = device_num + start_gpu_;
353  status = cuMemCreate(&handle, padded_num_bytes, &allocation_prop, 0);
354 
355  if (status == CUDA_SUCCESS) {
356  // map the memory
357  status = cuMemMap(device_ptr, padded_num_bytes, 0, handle, 0);
358 
359  if (status == CUDA_SUCCESS) {
360  // set the memory access
361  CUmemAccessDesc access_desc{};
362  access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
363  access_desc.location.id = device_num + start_gpu_;
364  access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
365  status = cuMemSetAccess(device_ptr, padded_num_bytes, &access_desc, 1);
366  }
367  }
368  }
369 
370  if (status != CUDA_SUCCESS) {
371  // clean up in reverse order
372  if (device_ptr && handle) {
373  cuMemUnmap(device_ptr, padded_num_bytes);
374  }
375  if (handle) {
376  cuMemRelease(handle);
377  }
378  if (device_ptr) {
379  cuMemAddressFree(device_ptr, padded_num_bytes);
380  }
381  throw CudaErrorException(status);
382  }
383  // emplace in the map
384  auto const& device_uuid = getDeviceProperties(device_num)->uuid;
385  getDeviceMemoryAllocationMap().addAllocation(
386  device_ptr, padded_num_bytes, handle, device_uuid, device_num, is_slab);
387  // notify
388  getDeviceMemoryAllocationMap().notifyMapChanged(device_uuid, is_slab);
389  return reinterpret_cast<int8_t*>(device_ptr);
390 }
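
allocateDeviceMem drives the CUDA virtual memory management API: it reserves an address range (cuMemAddressReserve), creates a physical allocation (cuMemCreate), maps it (cuMemMap), and grants read/write access (cuMemSetAccess), rolling the steps back in reverse order if any of them fails. A hedged usage sketch pairing it with zeroDeviceMem and freeDeviceMem; mgr, the size, and the device index are illustrative:

#include <CudaMgr.h>

#include <cstddef>
#include <cstdint>

// Sketch: allocate, clear, and free device memory through CudaMgr.
void demo_allocation(CudaMgr_Namespace::CudaMgr& mgr) {
  const size_t num_bytes = 64 * 1024 * 1024;  // request; padded internally
  const int device_num = 0;                   // relative to start_gpu_

  int8_t* dev_buf = mgr.allocateDeviceMem(num_bytes, device_num);
  mgr.zeroDeviceMem(dev_buf, num_bytes, device_num);

  // ... use the buffer ...

  // freeDeviceMem looks the pointer up in the allocation map, then
  // unmaps, releases, and frees the reserved address range.
  mgr.freeDeviceMem(dev_buf);
}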

int8_t * CudaMgr_Namespace::CudaMgr::allocatePinnedHostMem ( const size_t  num_bytes)

Definition at line 326 of file CudaMgr.cpp.

References setContext().

326  {
327  setContext(0);
328  void* host_ptr;
329  checkError(cuMemHostAlloc(&host_ptr, num_bytes, CU_MEMHOSTALLOC_PORTABLE));
330  return reinterpret_cast<int8_t*>(host_ptr);
331 }

size_t CudaMgr_Namespace::CudaMgr::computePaddedBufferSize ( size_t  buf_size,
size_t  granularity 
) const

Definition at line 105 of file CudaMgr.cpp.

Referenced by allocateDeviceMem().

105  {
106  return (((buf_size + (granularity - 1)) / granularity) * granularity);
107 }

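The formula rounds buf_size up to the next multiple of granularity. A quick standalone check of the same arithmetic (values illustrative):

#include <cassert>
#include <cstddef>

// Same round-up-to-a-multiple formula used by computePaddedBufferSize.
size_t padded(size_t buf_size, size_t granularity) {
  return ((buf_size + (granularity - 1)) / granularity) * granularity;
}

int main() {
  const size_t two_mib = 2 * 1024 * 1024;
  assert(padded(5 * 1024 * 1024, two_mib) == 6 * 1024 * 1024);  // rounds up
  assert(padded(4 * 1024 * 1024, two_mib) == 4 * 1024 * 1024);  // already aligned
  assert(padded(1, two_mib) == two_mib);  // minimum of one granule
}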

void CudaMgr_Namespace::CudaMgr::copyDeviceToDevice ( int8_t *  dest_ptr,
int8_t *  src_ptr,
const size_t  num_bytes,
const int  dest_device_num,
const int  src_device_num,
CUstream  cuda_stream = 0 
)

Definition at line 164 of file CudaMgr.cpp.

References device_contexts_, and setContext().

Referenced by Buffer_Namespace::GpuCudaBuffer::readData(), and Buffer_Namespace::GpuCudaBuffer::writeData().

169  {
170  // dest_device_num and src_device_num are the device numbers relative to start_gpu_
171  // (real_device_num - start_gpu_)
172  if (src_device_num == dest_device_num) {
173  setContext(src_device_num);
174  if (!cuda_stream) {
175  checkError(cuMemcpy(reinterpret_cast<CUdeviceptr>(dest_ptr),
176  reinterpret_cast<CUdeviceptr>(src_ptr),
177  num_bytes));
178  } else {
179  checkError(cuMemcpyAsync(reinterpret_cast<CUdeviceptr>(dest_ptr),
180  reinterpret_cast<CUdeviceptr>(src_ptr),
181  num_bytes,
182  cuda_stream));
183  checkError(cuStreamSynchronize(cuda_stream));
184  }
185  } else {
186  if (!cuda_stream) {
187  checkError(cuMemcpyPeer(reinterpret_cast<CUdeviceptr>(dest_ptr),
188  device_contexts_[dest_device_num],
189  reinterpret_cast<CUdeviceptr>(src_ptr),
190  device_contexts_[src_device_num],
191  num_bytes)); // will we always have peer?
192  } else {
193  checkError(cuMemcpyPeerAsync(reinterpret_cast<CUdeviceptr>(dest_ptr),
194  device_contexts_[dest_device_num],
195  reinterpret_cast<CUdeviceptr>(src_ptr),
196  device_contexts_[src_device_num],
197  num_bytes,
198  cuda_stream)); // will we always have peer?
199  checkError(cuStreamSynchronize(cuda_stream));
200  }
201  }
202 }

void CudaMgr_Namespace::CudaMgr::copyDeviceToHost ( int8_t *  host_ptr,
const int8_t *  device_ptr,
const size_t  num_bytes,
CUstream  cuda_stream = 0 
)

Definition at line 143 of file CudaMgr.cpp.

References CHECK_LE, device_mutex_, and setContext().

Referenced by Buffer_Namespace::GpuCudaBuffer::readData(), and Buffer_Namespace::CpuBuffer::writeData().

146  {
147  // set device_num based on device_ptr
148  auto const cu_device_ptr = reinterpret_cast<CUdeviceptr>(device_ptr);
149  {
150  std::lock_guard<std::mutex> device_lock(device_mutex_);
151  auto const [allocation_base, allocation] =
152  getDeviceMemoryAllocationMap().getAllocation(cu_device_ptr);
153  CHECK_LE(cu_device_ptr + num_bytes, allocation_base + allocation.size);
154  setContext(allocation.device_num);
155  }
156  if (!cuda_stream) {
157  checkError(cuMemcpyDtoH(host_ptr, cu_device_ptr, num_bytes));
158  } else {
159  checkError(cuMemcpyDtoHAsync(host_ptr, cu_device_ptr, num_bytes, cuda_stream));
160  checkError(cuStreamSynchronize(cuda_stream));
161  }
162 }

void CudaMgr_Namespace::CudaMgr::copyHostToDevice ( int8_t *  device_ptr,
const int8_t *  host_ptr,
const size_t  num_bytes,
const int  device_num,
CUstream  cuda_stream = 0 
)

Definition at line 127 of file CudaMgr.cpp.

References setContext().

Referenced by Buffer_Namespace::CpuBuffer::readData(), and Buffer_Namespace::GpuCudaBuffer::writeData().

131  {
132  setContext(device_num);
133  if (!cuda_stream) {
134  checkError(
135  cuMemcpyHtoD(reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes));
136  } else {
137  checkError(cuMemcpyHtoDAsync(
138  reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes, cuda_stream));
139  checkError(cuStreamSynchronize(cuda_stream));
140  }
141 }
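
copyHostToDevice and copyDeviceToHost give a synchronous round trip: with no stream they use the blocking driver copies, and with a stream they issue the async variant and then synchronize that stream. A hedged round-trip sketch; mgr and the buffer size are illustrative:

#include <CudaMgr.h>

#include <cstdint>
#include <vector>

// Sketch: copy a host buffer to device 0 and read it back.
void round_trip(CudaMgr_Namespace::CudaMgr& mgr) {
  std::vector<int8_t> src(1024, 42);
  std::vector<int8_t> dst(1024, 0);

  int8_t* dev_buf = mgr.allocateDeviceMem(src.size(), /*device_num=*/0);
  mgr.copyHostToDevice(dev_buf, src.data(), src.size(), /*device_num=*/0);
  // copyDeviceToHost derives the device from the pointer via the
  // allocation map, so no device number is passed.
  mgr.copyDeviceToHost(dst.data(), dev_buf, dst.size());
  // dst now matches src.
  mgr.freeDeviceMem(dev_buf);
}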

static std::string CudaMgr_Namespace::CudaMgr::deviceArchToSM ( const NvidiaDeviceArch  arch)
inlinestatic

Definition at line 162 of file CudaMgr.h.

References CudaMgr_Namespace::Ampere, CudaMgr_Namespace::Kepler, LOG, CudaMgr_Namespace::Maxwell, CudaMgr_Namespace::Pascal, CudaMgr_Namespace::Turing, UNREACHABLE, CudaMgr_Namespace::Volta, and logger::WARNING.

162  {
163  // Must match ${CUDA_COMPILATION_ARCH} CMAKE flag
164  switch (arch) {
165  case NvidiaDeviceArch::Kepler:
166  return "sm_35";
167  case NvidiaDeviceArch::Maxwell:
168  return "sm_50";
169  case NvidiaDeviceArch::Pascal:
170  return "sm_60";
171  case NvidiaDeviceArch::Volta:
172  return "sm_70";
173  case NvidiaDeviceArch::Turing:
174  return "sm_75";
175  case NvidiaDeviceArch::Ampere:
176  return "sm_75";
177  default:
178  LOG(WARNING) << "Unrecognized Nvidia device architecture, falling back to "
179  "Kepler-compatibility.";
180  return "sm_35";
181  }
182  UNREACHABLE();
183  return "";
184  }
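
deviceArchToSM maps a detected architecture to the sm_* target string used when JIT-compiling device code. A small sketch combining it with getDeviceArch; mgr is assumed to be a constructed CudaMgr:

#include <CudaMgr.h>

#include <string>

// Sketch: derive the SM target string for the detected architecture.
std::string sm_target(const CudaMgr_Namespace::CudaMgr& mgr) {
  const auto arch = mgr.getDeviceArch();  // e.g. NvidiaDeviceArch::Volta
  return CudaMgr_Namespace::CudaMgr::deviceArchToSM(arch);  // e.g. "sm_70"
}
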
void CudaMgr_Namespace::CudaMgr::freeDeviceMem ( int8_t *  device_ptr)

Definition at line 392 of file CudaMgr.cpp.

References device_mutex_.

Referenced by Buffer_Namespace::GpuCudaBufferMgr::freeAllMem().

392  {
393  // take lock
394  std::lock_guard<std::mutex> map_lock(device_mutex_);
395  // fetch and remove from map
396  auto const cu_device_ptr = reinterpret_cast<CUdeviceptr>(device_ptr);
397  auto allocation = getDeviceMemoryAllocationMap().removeAllocation(cu_device_ptr);
398  // attempt to unmap, release, free
399  auto status_unmap = cuMemUnmap(cu_device_ptr, allocation.size);
400  auto status_release = cuMemRelease(allocation.handle);
401  auto status_free = cuMemAddressFree(cu_device_ptr, allocation.size);
402  // check for errors
403  checkError(status_unmap);
404  checkError(status_release);
405  checkError(status_free);
406  // notify
407  getDeviceMemoryAllocationMap().notifyMapChanged(allocation.device_uuid,
408  allocation.is_slab);
409 }

void CudaMgr_Namespace::CudaMgr::freePinnedHostMem ( int8_t *  host_ptr)

Definition at line 74 of file CudaMgrNoCuda.cpp.

References CHECK.

74  {
75  CHECK(false);
76 }
const std::vector<DeviceProperties>& CudaMgr_Namespace::CudaMgr::getAllDeviceProperties ( ) const
inline

Definition at line 134 of file CudaMgr.h.

References device_properties_.

Referenced by Executor::blockSize(), Executor::deviceCycles(), and Executor::warpSize().

134  {
135  return device_properties_;
136  }

int CudaMgr_Namespace::CudaMgr::getContext ( ) const

Definition at line 517 of file CudaMgr.cpp.

References device_contexts_.

Referenced by QueryEngine::getCudaStream(), and QueryEngine::QueryEngine().

517  {
518  CUcontext cnow;
519  checkError(cuCtxGetCurrent(&cnow));
520  if (cnow == NULL) {
521  throw std::runtime_error("no cuda device context");
522  }
523  int device_num{0};
524  for (auto& c : device_contexts_) {
525  if (c == cnow) {
526  return device_num;
527  }
528  ++device_num;
529  }
530  // TODO(sy): Change device_contexts_ to have O(1) lookup? (Or maybe not worth it.)
531  throw std::runtime_error("invalid cuda device context");
532 }
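
getContext reverse-maps the thread's current CUcontext to a logical device number by scanning device_contexts_, throwing if no context is bound or the bound context is not managed by this CudaMgr. A small sketch of the setContext/getContext round trip; mgr is assumed to manage at least one device:

#include <CudaMgr.h>

#include <cassert>

// Sketch: setContext binds a managed context; getContext recovers its index.
void context_round_trip(const CudaMgr_Namespace::CudaMgr& mgr) {
  mgr.setContext(0);              // bind device 0's context to this thread
  assert(mgr.getContext() == 0);  // reverse lookup via device_contexts_
}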

NvidiaDeviceArch CudaMgr_Namespace::CudaMgr::getDeviceArch ( ) const
inline

Definition at line 186 of file CudaMgr.h.

References CudaMgr_Namespace::Ampere, device_properties_, CudaMgr_Namespace::Kepler, CudaMgr_Namespace::Maxwell, CudaMgr_Namespace::Pascal, CudaMgr_Namespace::Turing, and CudaMgr_Namespace::Volta.

186  {
187  if (device_properties_.size() > 0) {
188  const auto& device_properties = device_properties_.front();
189  switch (device_properties.computeMajor) {
190  case 3:
191  return NvidiaDeviceArch::Kepler;
192  case 5:
193  return NvidiaDeviceArch::Maxwell;
194  case 6:
195  return NvidiaDeviceArch::Pascal;
196  case 7:
197  if (device_properties.computeMinor < 5) {
198  return NvidiaDeviceArch::Volta;
199  } else {
200  return NvidiaDeviceArch::Turing;
201  }
202  case 8:
203  return NvidiaDeviceArch::Ampere;
204  default:
205  return NvidiaDeviceArch::Kepler;
206  }
207  } else {
208  // always fallback to Kepler if an architecture cannot be detected
209  return NvidiaDeviceArch::Kepler;
210  }
211  }
int CudaMgr_Namespace::CudaMgr::getDeviceCount ( ) const
inline

Definition at line 90 of file CudaMgr.h.

References device_count_.

Referenced by Executor::deviceCount(), get_available_gpus(), isArchMaxwell(), isArchMaxwellOrLater(), isArchPascal(), isArchPascalOrLater(), and QueryEngine::QueryEngine().

90 { return device_count_; }


const heavyai::DeviceGroup& CudaMgr_Namespace::CudaMgr::getDeviceGroup ( ) const
inline

Definition at line 92 of file CudaMgr.h.

References device_group_.

92 { return device_group_; }
const DeviceProperties* CudaMgr_Namespace::CudaMgr::getDeviceProperties ( const size_t  device_num) const
inline

Definition at line 137 of file CudaMgr.h.

References device_properties_, and to_string().

Referenced by allocateDeviceMem().

137  {
138  // device_num is the device number relative to start_gpu_ (real_device_num -
139  // start_gpu_)
140  if (device_num < device_properties_.size()) {
141  return &device_properties_[device_num];
142  }
143  throw std::runtime_error("Specified device number " + std::to_string(device_num) +
144  " is out of range of number of devices (" +
145  std::to_string(device_properties_.size()) + ")");
146  }

size_t CudaMgr_Namespace::CudaMgr::getGranularity ( const int  device_num) const

Definition at line 109 of file CudaMgr.cpp.

Referenced by allocateDeviceMem().

109  {
110  CUmemAllocationProp allocation_prop{};
111  allocation_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
112  allocation_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
113  allocation_prop.location.id = device_num;
114  size_t granularity{};
115  checkError(cuMemGetAllocationGranularity(
116  &granularity, &allocation_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
117  return granularity;
118 }

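getGranularity returns the recommended allocation granularity for a device; allocateDeviceMem feeds it into computePaddedBufferSize before reserving address space. The same pairing can predict how much memory a request will actually reserve; a hedged sketch with an assumed constructed mgr:

#include <CudaMgr.h>

#include <cstddef>

// Sketch: predict the padded size allocateDeviceMem will reserve.
size_t predicted_reservation(const CudaMgr_Namespace::CudaMgr& mgr,
                             size_t requested_bytes,
                             int device_num) {
  const size_t granularity = mgr.getGranularity(device_num);
  return mgr.computePaddedBufferSize(requested_bytes, granularity);
}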

size_t CudaMgr_Namespace::CudaMgr::getMinNumMPsForAllDevices ( ) const
inline

Definition at line 132 of file CudaMgr.h.

References min_num_mps_for_all_devices.

132 { return min_num_mps_for_all_devices; }
size_t CudaMgr_Namespace::CudaMgr::getMinSharedMemoryPerBlockForAllDevices ( ) const
inline

Definition at line 128 of file CudaMgr.h.

References min_shared_memory_per_block_for_all_devices.

128  {
129  return min_shared_memory_per_block_for_all_devices;
130  }
int CudaMgr_Namespace::CudaMgr::getStartGpu ( ) const
inline

Definition at line 91 of file CudaMgr.h.

References start_gpu_.

91 { return start_gpu_; }
bool CudaMgr_Namespace::CudaMgr::isArchMaxwell ( ) const
inline

Definition at line 147 of file CudaMgr.h.

References device_properties_, and getDeviceCount().

147  {
148  return (getDeviceCount() > 0 && device_properties_[0].computeMajor == 5);
149  }

bool CudaMgr_Namespace::CudaMgr::isArchMaxwellOrLater ( ) const
inline

Definition at line 150 of file CudaMgr.h.

References device_properties_, and getDeviceCount().

150  {
151  return (getDeviceCount() > 0 && device_properties_[0].computeMajor >= 5);
152  }

bool CudaMgr_Namespace::CudaMgr::isArchMaxwellOrLaterForAll ( ) const

Returns true if all devices have the Maxwell microarchitecture or later; returns false if any device has a compute capability below 5.0.

Definition at line 437 of file CudaMgr.cpp.

References device_count_, and device_properties_.

437  {
438  for (int i = 0; i < device_count_; i++) {
439  if (device_properties_[i].computeMajor < 5) {
440  return false;
441  }
442  }
443  return true;
444 }
bool CudaMgr_Namespace::CudaMgr::isArchPascal ( ) const
inline

Definition at line 153 of file CudaMgr.h.

References device_properties_, and getDeviceCount().

153  {
154  return (getDeviceCount() > 0 && device_properties_[0].computeMajor == 6);
155  }

bool CudaMgr_Namespace::CudaMgr::isArchPascalOrLater ( ) const
inline

Definition at line 156 of file CudaMgr.h.

References device_properties_, and getDeviceCount().

Referenced by Executor::isArchPascalOrLater().

156  {
157  return (getDeviceCount() > 0 && device_properties_[0].computeMajor >= 6);
158  }

bool CudaMgr_Namespace::CudaMgr::isArchVoltaOrGreaterForAll ( ) const

Returns true if all devices have the Volta microarchitecture or later; returns false if any pre-Volta device is present.

Definition at line 450 of file CudaMgr.cpp.

References device_count_, and device_properties_.

450  {
451  for (int i = 0; i < device_count_; i++) {
452  if (device_properties_[i].computeMajor < 7) {
453  return false;
454  }
455  }
456  return true;
457 }
void CudaMgr_Namespace::CudaMgr::setContext ( const int  device_num) const

Definition at line 511 of file CudaMgr.cpp.

References CHECK_LT, and device_contexts_.

Referenced by allocateDeviceMem(), allocatePinnedHostMem(), copyDeviceToDevice(), copyDeviceToHost(), copyHostToDevice(), CudaMgr(), QueryEngine::QueryEngine(), setDeviceMem(), and synchronizeDevices().

511  {
512  // deviceNum is the device number relative to startGpu (realDeviceNum - startGpu_)
513  CHECK_LT(device_num, device_count_);
514  cuCtxSetCurrent(device_contexts_[device_num]);
515 }

void CudaMgr_Namespace::CudaMgr::setDeviceMem ( int8_t *  device_ptr,
const unsigned char  uc,
const size_t  num_bytes,
const int  device_num,
CUstream  cuda_stream = 0 
)

Definition at line 418 of file CudaMgr.cpp.

References setContext().

Referenced by zeroDeviceMem().

422  {
423  setContext(device_num);
424  if (!cuda_stream) {
425  checkError(cuMemsetD8(reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes));
426  } else {
427  checkError(cuMemsetD8Async(
428  reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes, cuda_stream));
429  checkError(cuStreamSynchronize(cuda_stream));
430  }
431 }

void CudaMgr_Namespace::CudaMgr::synchronizeDevices ( ) const

Definition at line 120 of file CudaMgr.cpp.

References device_count_, and setContext().

Referenced by ~CudaMgr(), and Buffer_Namespace::GpuCudaBufferMgr::~GpuCudaBufferMgr().

120  {
121  for (int d = 0; d < device_count_; ++d) {
122  setContext(d);
123  checkError(cuCtxSynchronize());
124  }
125 }

void CudaMgr_Namespace::CudaMgr::zeroDeviceMem ( int8_t *  device_ptr,
const size_t  num_bytes,
const int  device_num,
CUstream  cuda_stream = 0 
)

Definition at line 411 of file CudaMgr.cpp.

References setDeviceMem().

414  {
415  setDeviceMem(device_ptr, 0, num_bytes, device_num, cuda_stream);
416 }
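
zeroDeviceMem is a thin wrapper that forwards to setDeviceMem with a fill byte of 0. A short usage sketch; mgr and dev_buf are assumed to be a constructed CudaMgr and a pointer into one of its device allocations:

#include <CudaMgr.h>

#include <cstddef>
#include <cstdint>

// Sketch: fill a device buffer with 0xFF, then reset it to zeros.
void fill_then_zero(CudaMgr_Namespace::CudaMgr& mgr,
                    int8_t* dev_buf,
                    size_t num_bytes,
                    int device_num) {
  mgr.setDeviceMem(dev_buf, 0xFF, num_bytes, device_num);  // fill bytes
  mgr.zeroDeviceMem(dev_buf, num_bytes, device_num);       // back to zeros
}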

Member Data Documentation

std::vector<CUcontext> CudaMgr_Namespace::CudaMgr::device_contexts_
private

Definition at line 267 of file CudaMgr.h.

Referenced by copyDeviceToDevice(), getContext(), setContext(), and ~CudaMgr().

int CudaMgr_Namespace::CudaMgr::device_count_
private

Referenced by CudaMgr(), getDeviceCount(), isArchMaxwellOrLaterForAll(), isArchVoltaOrGreaterForAll(), synchronizeDevices(), and ~CudaMgr().

heavyai::DeviceGroup CudaMgr_Namespace::CudaMgr::device_group_
private

Definition at line 266 of file CudaMgr.h.

Referenced by CudaMgr(), and getDeviceGroup().

std::mutex CudaMgr_Namespace::CudaMgr::device_mutex_
mutableprivate

Definition at line 268 of file CudaMgr.h.

Referenced by allocateDeviceMem(), copyDeviceToHost(), freeDeviceMem(), and ~CudaMgr().

size_t CudaMgr_Namespace::CudaMgr::min_num_mps_for_all_devices
private

Definition at line 264 of file CudaMgr.h.

Referenced by getMinNumMPsForAllDevices().

size_t CudaMgr_Namespace::CudaMgr::min_shared_memory_per_block_for_all_devices
private

Definition at line 263 of file CudaMgr.h.

Referenced by getMinSharedMemoryPerBlockForAllDevices().

int CudaMgr_Namespace::CudaMgr::start_gpu_
private

Definition at line 262 of file CudaMgr.h.

Referenced by allocateDeviceMem(), CudaMgr(), and getStartGpu().


The documentation for this class was generated from the following files:

CudaMgr.h
CudaMgr.cpp
CudaMgrNoCuda.cpp