OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CudaMgr.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cstdlib>
20 #include <map>
21 #include <mutex>
22 #include <string>
23 #include <vector>
24 
26 #include "Logger/Logger.h"
27 #include "Shared/DeviceGroup.h"
28 
29 #ifdef HAVE_CUDA
30 #include <cuda.h>
31 #else
32 #include "Shared/nocuda.h"
33 #endif // HAVE_CUDA
34 
35 namespace CudaMgr_Namespace {
36 
// Nvidia GPU hardware generations, ordered oldest to newest.  Each value
// corresponds to the CUDA compute-capability major/minor noted beside it and
// drives the SM target selection in CudaMgr::deviceArchToSM().
enum class NvidiaDeviceArch {
  Kepler,   // compute major = 3
  Maxwell,  // compute major = 5
  Pascal,   // compute major = 6
  Volta,    // compute major = 7, compute minor = 0
  Turing,   // compute major = 7, compute minor = 5
  Ampere    // compute major = 8
};
45 
#ifdef HAVE_CUDA
// Renders a CUDA driver API status code as a human-readable message
// (implemented in CudaMgr.cpp).
std::string errorMessage(CUresult const);

// Exception thrown when a CUDA driver API call fails.  It carries the
// original CUresult so callers can inspect the exact failure reason via
// getStatus() in addition to the what() text from std::runtime_error.
class CudaErrorException : public std::runtime_error {
 public:
  CudaErrorException(CUresult status);

  // Driver status code captured at construction time.
  CUresult getStatus() const { return status_; }

 private:
  CUresult const status_;
};
#endif  // HAVE_CUDA
59 
65  size_t globalMem;
69  int numMPs;
70  int warpSize;
74  int pciBusId;
77  int memoryBusWidth; // in bits
79  int clockKhz;
80  int numCore;
82 };
83 
84 class CudaMgr {
85  public:
86  CudaMgr(const int num_gpus, const int start_gpu = 0);
87  virtual ~CudaMgr();
88 
89  void synchronizeDevices() const;
90  int getDeviceCount() const { return device_count_; }
91  int getStartGpu() const { return start_gpu_; }
93  size_t computePaddedBufferSize(size_t buf_size, size_t granularity) const;
94  size_t getGranularity(const int device_num) const;
95 
96  void copyHostToDevice(int8_t* device_ptr,
97  const int8_t* host_ptr,
98  const size_t num_bytes,
99  const int device_num,
100  CUstream cuda_stream = 0);
101  void copyDeviceToHost(int8_t* host_ptr,
102  const int8_t* device_ptr,
103  const size_t num_bytes,
104  CUstream cuda_stream = 0);
105  void copyDeviceToDevice(int8_t* dest_ptr,
106  int8_t* src_ptr,
107  const size_t num_bytes,
108  const int dest_device_num,
109  const int src_device_num,
110  CUstream cuda_stream = 0);
111 
112  int8_t* allocatePinnedHostMem(const size_t num_bytes);
113  virtual int8_t* allocateDeviceMem(const size_t num_bytes,
114  const int device_num,
115  const bool is_slab = false);
116  void freePinnedHostMem(int8_t* host_ptr);
117  void freeDeviceMem(int8_t* device_ptr);
118  void zeroDeviceMem(int8_t* device_ptr,
119  const size_t num_bytes,
120  const int device_num,
121  CUstream cuda_stream = 0);
122  void setDeviceMem(int8_t* device_ptr,
123  const unsigned char uc,
124  const size_t num_bytes,
125  const int device_num,
126  CUstream cuda_stream = 0);
127 
130  }
131 
133 
134  const std::vector<DeviceProperties>& getAllDeviceProperties() const {
135  return device_properties_;
136  }
137  const DeviceProperties* getDeviceProperties(const size_t device_num) const {
138  // device_num is the device number relative to start_gpu_ (real_device_num -
139  // start_gpu_)
140  if (device_num < device_properties_.size()) {
141  return &device_properties_[device_num];
142  }
143  throw std::runtime_error("Specified device number " + std::to_string(device_num) +
144  " is out of range of number of devices (" +
145  std::to_string(device_properties_.size()) + ")");
146  }
147  inline bool isArchMaxwell() const {
148  return (getDeviceCount() > 0 && device_properties_[0].computeMajor == 5);
149  }
150  inline bool isArchMaxwellOrLater() const {
151  return (getDeviceCount() > 0 && device_properties_[0].computeMajor >= 5);
152  }
153  inline bool isArchPascal() const {
154  return (getDeviceCount() > 0 && device_properties_[0].computeMajor == 6);
155  }
156  inline bool isArchPascalOrLater() const {
157  return (getDeviceCount() > 0 && device_properties_[0].computeMajor >= 6);
158  }
159  bool isArchMaxwellOrLaterForAll() const;
160  bool isArchVoltaOrGreaterForAll() const;
161 
162  static std::string deviceArchToSM(const NvidiaDeviceArch arch) {
163  // Must match ${CUDA_COMPILATION_ARCH} CMAKE flag
164  switch (arch) {
166  return "sm_35";
168  return "sm_50";
170  return "sm_60";
172  return "sm_70";
174  return "sm_75";
176  return "sm_75";
177  default:
178  LOG(WARNING) << "Unrecognized Nvidia device architecture, falling back to "
179  "Kepler-compatibility.";
180  return "sm_35";
181  }
182  UNREACHABLE();
183  return "";
184  }
185 
187  if (device_properties_.size() > 0) {
188  const auto& device_properties = device_properties_.front();
189  switch (device_properties.computeMajor) {
190  case 3:
192  case 5:
194  case 6:
196  case 7:
197  if (device_properties.computeMinor < 5) {
199  } else {
201  }
202  case 8:
204  default:
206  }
207  } else {
208  // always fallback to Kepler if an architecture cannot be detected
210  }
211  }
212 
213  void setContext(const int device_num) const;
214  int getContext() const;
215 
216 #ifdef HAVE_CUDA
217 
218  void logDeviceProperties() const;
219 
220  const std::vector<CUcontext>& getDeviceContexts() const {
221  return device_contexts_;
222  }
223  const int getGpuDriverVersion() const {
224  return gpu_driver_version_;
225  }
226 
227  void loadGpuModuleData(CUmodule* module,
228  const void* image,
229  unsigned int num_options,
230  CUjit_option* options,
231  void** option_values,
232  const int device_id) const;
233  void unloadGpuModuleData(CUmodule* module, const int device_id) const;
234 
235  struct CudaMemoryUsage {
236  size_t free; // available GPU RAM memory on active card in bytes
237  size_t total; // total GPU RAM memory on active card in bytes
238  };
239 
240  std::vector<CudaMgr::CudaMemoryUsage> getCudaMemoryUsage();
241 
242  std::string getCudaMemoryUsageInString();
243 
244  DeviceMemoryAllocationMap& getDeviceMemoryAllocationMap();
245  int exportHandle(const uint64_t handle) const;
246 
247 #endif
248 
249  private:
250 #ifdef HAVE_CUDA
251  void fillDeviceProperties();
252  void initDeviceGroup();
253  void createDeviceContexts();
254  size_t computeMinSharedMemoryPerBlockForAllDevices() const;
255  size_t computeMinNumMPsForAllDevices() const;
256  void checkError(CUresult cu_result) const;
257 
258  int gpu_driver_version_;
259 #endif
260 
265  std::vector<DeviceProperties> device_properties_;
267  std::vector<CUcontext> device_contexts_;
268  mutable std::mutex device_mutex_;
269 
270 #ifdef HAVE_CUDA
271  DeviceMemoryAllocationMapUqPtr device_memory_allocation_map_;
272 #endif
273 };
274 
 275 } // namespace CudaMgr_Namespace
276 
277 extern std::string get_cuda_home(void);
278 extern std::string get_cuda_libdevice_dir(void);
void copyHostToDevice(int8_t *device_ptr, const int8_t *host_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:127
size_t min_num_mps_for_all_devices
Definition: CudaMgr.h:264
std::string get_cuda_libdevice_dir(void)
Definition: CudaMgr.cpp:612
int CUjit_option
Definition: nocuda.h:26
heavyai::DeviceGroup device_group_
Definition: CudaMgr.h:266
std::mutex device_mutex_
Definition: CudaMgr.h:268
std::vector< DeviceIdentifier > DeviceGroup
Definition: DeviceGroup.h:31
size_t getGranularity(const int device_num) const
Definition: CudaMgr.cpp:109
void * CUstream
Definition: nocuda.h:23
#define LOG(tag)
Definition: Logger.h:285
void setDeviceMem(int8_t *device_ptr, const unsigned char uc, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:418
int8_t * allocatePinnedHostMem(const size_t num_bytes)
Definition: CudaMgr.cpp:326
void setContext(const int device_num) const
Definition: CudaMgr.cpp:511
bool isArchPascalOrLater() const
Definition: CudaMgr.h:156
#define UNREACHABLE()
Definition: Logger.h:338
size_t min_shared_memory_per_block_for_all_devices
Definition: CudaMgr.h:263
size_t getMinNumMPsForAllDevices() const
Definition: CudaMgr.h:132
int getStartGpu() const
Definition: CudaMgr.h:91
std::string to_string(char const *&&v)
std::vector< CUcontext > device_contexts_
Definition: CudaMgr.h:267
std::string get_cuda_home(void)
Definition: CudaMgr.cpp:583
void freeDeviceMem(int8_t *device_ptr)
Definition: CudaMgr.cpp:392
std::string errorMessage(CUresult const status)
Definition: CudaMgr.cpp:40
int getDeviceCount() const
Definition: CudaMgr.h:90
void copyDeviceToDevice(int8_t *dest_ptr, int8_t *src_ptr, const size_t num_bytes, const int dest_device_num, const int src_device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:164
size_t getMinSharedMemoryPerBlockForAllDevices() const
Definition: CudaMgr.h:128
size_t computePaddedBufferSize(size_t buf_size, size_t granularity) const
Definition: CudaMgr.cpp:105
static std::string deviceArchToSM(const NvidiaDeviceArch arch)
Definition: CudaMgr.h:162
bool isArchMaxwellOrLaterForAll() const
Definition: CudaMgr.cpp:437
int CUresult
Definition: nocuda.h:21
void copyDeviceToHost(int8_t *host_ptr, const int8_t *device_ptr, const size_t num_bytes, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:143
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:265
int CUdevice
Definition: nocuda.h:20
void freePinnedHostMem(int8_t *host_ptr)
virtual int8_t * allocateDeviceMem(const size_t num_bytes, const int device_num, const bool is_slab=false)
Definition: CudaMgr.cpp:333
void synchronizeDevices() const
Definition: CudaMgr.cpp:120
void zeroDeviceMem(int8_t *device_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:411
const DeviceProperties * getDeviceProperties(const size_t device_num) const
Definition: CudaMgr.h:137
bool isArchMaxwell() const
Definition: CudaMgr.h:147
const heavyai::DeviceGroup & getDeviceGroup() const
Definition: CudaMgr.h:92
bool isArchPascal() const
Definition: CudaMgr.h:153
CudaMgr(const int num_gpus, const int start_gpu=0)
Definition: CudaMgr.cpp:48
std::unique_ptr< DeviceMemoryAllocationMap > DeviceMemoryAllocationMapUqPtr
const std::vector< DeviceProperties > & getAllDeviceProperties() const
Definition: CudaMgr.h:134
bool isArchVoltaOrGreaterForAll() const
Definition: CudaMgr.cpp:450
NvidiaDeviceArch getDeviceArch() const
Definition: CudaMgr.h:186
bool isArchMaxwellOrLater() const
Definition: CudaMgr.h:150
void * CUmodule
Definition: nocuda.h:24