OmniSciDB  f17484ade4
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CudaMgr.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cstdlib>
20 #include <map>
21 #include <mutex>
22 #include <string>
23 #include <vector>
24 
25 #include "Logger/Logger.h"
26 #include "Shared/DeviceGroup.h"
27 
28 #ifdef HAVE_CUDA
29 #include <cuda.h>
30 #else
31 #include "Shared/nocuda.h"
32 #endif // HAVE_CUDA
33 
34 namespace CudaMgr_Namespace {
35 
using DeviceMemoryPtrConstant = uint64_t;

// Bookkeeping record for a single device allocation, keyed by its device
// pointer value in DeviceMemoryAllocationMap.
// NOTE(review): the struct header and the original lines 41-42 were dropped by
// the documentation extraction; additional members may exist upstream — verify
// against the real CudaMgr.h before relying on the exact member list.
struct DeviceMemoryMetadata {
  uint64_t size;    // allocation size (presumably bytes — confirm upstream)
  uint64_t handle;  // driver-side allocation handle backing this pointer
};

using DeviceMemoryAllocationMap = std::map<DeviceMemoryPtrConstant, DeviceMemoryMetadata>;
45 
// Nvidia GPU architecture generations, used to pick the PTX/SASS target
// (see deviceArchToSM()) and for capability checks.
enum class NvidiaDeviceArch {
  Kepler,   // compute major = 3
  Maxwell,  // compute major = 5
  Pascal,   // compute major = 6
  Volta,    // compute major = 7, compute minor = 0
  Turing,   // compute major = 7, compute minor = 5
  Ampere    // compute major = 8
};
54 
#ifdef HAVE_CUDA
// Human-readable description of a CUDA driver API status code.
std::string errorMessage(CUresult const);

// Exception that carries the CUDA driver status code which triggered it
// (presumably thrown when a driver call fails — see checkError() in
// CudaMgr.cpp; confirm there).
class CudaErrorException : public std::runtime_error {
 public:
  CudaErrorException(CUresult status);

  // Raw driver status code associated with this exception.
  CUresult getStatus() const { return status_; }

 private:
  CUresult const status_;
};
#endif  // HAVE_CUDA
68 
// Per-GPU properties captured at startup by CudaMgr (filled in
// fillDeviceProperties(), defined in CudaMgr.cpp).
// NOTE(review): the struct header and several hyperlinked member lines were
// dropped by the documentation extraction. computeMajor/computeMinor are
// restored here because CudaMgr::isArch*() and getDeviceArch() read them;
// other lost members (e.g. a CUdevice handle, shared-memory sizes) must be
// confirmed against upstream CudaMgr.h.
struct DeviceProperties {
  int computeMajor;    // CUDA compute capability, major version
  int computeMinor;    // CUDA compute capability, minor version
  size_t globalMem;    // total global memory on the device
  int numMPs;          // number of streaming multiprocessors
  int warpSize;        // threads per warp
  int pciBusId;
  int memoryBusWidth;  // in bits
  int clockKhz;
  int numCore;
};
92 
93 class CudaMgr {
94  public:
95  CudaMgr(const int num_gpus, const int start_gpu = 0);
96  virtual ~CudaMgr();
97 
98  void synchronizeDevices() const;
99  int getDeviceCount() const { return device_count_; }
100  int getStartGpu() const { return start_gpu_; }
102  size_t computePaddedBufferSize(size_t buf_size, size_t granularity) const;
103  size_t getGranularity(const int device_num) const;
104 
105  void copyHostToDevice(int8_t* device_ptr,
106  const int8_t* host_ptr,
107  const size_t num_bytes,
108  const int device_num,
109  CUstream cuda_stream = 0);
110  void copyDeviceToHost(int8_t* host_ptr,
111  const int8_t* device_ptr,
112  const size_t num_bytes,
113  CUstream cuda_stream = 0);
114  void copyDeviceToDevice(int8_t* dest_ptr,
115  int8_t* src_ptr,
116  const size_t num_bytes,
117  const int dest_device_num,
118  const int src_device_num,
119  CUstream cuda_stream = 0);
120 
121  int8_t* allocatePinnedHostMem(const size_t num_bytes);
122  virtual int8_t* allocateDeviceMem(const size_t num_bytes, const int device_num);
123  void freePinnedHostMem(int8_t* host_ptr);
124  void freeDeviceMem(int8_t* device_ptr);
125  void zeroDeviceMem(int8_t* device_ptr,
126  const size_t num_bytes,
127  const int device_num,
128  CUstream cuda_stream = 0);
129  void setDeviceMem(int8_t* device_ptr,
130  const unsigned char uc,
131  const size_t num_bytes,
132  const int device_num,
133  CUstream cuda_stream = 0);
134 
137  }
138 
140 
141  const std::vector<DeviceProperties>& getAllDeviceProperties() const {
142  return device_properties_;
143  }
144  const DeviceProperties* getDeviceProperties(const size_t device_num) const {
145  // device_num is the device number relative to start_gpu_ (real_device_num -
146  // start_gpu_)
147  if (device_num < device_properties_.size()) {
148  return &device_properties_[device_num];
149  }
150  throw std::runtime_error("Specified device number " + std::to_string(device_num) +
151  " is out of range of number of devices (" +
152  std::to_string(device_properties_.size()) + ")");
153  }
154  inline bool isArchMaxwell() const {
155  return (getDeviceCount() > 0 && device_properties_[0].computeMajor == 5);
156  }
157  inline bool isArchMaxwellOrLater() const {
158  return (getDeviceCount() > 0 && device_properties_[0].computeMajor >= 5);
159  }
160  inline bool isArchPascal() const {
161  return (getDeviceCount() > 0 && device_properties_[0].computeMajor == 6);
162  }
163  inline bool isArchPascalOrLater() const {
164  return (getDeviceCount() > 0 && device_properties_[0].computeMajor >= 6);
165  }
166  bool isArchMaxwellOrLaterForAll() const;
167  bool isArchVoltaOrGreaterForAll() const;
168 
169  static std::string deviceArchToSM(const NvidiaDeviceArch arch) {
170  // Must match ${CUDA_COMPILATION_ARCH} CMAKE flag
171  switch (arch) {
173  return "sm_35";
175  return "sm_50";
177  return "sm_60";
179  return "sm_70";
181  return "sm_75";
183  return "sm_75";
184  default:
185  LOG(WARNING) << "Unrecognized Nvidia device architecture, falling back to "
186  "Kepler-compatibility.";
187  return "sm_35";
188  }
189  UNREACHABLE();
190  return "";
191  }
192 
194  if (device_properties_.size() > 0) {
195  const auto& device_properties = device_properties_.front();
196  switch (device_properties.computeMajor) {
197  case 3:
199  case 5:
201  case 6:
203  case 7:
204  if (device_properties.computeMinor < 5) {
206  } else {
208  }
209  case 8:
211  default:
213  }
214  } else {
215  // always fallback to Kepler if an architecture cannot be detected
217  }
218  }
219 
220  void setContext(const int device_num) const;
221  int getContext() const;
222 
223 #ifdef HAVE_CUDA
224 
225  void logDeviceProperties() const;
226 
227  const std::vector<CUcontext>& getDeviceContexts() const {
228  return device_contexts_;
229  }
230  const int getGpuDriverVersion() const {
231  return gpu_driver_version_;
232  }
233 
234  void loadGpuModuleData(CUmodule* module,
235  const void* image,
236  unsigned int num_options,
237  CUjit_option* options,
238  void** option_values,
239  const int device_id) const;
240  void unloadGpuModuleData(CUmodule* module, const int device_id) const;
241 
242  struct CudaMemoryUsage {
243  size_t free; // available GPU RAM memory on active card in bytes
244  size_t total; // total GPU RAM memory on active card in bytes
245  };
246 
247  static CudaMemoryUsage getCudaMemoryUsage();
248 #endif
249 
250  private:
251 #ifdef HAVE_CUDA
252  void fillDeviceProperties();
253  void initDeviceGroup();
254  void createDeviceContexts();
255  size_t computeMinSharedMemoryPerBlockForAllDevices() const;
256  size_t computeMinNumMPsForAllDevices() const;
257  void checkError(CUresult cu_result) const;
258 
259  int gpu_driver_version_;
260 #endif
261 
266  std::vector<DeviceProperties> device_properties_;
268  std::vector<CUcontext> device_contexts_;
270  mutable std::mutex device_mutex_;
271 };
272 
}  // namespace CudaMgr_Namespace

// Locate the CUDA toolkit home and the libdevice directory on the host
// (defined in CudaMgr.cpp).
extern std::string get_cuda_home(void);
extern std::string get_cuda_libdevice_dir(void);
void copyHostToDevice(int8_t *device_ptr, const int8_t *host_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:124
size_t min_num_mps_for_all_devices
Definition: CudaMgr.h:265
std::string get_cuda_libdevice_dir(void)
Definition: CudaMgr.cpp:578
int CUjit_option
Definition: nocuda.h:26
heavyai::DeviceGroup device_group_
Definition: CudaMgr.h:267
std::mutex device_mutex_
Definition: CudaMgr.h:270
std::vector< DeviceIdentifier > DeviceGroup
Definition: DeviceGroup.h:31
size_t getGranularity(const int device_num) const
Definition: CudaMgr.cpp:106
void * CUstream
Definition: nocuda.h:23
#define LOG(tag)
Definition: Logger.h:285
void setDeviceMem(int8_t *device_ptr, const unsigned char uc, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:396
int8_t * allocatePinnedHostMem(const size_t num_bytes)
Definition: CudaMgr.cpp:305
DeviceMemoryAllocationMap device_memory_allocation_map_
Definition: CudaMgr.h:269
void setContext(const int device_num) const
Definition: CudaMgr.cpp:489
bool isArchPascalOrLater() const
Definition: CudaMgr.h:163
#define UNREACHABLE()
Definition: Logger.h:338
size_t min_shared_memory_per_block_for_all_devices
Definition: CudaMgr.h:264
uint64_t DeviceMemoryPtrConstant
Definition: CudaMgr.h:36
size_t getMinNumMPsForAllDevices() const
Definition: CudaMgr.h:139
int getStartGpu() const
Definition: CudaMgr.h:100
std::string to_string(char const *&&v)
std::vector< CUcontext > device_contexts_
Definition: CudaMgr.h:268
std::string get_cuda_home(void)
Definition: CudaMgr.cpp:549
void freeDeviceMem(int8_t *device_ptr)
Definition: CudaMgr.cpp:367
std::string errorMessage(CUresult const status)
Definition: CudaMgr.cpp:40
int getDeviceCount() const
Definition: CudaMgr.h:99
void copyDeviceToDevice(int8_t *dest_ptr, int8_t *src_ptr, const size_t num_bytes, const int dest_device_num, const int src_device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:165
size_t getMinSharedMemoryPerBlockForAllDevices() const
Definition: CudaMgr.h:135
size_t computePaddedBufferSize(size_t buf_size, size_t granularity) const
Definition: CudaMgr.cpp:102
static std::string deviceArchToSM(const NvidiaDeviceArch arch)
Definition: CudaMgr.h:169
bool isArchMaxwellOrLaterForAll() const
Definition: CudaMgr.cpp:415
int CUresult
Definition: nocuda.h:21
void copyDeviceToHost(int8_t *host_ptr, const int8_t *device_ptr, const size_t num_bytes, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:140
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:266
int CUdevice
Definition: nocuda.h:20
std::map< DeviceMemoryPtrConstant, DeviceMemoryMetadata > DeviceMemoryAllocationMap
Definition: CudaMgr.h:44
void freePinnedHostMem(int8_t *host_ptr)
void synchronizeDevices() const
Definition: CudaMgr.cpp:117
void zeroDeviceMem(int8_t *device_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:389
const DeviceProperties * getDeviceProperties(const size_t device_num) const
Definition: CudaMgr.h:144
bool isArchMaxwell() const
Definition: CudaMgr.h:154
const heavyai::DeviceGroup & getDeviceGroup() const
Definition: CudaMgr.h:101
bool isArchPascal() const
Definition: CudaMgr.h:160
CudaMgr(const int num_gpus, const int start_gpu=0)
Definition: CudaMgr.cpp:48
const std::vector< DeviceProperties > & getAllDeviceProperties() const
Definition: CudaMgr.h:141
bool isArchVoltaOrGreaterForAll() const
Definition: CudaMgr.cpp:428
NvidiaDeviceArch getDeviceArch() const
Definition: CudaMgr.h:193
bool isArchMaxwellOrLater() const
Definition: CudaMgr.h:157
void * CUmodule
Definition: nocuda.h:24
virtual int8_t * allocateDeviceMem(const size_t num_bytes, const int device_num)
Definition: CudaMgr.cpp:312