OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CudaMgr.h
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #pragma once
18 
19 #include <cstdlib>
20 #include <mutex>
21 #include <string>
22 #include <vector>
23 
24 #include "Logger/Logger.h"
25 #include "Shared/DeviceGroup.h"
26 
27 #ifdef HAVE_CUDA
28 #include <cuda.h>
29 #else
30 #include "Shared/nocuda.h"
31 #endif // HAVE_CUDA
32 
33 namespace CudaMgr_Namespace {
34 
// NVIDIA GPU architecture generations that CudaMgr distinguishes. The trailing
// comment on each enumerator records the CUDA compute capability it stands for.
35 enum class NvidiaDeviceArch {
36  Kepler, // compute major = 3
37  Maxwell, // compute major = 5
38  Pascal, // compute major = 6
39  Volta, // compute major = 7, compute minor = 0
40  Turing, // compute major = 7, compute minor = 5
41  Ampere // compute major = 8
42 };
43 
44 #ifdef HAVE_CUDA
// Maps a CUresult status code to a std::string. Declaration only; presumably the
// string is a human-readable description of the error -- confirm against the
// definition, which is not visible here.
45 std::string errorMessage(CUresult const);
46 
// Exception type derived from std::runtime_error that additionally remembers the
// CUresult status it was constructed from.
// NOTE(review): std::runtime_error is declared in <stdexcept>, which is not among
// the headers included above -- confirm it is reaching this file transitively.
47 class CudaErrorException : public std::runtime_error {
48  public:
// Constructor is only declared here; how the what() message is built from
// `status` is not visible in this header.
49  CudaErrorException(CUresult status);
50 
// Returns the status code stored at construction time.
51  CUresult getStatus() const { return status_; }
52 
53  private:
// Immutable after construction (const member).
54  CUresult const status_;
55 };
56 #endif
57 
63  size_t globalMem;
67  int numMPs;
68  int warpSize;
72  int pciBusId;
75  int memoryBusWidth; // in bits
77  int clockKhz;
78  int numCore;
79 };
80 
81 class CudaMgr {
82  public:
// Constructor and destructor, declared only. `start_gpu` defaults to 0 when the
// caller omits it. Presumably `num_gpus` is the number of devices to manage
// starting at `start_gpu` -- confirm against the definition.
83  CudaMgr(const int num_gpus, const int start_gpu = 0);
84  ~CudaMgr();
85 
// Declared only; the synchronization performed is defined outside this header.
86  void synchronizeDevices() const;
// Returns the stored device_count_ member.
87  int getDeviceCount() const { return device_count_; }
// Returns the stored start_gpu_ member.
88  int getStartGpu() const { return start_gpu_; }
90 
// Copy entry points (declarations only; the bodies are not visible here).
// In all three, `cuda_stream` defaults to 0 when omitted.
//
// Going by the parameter names: transfers `num_bytes` bytes from `host_ptr` to
// `device_ptr` for device `device_num` -- confirm against the definition.
91  void copyHostToDevice(int8_t* device_ptr,
92  const int8_t* host_ptr,
93  const size_t num_bytes,
94  const int device_num,
95  CUstream cuda_stream = 0);
// Going by the parameter names: the reverse direction, from `device_ptr` to
// `host_ptr` -- confirm against the definition.
96  void copyDeviceToHost(int8_t* host_ptr,
97  const int8_t* device_ptr,
98  const size_t num_bytes,
99  const int device_num,
100  CUstream cuda_stream = 0);
// Takes separate destination and source device numbers.
// NOTE(review): `src_ptr` is a non-const int8_t* here, whereas the source
// pointers of the two functions above are const -- confirm this is intentional.
101  void copyDeviceToDevice(int8_t* dest_ptr,
102  int8_t* src_ptr,
103  const size_t num_bytes,
104  const int dest_device_num,
105  const int src_device_num,
106  CUstream cuda_stream = 0);
107 
// Allocation, release and fill entry points (declarations only; bodies are not
// visible here). Both allocators return an int8_t* and take a byte count; the
// device allocator additionally takes a device number.
108  int8_t* allocatePinnedHostMem(const size_t num_bytes);
109  int8_t* allocateDeviceMem(const size_t num_bytes, const int device_num);
// Presumably these release pointers obtained from the matching allocator above
// -- confirm. Note freeDeviceMem takes no device number.
110  void freePinnedHostMem(int8_t* host_ptr);
111  void freeDeviceMem(int8_t* device_ptr);
// Going by the name, fills `num_bytes` bytes at `device_ptr` with zero -- confirm.
// `cuda_stream` defaults to 0.
112  void zeroDeviceMem(int8_t* device_ptr,
113  const size_t num_bytes,
114  const int device_num,
115  CUstream cuda_stream = 0);
// Same shape as zeroDeviceMem plus a byte value `uc`; presumably the fill value
// -- confirm. `cuda_stream` defaults to 0.
116  void setDeviceMem(int8_t* device_ptr,
117  const unsigned char uc,
118  const size_t num_bytes,
119  const int device_num,
120  CUstream cuda_stream = 0);
121 
124  }
125 
127 
// Returns a const reference to the internal device_properties_ vector (no copy).
128  const std::vector<DeviceProperties>& getAllDeviceProperties() const {
129  return device_properties_;
130  }
// Returns a pointer to the device_properties_ entry at index `device_num`.
// Throws std::runtime_error when `device_num` is not a valid index; the message
// reports both the requested index and the number of entries. The function
// either returns a pointer to an element or throws; it has no nullptr return.
131  const DeviceProperties* getDeviceProperties(const size_t device_num) const {
132  // device_num is the device number relative to start_gpu_ (real_device_num -
133  // start_gpu_)
134  if (device_num < device_properties_.size()) {
135  return &device_properties_[device_num];
136  }
137  throw std::runtime_error("Specified device number " + std::to_string(device_num) +
138  " is out of range of number of devices (" +
139  std::to_string(device_properties_.size()) + ")");
140  }
// True when getDeviceCount() is positive and the first entry of
// device_properties_ has computeMajor == 5. Only entry 0 is inspected.
141  inline bool isArchMaxwell() const {
142  return (getDeviceCount() > 0 && device_properties_[0].computeMajor == 5);
143  }
// True when getDeviceCount() is positive and the first entry of
// device_properties_ has computeMajor >= 5. Only entry 0 is inspected.
144  inline bool isArchMaxwellOrLater() const {
145  return (getDeviceCount() > 0 && device_properties_[0].computeMajor >= 5);
146  }
// True when getDeviceCount() is positive and the first entry of
// device_properties_ has computeMajor == 6. Only entry 0 is inspected.
147  inline bool isArchPascal() const {
148  return (getDeviceCount() > 0 && device_properties_[0].computeMajor == 6);
149  }
// True when getDeviceCount() is positive and the first entry of
// device_properties_ has computeMajor >= 6. Only entry 0 is inspected.
150  inline bool isArchPascalOrLater() const {
151  return (getDeviceCount() > 0 && device_properties_[0].computeMajor >= 6);
152  }
// Declarations only. Going by the names, these are the all-devices counterparts
// of the single-device checks above -- confirm against the definitions.
153  bool isArchMaxwellOrLaterForAll() const;
154  bool isArchVoltaOrGreaterForAll() const;
155 
156  static std::string deviceArchToSM(const NvidiaDeviceArch arch) {
157  // Must match ${CUDA_COMPILATION_ARCH} CMAKE flag
158  switch (arch) {
160  return "sm_35";
162  return "sm_50";
164  return "sm_60";
166  return "sm_70";
168  return "sm_75";
170  return "sm_75";
171  default:
172  LOG(WARNING) << "Unrecognized Nvidia device architecture, falling back to "
173  "Kepler-compatibility.";
174  return "sm_35";
175  }
176  UNREACHABLE();
177  return "";
178  }
179 
181  if (device_properties_.size() > 0) {
182  const auto& device_properties = device_properties_.front();
183  switch (device_properties.computeMajor) {
184  case 3:
186  case 5:
188  case 6:
190  case 7:
191  if (device_properties.computeMinor < 5) {
193  } else {
195  }
196  case 8:
198  default:
200  }
201  } else {
202  // always fall back to Kepler if an architecture cannot be detected
204  }
205  }
206 
// Declarations only; both are const member functions. Presumably they select and
// query the current context by device number -- confirm against the definitions.
207  void setContext(const int device_num) const;
208  int getContext() const;
209 
210 #ifdef HAVE_CUDA
211 
// Declaration only; available only when HAVE_CUDA is defined. Where the output
// goes is not visible from this header.
212  void printDeviceProperties() const;
213 
// Returns a const reference to the internal device_contexts_ vector (no copy).
214  const std::vector<CUcontext>& getDeviceContexts() const { return device_contexts_; }
// Returns the stored gpu_driver_version_ value.
// NOTE(review): the leading `const` on a by-value int return has no effect.
215  const int getGpuDriverVersion() const { return gpu_driver_version_; }
216 
// Declarations only; bodies are not visible here. `module` is passed by pointer
// in both. Presumably loadGpuModuleData fills *module from `image` using
// `num_options` entries of `options` / `option_values`, and unloadGpuModuleData
// releases it -- confirm against the definitions.
217  void loadGpuModuleData(CUmodule* module,
218  const void* image,
219  unsigned int num_options,
220  CUjit_option* options,
221  void** option_values,
222  const int device_id) const;
223  void unloadGpuModuleData(CUmodule* module, const int device_id) const;
224 
// Pair of byte counts describing GPU memory on the active card; this is the
// return type of the static getCudaMemoryUsage() declared in this class.
225  struct CudaMemoryUsage {
226  size_t free; // available GPU RAM memory on active card in bytes
227  size_t total; // total GPU RAM memory on active card in bytes
228  };
229 
// Static: callable without a CudaMgr instance. Returns a CudaMemoryUsage by
// value. Declaration only; how the figures are obtained is not visible here.
230  static CudaMemoryUsage getCudaMemoryUsage();
231 #endif
232 
233  private:
234 #ifdef HAVE_CUDA
// Private helpers, compiled only when HAVE_CUDA is defined. Declarations only;
// the notes below are inferred from names and should be confirmed against the
// definitions.
// Presumably populate device_properties_, the device group and device_contexts_.
235  void fillDeviceProperties();
236  void initDeviceGroup();
237  void createDeviceContexts();
// Both return size_t and are const; presumably minima taken across all devices.
238  size_t computeMinSharedMemoryPerBlockForAllDevices() const;
239  size_t computeMinNumMPsForAllDevices() const;
// Takes a CUresult; presumably reports a non-success status -- how (throw, log,
// abort) is not visible here.
240  void checkError(CUresult cu_result) const;
241 
// Value handed back by getGpuDriverVersion(); exists only when HAVE_CUDA is
// defined.
242  int gpu_driver_version_;
243 #endif
244 
// Backing store for getAllDeviceProperties() / getDeviceProperties(); indexed by
// device number relative to start_gpu_ (per the comment in getDeviceProperties).
249  std::vector<DeviceProperties> device_properties_;
// Backing store for getDeviceContexts(). Declared outside the HAVE_CUDA guard.
251  std::vector<CUcontext> device_contexts_;
252 
// `mutable` so it can be locked from const member functions. What it guards is
// not visible in this header; presumably device teardown, going by the name.
253  mutable std::mutex device_cleanup_mutex_;
254 };
255 
256 } // Namespace CudaMgr_Namespace
257 
// Free functions at global scope (outside CudaMgr_Namespace), declared here and
// defined elsewhere. Both return std::string; presumably filesystem paths to the
// CUDA installation and its libdevice directory -- confirm against the
// definitions.
258 extern std::string get_cuda_home(void);
259 extern std::string get_cuda_libdevice_dir(void);
void copyHostToDevice(int8_t *device_ptr, const int8_t *host_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:109
size_t min_num_mps_for_all_devices
Definition: CudaMgr.h:248
std::string get_cuda_libdevice_dir(void)
Definition: CudaMgr.cpp:494
int CUjit_option
Definition: nocuda.h:26
heavyai::DeviceGroup device_group_
Definition: CudaMgr.h:250
std::vector< DeviceIdentifier > DeviceGroup
Definition: DeviceGroup.h:31
std::mutex device_cleanup_mutex_
Definition: CudaMgr.h:253
void * CUstream
Definition: nocuda.h:23
#define LOG(tag)
Definition: Logger.h:285
void setDeviceMem(int8_t *device_ptr, const unsigned char uc, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:312
int8_t * allocatePinnedHostMem(const size_t num_bytes)
Definition: CudaMgr.cpp:281
void setContext(const int device_num) const
Definition: CudaMgr.cpp:405
bool isArchPascalOrLater() const
Definition: CudaMgr.h:150
#define UNREACHABLE()
Definition: Logger.h:337
size_t min_shared_memory_per_block_for_all_devices
Definition: CudaMgr.h:247
void copyDeviceToHost(int8_t *host_ptr, const int8_t *device_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:125
size_t getMinNumMPsForAllDevices() const
Definition: CudaMgr.h:126
int getStartGpu() const
Definition: CudaMgr.h:88
std::string to_string(char const *&&v)
std::vector< CUcontext > device_contexts_
Definition: CudaMgr.h:251
std::string get_cuda_home(void)
Definition: CudaMgr.cpp:465
void freeDeviceMem(int8_t *device_ptr)
Definition: CudaMgr.cpp:299
std::string errorMessage(CUresult const status)
Definition: CudaMgr.cpp:41
int getDeviceCount() const
Definition: CudaMgr.h:87
void copyDeviceToDevice(int8_t *dest_ptr, int8_t *src_ptr, const size_t num_bytes, const int dest_device_num, const int src_device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:143
size_t getMinSharedMemoryPerBlockForAllDevices() const
Definition: CudaMgr.h:122
static std::string deviceArchToSM(const NvidiaDeviceArch arch)
Definition: CudaMgr.h:156
bool isArchMaxwellOrLaterForAll() const
Definition: CudaMgr.cpp:331
int CUresult
Definition: nocuda.h:21
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:249
int CUdevice
Definition: nocuda.h:20
void freePinnedHostMem(int8_t *host_ptr)
Definition: CudaMgr.cpp:295
void synchronizeDevices() const
Definition: CudaMgr.cpp:102
void zeroDeviceMem(int8_t *device_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:305
const DeviceProperties * getDeviceProperties(const size_t device_num) const
Definition: CudaMgr.h:131
bool isArchMaxwell() const
Definition: CudaMgr.h:141
const heavyai::DeviceGroup & getDeviceGroup() const
Definition: CudaMgr.h:89
bool isArchPascal() const
Definition: CudaMgr.h:147
CudaMgr(const int num_gpus, const int start_gpu=0)
Definition: CudaMgr.cpp:49
const std::vector< DeviceProperties > & getAllDeviceProperties() const
Definition: CudaMgr.h:128
bool isArchVoltaOrGreaterForAll() const
Definition: CudaMgr.cpp:344
NvidiaDeviceArch getDeviceArch() const
Definition: CudaMgr.h:180
bool isArchMaxwellOrLater() const
Definition: CudaMgr.h:144
void * CUmodule
Definition: nocuda.h:24
int8_t * allocateDeviceMem(const size_t num_bytes, const int device_num)
Definition: CudaMgr.cpp:288