OmniSciDB  2c44a3935d
CudaMgr.h
/*
 * Copyright 2018 OmniSci, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#pragma once

#include <cstdlib>
#include <mutex>
#include <string>
#include <vector>

#include "Shared/Logger.h"
#include "Shared/uuid.h"

#ifdef HAVE_CUDA
#include <cuda.h>
#else
#include "Shared/nocuda.h"
#endif  // HAVE_CUDA

namespace omnisci {
struct DeviceIdentifier {
  const int index;    //!< index into device group (currently num_gpus - start_gpu)
  const int cuda_id;  //!< Cuda ID for device (ignores start_gpu)
  const UUID uuid;    //!< UUID for device (hardware invariant)
};

using DeviceGroup = std::vector<DeviceIdentifier>;
}  // namespace omnisci

namespace CudaMgr_Namespace {

enum class NvidiaDeviceArch {
  Kepler,   // compute major = 3
  Maxwell,  // compute major = 5
  Pascal,   // compute major = 6
  Volta,    // compute major = 7, compute minor = 0
  Turing    // compute major = 7, compute minor = 5
};

#ifdef HAVE_CUDA
std::string errorMessage(CUresult const);

class CudaErrorException : public std::runtime_error {
 public:
  CudaErrorException(CUresult status);

  CUresult getStatus() const { return status_; }

 private:
  CUresult const status_;
};
#endif

struct DeviceProperties {
  CUdevice device;
  int computeMajor;
  int computeMinor;
  size_t globalMem;
  int numMPs;
  int warpSize;
  int pciBusId;
  int memoryBusWidth;  // in bits
  int clockKhz;
  int numCore;
};

class CudaMgr {
 public:
  CudaMgr(const int num_gpus, const int start_gpu = 0);
  ~CudaMgr();

  void synchronizeDevices() const;
  int getDeviceCount() const { return device_count_; }
  int getStartGpu() const { return start_gpu_; }
  const omnisci::DeviceGroup& getDeviceGroup() const { return device_group_; }

  void copyHostToDevice(int8_t* device_ptr,
                        const int8_t* host_ptr,
                        const size_t num_bytes,
                        const int device_num);
  void copyDeviceToHost(int8_t* host_ptr,
                        const int8_t* device_ptr,
                        const size_t num_bytes,
                        const int device_num);
  void copyDeviceToDevice(int8_t* dest_ptr,
                          int8_t* src_ptr,
                          const size_t num_bytes,
                          const int dest_device_num,
                          const int src_device_num);

  int8_t* allocatePinnedHostMem(const size_t num_bytes);
  int8_t* allocateDeviceMem(const size_t num_bytes, const int device_num);
  void freePinnedHostMem(int8_t* host_ptr);
  void freeDeviceMem(int8_t* device_ptr);
  void zeroDeviceMem(int8_t* device_ptr, const size_t num_bytes, const int device_num);
  void setDeviceMem(int8_t* device_ptr,
                    const unsigned char uc,
                    const size_t num_bytes,
                    const int device_num);

  size_t getMinSharedMemoryPerBlockForAllDevices() const {
    return min_shared_memory_per_block_for_all_devices;
  }

  size_t getMinNumMPsForAllDevices() const { return min_num_mps_for_all_devices; }

  const std::vector<DeviceProperties>& getAllDeviceProperties() const {
    return device_properties_;
  }
  const DeviceProperties* getDeviceProperties(const size_t device_num) const {
    // device_num is the device number relative to start_gpu_ (real_device_num -
    // start_gpu_)
    if (device_num < device_properties_.size()) {
      return &device_properties_[device_num];
    }
    throw std::runtime_error("Specified device number " + std::to_string(device_num) +
                             " is out of range of number of devices (" +
                             std::to_string(device_properties_.size()) + ")");
  }
  inline bool isArchMaxwell() const {
    return (getDeviceCount() > 0 && device_properties_[0].computeMajor == 5);
  }
  inline bool isArchMaxwellOrLater() const {
    return (getDeviceCount() > 0 && device_properties_[0].computeMajor >= 5);
  }
  inline bool isArchPascal() const {
    return (getDeviceCount() > 0 && device_properties_[0].computeMajor == 6);
  }
  inline bool isArchPascalOrLater() const {
    return (getDeviceCount() > 0 && device_properties_[0].computeMajor >= 6);
  }
  bool isArchMaxwellOrLaterForAll() const;
  bool isArchVoltaForAll() const;

  static std::string deviceArchToSM(const NvidiaDeviceArch arch) {
    // Must match ${CUDA_COMPILATION_ARCH} CMAKE flag
    switch (arch) {
      case NvidiaDeviceArch::Kepler:
        return "sm_35";
      case NvidiaDeviceArch::Maxwell:
        return "sm_50";
      case NvidiaDeviceArch::Pascal:
        return "sm_60";
      case NvidiaDeviceArch::Volta:
        return "sm_70";
      case NvidiaDeviceArch::Turing:
        return "sm_75";
      default:
        LOG(WARNING) << "Unrecognized Nvidia device architecture, falling back to "
                        "Kepler-compatibility.";
        return "sm_35";
    }
    UNREACHABLE();
    return "";
  }

  NvidiaDeviceArch getDeviceArch() const {
    if (device_properties_.size() > 0) {
      const auto& device_properties = device_properties_.front();
      switch (device_properties.computeMajor) {
        case 3:
          return NvidiaDeviceArch::Kepler;
        case 5:
          return NvidiaDeviceArch::Maxwell;
        case 6:
          return NvidiaDeviceArch::Pascal;
        case 7:
          if (device_properties.computeMinor == 0) {
            return NvidiaDeviceArch::Volta;
          } else {
            return NvidiaDeviceArch::Turing;
          }
        default:
          return NvidiaDeviceArch::Kepler;
      }
    } else {
      // always fall back to Kepler if an architecture cannot be detected
      return NvidiaDeviceArch::Kepler;
    }
  }

  void setContext(const int device_num) const;

#ifdef HAVE_CUDA

  void printDeviceProperties() const;

  const std::vector<CUcontext>& getDeviceContexts() const { return device_contexts_; }
  const int getGpuDriverVersion() const { return gpu_driver_version_; }

  void loadGpuModuleData(CUmodule* module,
                         const void* image,
                         unsigned int num_options,
                         CUjit_option* options,
                         void** option_values,
                         const int device_id) const;
  void unloadGpuModuleData(CUmodule* module, const int device_id) const;

  struct CudaMemoryUsage {
    size_t free;   // available GPU RAM memory on active card in bytes
    size_t total;  // total GPU RAM memory on active card in bytes
  };

  static CudaMemoryUsage getCudaMemoryUsage();
#endif

 private:
#ifdef HAVE_CUDA
  void fillDeviceProperties();
  void initDeviceGroup();
  void createDeviceContexts();
  size_t computeMinSharedMemoryPerBlockForAllDevices() const;
  size_t computeMinNumMPsForAllDevices() const;
  void checkError(CUresult cu_result) const;

  int gpu_driver_version_;
#endif

  int device_count_;
  int start_gpu_;
  size_t min_shared_memory_per_block_for_all_devices;
  size_t min_num_mps_for_all_devices;
  std::vector<DeviceProperties> device_properties_;
  omnisci::DeviceGroup device_group_;
  std::vector<CUcontext> device_contexts_;

  mutable std::mutex device_cleanup_mutex_;
};

}  // namespace CudaMgr_Namespace
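
Usage sketch (not part of the header above): the snippet below shows how the public CudaMgr API declared in this file might be exercised to query device properties and round-trip a small buffer through GPU memory. It is a minimal sketch, assuming a CUDA-enabled build of OmniSciDB, that the header is reachable as "CudaMgr/CudaMgr.h", and that the program is linked against the CudaMgr library; the constructor's num_gpus/start_gpu semantics are implemented in CudaMgr.cpp and are taken here at face value.

// usage_sketch.cpp -- illustrative only, not shipped with OmniSciDB
#include <cstring>
#include <iostream>
#include <vector>

#include "CudaMgr/CudaMgr.h"  // assumed include path

int main() {
  try {
    // Manage a single GPU, starting at CUDA device 0.
    CudaMgr_Namespace::CudaMgr cuda_mgr(1, 0);

    std::cout << "devices managed: " << cuda_mgr.getDeviceCount() << '\n'
              << "target ISA: "
              << CudaMgr_Namespace::CudaMgr::deviceArchToSM(cuda_mgr.getDeviceArch())
              << '\n';

    for (const auto& prop : cuda_mgr.getAllDeviceProperties()) {
      std::cout << "SM " << prop.computeMajor << '.' << prop.computeMinor
                << "  MPs=" << prop.numMPs << "  globalMem=" << prop.globalMem << '\n';
    }

    // Round-trip a buffer through device 0. Device numbers are relative to
    // start_gpu_, as documented in getDeviceProperties().
    const size_t num_bytes = 64;
    std::vector<int8_t> host_in(num_bytes, 7);
    std::vector<int8_t> host_out(num_bytes, 0);

    int8_t* dev_ptr = cuda_mgr.allocateDeviceMem(num_bytes, /*device_num=*/0);
    cuda_mgr.copyHostToDevice(dev_ptr, host_in.data(), num_bytes, /*device_num=*/0);
    cuda_mgr.copyDeviceToHost(host_out.data(), dev_ptr, num_bytes, /*device_num=*/0);
    cuda_mgr.freeDeviceMem(dev_ptr);

    std::cout << (std::memcmp(host_in.data(), host_out.data(), num_bytes) == 0
                      ? "round trip ok"
                      : "round trip mismatch")
              << '\n';
  } catch (const std::exception& e) {
    // CudaErrorException derives from std::runtime_error, so it is caught here too.
    std::cerr << "CudaMgr error: " << e.what() << '\n';
    return 1;
  }
  return 0;
}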