OmniSciDB  085a039ca4
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
CudaMgr.h
Go to the documentation of this file.
1 /*
2  * Copyright 2018 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #pragma once
17 
18 #include <cstdlib>
19 #include <mutex>
20 #include <string>
21 #include <vector>
22 
23 #include "Logger/Logger.h"
24 #include "Shared/DeviceGroup.h"
25 
26 #ifdef HAVE_CUDA
27 #include <cuda.h>
28 #else
29 #include "Shared/nocuda.h"
30 #endif // HAVE_CUDA
31 
32 namespace CudaMgr_Namespace {
33 
/**
 * Nvidia device architectures (SM generations) recognized by CudaMgr,
 * ordered oldest to newest. Used to select the PTX/SASS target string
 * (see CudaMgr::deviceArchToSM) and for per-arch feature checks.
 */
enum class NvidiaDeviceArch {
  Kepler,   // compute capability major = 3
  Maxwell,  // compute capability major = 5
  Pascal,   // compute capability major = 6
  Volta,    // compute capability major = 7, minor = 0
  Turing,   // compute capability major = 7, minor = 5
  Ampere    // compute capability major = 8
};
42 
#ifdef HAVE_CUDA
// Human-readable description for a CUDA driver API status code.
std::string errorMessage(CUresult const);

/**
 * Exception thrown when a CUDA driver API call fails.
 *
 * Carries the raw CUresult so callers can branch on specific statuses
 * (e.g. out-of-memory) rather than parsing the message text.
 */
class CudaErrorException : public std::runtime_error {
 public:
  CudaErrorException(CUresult status);

  // The driver status that triggered this exception.
  CUresult getStatus() const { return status_; }

 private:
  CUresult const status_;
};
#endif  // HAVE_CUDA
56 
62  size_t globalMem;
66  int numMPs;
67  int warpSize;
71  int pciBusId;
74  int memoryBusWidth; // in bits
76  int clockKhz;
77  int numCore;
78 };
79 
80 class CudaMgr {
81  public:
82  CudaMgr(const int num_gpus, const int start_gpu = 0);
83  ~CudaMgr();
84 
85  void synchronizeDevices() const;
86  int getDeviceCount() const { return device_count_; }
87  int getStartGpu() const { return start_gpu_; }
89 
90  void copyHostToDevice(int8_t* device_ptr,
91  const int8_t* host_ptr,
92  const size_t num_bytes,
93  const int device_num,
94  CUstream cuda_stream = 0);
95  void copyDeviceToHost(int8_t* host_ptr,
96  const int8_t* device_ptr,
97  const size_t num_bytes,
98  const int device_num,
99  CUstream cuda_stream = 0);
100  void copyDeviceToDevice(int8_t* dest_ptr,
101  int8_t* src_ptr,
102  const size_t num_bytes,
103  const int dest_device_num,
104  const int src_device_num,
105  CUstream cuda_stream = 0);
106 
107  int8_t* allocatePinnedHostMem(const size_t num_bytes);
108  int8_t* allocateDeviceMem(const size_t num_bytes, const int device_num);
109  void freePinnedHostMem(int8_t* host_ptr);
110  void freeDeviceMem(int8_t* device_ptr);
111  void zeroDeviceMem(int8_t* device_ptr,
112  const size_t num_bytes,
113  const int device_num,
114  CUstream cuda_stream = 0);
115  void setDeviceMem(int8_t* device_ptr,
116  const unsigned char uc,
117  const size_t num_bytes,
118  const int device_num,
119  CUstream cuda_stream = 0);
120 
123  }
124 
126 
127  const std::vector<DeviceProperties>& getAllDeviceProperties() const {
128  return device_properties_;
129  }
130  const DeviceProperties* getDeviceProperties(const size_t device_num) const {
131  // device_num is the device number relative to start_gpu_ (real_device_num -
132  // start_gpu_)
133  if (device_num < device_properties_.size()) {
134  return &device_properties_[device_num];
135  }
136  throw std::runtime_error("Specified device number " + std::to_string(device_num) +
137  " is out of range of number of devices (" +
138  std::to_string(device_properties_.size()) + ")");
139  }
140  inline bool isArchMaxwell() const {
141  return (getDeviceCount() > 0 && device_properties_[0].computeMajor == 5);
142  }
143  inline bool isArchMaxwellOrLater() const {
144  return (getDeviceCount() > 0 && device_properties_[0].computeMajor >= 5);
145  }
146  inline bool isArchPascal() const {
147  return (getDeviceCount() > 0 && device_properties_[0].computeMajor == 6);
148  }
149  inline bool isArchPascalOrLater() const {
150  return (getDeviceCount() > 0 && device_properties_[0].computeMajor >= 6);
151  }
152  bool isArchMaxwellOrLaterForAll() const;
153  bool isArchVoltaOrGreaterForAll() const;
154 
155  static std::string deviceArchToSM(const NvidiaDeviceArch arch) {
156  // Must match ${CUDA_COMPILATION_ARCH} CMAKE flag
157  switch (arch) {
159  return "sm_35";
161  return "sm_50";
163  return "sm_60";
165  return "sm_70";
167  return "sm_75";
169  return "sm_75";
170  default:
171  LOG(WARNING) << "Unrecognized Nvidia device architecture, falling back to "
172  "Kepler-compatibility.";
173  return "sm_35";
174  }
175  UNREACHABLE();
176  return "";
177  }
178 
180  if (device_properties_.size() > 0) {
181  const auto& device_properties = device_properties_.front();
182  switch (device_properties.computeMajor) {
183  case 3:
185  case 5:
187  case 6:
189  case 7:
190  if (device_properties.computeMinor == 0) {
192  } else {
194  }
195  case 8:
197  default:
199  }
200  } else {
201  // always fallback to Kepler if an architecture cannot be detected
203  }
204  }
205 
206  void setContext(const int device_num) const;
207  int getContext() const;
208 
209 #ifdef HAVE_CUDA
210 
211  void printDeviceProperties() const;
212 
213  const std::vector<CUcontext>& getDeviceContexts() const { return device_contexts_; }
214  const int getGpuDriverVersion() const { return gpu_driver_version_; }
215 
216  void loadGpuModuleData(CUmodule* module,
217  const void* image,
218  unsigned int num_options,
219  CUjit_option* options,
220  void** option_values,
221  const int device_id) const;
222  void unloadGpuModuleData(CUmodule* module, const int device_id) const;
223 
224  struct CudaMemoryUsage {
225  size_t free; // available GPU RAM memory on active card in bytes
226  size_t total; // total GPU RAM memory on active card in bytes
227  };
228 
229  static CudaMemoryUsage getCudaMemoryUsage();
230 #endif
231 
232  private:
233 #ifdef HAVE_CUDA
234  void fillDeviceProperties();
235  void initDeviceGroup();
236  void createDeviceContexts();
237  size_t computeMinSharedMemoryPerBlockForAllDevices() const;
238  size_t computeMinNumMPsForAllDevices() const;
239  void checkError(CUresult cu_result) const;
240 
241  int gpu_driver_version_;
242 #endif
243 
248  std::vector<DeviceProperties> device_properties_;
250  std::vector<CUcontext> device_contexts_;
251 
252  mutable std::mutex device_cleanup_mutex_;
253 };
254 
255 } // Namespace CudaMgr_Namespace
256 
257 extern std::string get_cuda_home(void);
void copyHostToDevice(int8_t *device_ptr, const int8_t *host_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:109
size_t min_num_mps_for_all_devices
Definition: CudaMgr.h:247
int CUjit_option
Definition: nocuda.h:26
heavyai::DeviceGroup device_group_
Definition: CudaMgr.h:249
std::vector< DeviceIdentifier > DeviceGroup
Definition: DeviceGroup.h:15
std::mutex device_cleanup_mutex_
Definition: CudaMgr.h:252
void * CUstream
Definition: nocuda.h:23
#define LOG(tag)
Definition: Logger.h:217
void setDeviceMem(int8_t *device_ptr, const unsigned char uc, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:312
int8_t * allocatePinnedHostMem(const size_t num_bytes)
Definition: CudaMgr.cpp:281
void setContext(const int device_num) const
Definition: CudaMgr.cpp:405
bool isArchPascalOrLater() const
Definition: CudaMgr.h:149
#define UNREACHABLE()
Definition: Logger.h:267
size_t min_shared_memory_per_block_for_all_devices
Definition: CudaMgr.h:246
void copyDeviceToHost(int8_t *host_ptr, const int8_t *device_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:125
size_t getMinNumMPsForAllDevices() const
Definition: CudaMgr.h:125
int getStartGpu() const
Definition: CudaMgr.h:87
std::string to_string(char const *&&v)
std::vector< CUcontext > device_contexts_
Definition: CudaMgr.h:250
std::string get_cuda_home(void)
Definition: CudaMgr.cpp:465
void freeDeviceMem(int8_t *device_ptr)
Definition: CudaMgr.cpp:299
std::string errorMessage(CUresult const status)
Definition: CudaMgr.cpp:41
int getDeviceCount() const
Definition: CudaMgr.h:86
void copyDeviceToDevice(int8_t *dest_ptr, int8_t *src_ptr, const size_t num_bytes, const int dest_device_num, const int src_device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:143
size_t getMinSharedMemoryPerBlockForAllDevices() const
Definition: CudaMgr.h:121
static std::string deviceArchToSM(const NvidiaDeviceArch arch)
Definition: CudaMgr.h:155
bool isArchMaxwellOrLaterForAll() const
Definition: CudaMgr.cpp:331
int CUresult
Definition: nocuda.h:21
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:248
int CUdevice
Definition: nocuda.h:20
void freePinnedHostMem(int8_t *host_ptr)
Definition: CudaMgr.cpp:295
void synchronizeDevices() const
Definition: CudaMgr.cpp:102
void zeroDeviceMem(int8_t *device_ptr, const size_t num_bytes, const int device_num, CUstream cuda_stream=0)
Definition: CudaMgr.cpp:305
const DeviceProperties * getDeviceProperties(const size_t device_num) const
Definition: CudaMgr.h:130
bool isArchMaxwell() const
Definition: CudaMgr.h:140
const heavyai::DeviceGroup & getDeviceGroup() const
Definition: CudaMgr.h:88
bool isArchPascal() const
Definition: CudaMgr.h:146
CudaMgr(const int num_gpus, const int start_gpu=0)
Definition: CudaMgr.cpp:49
const std::vector< DeviceProperties > & getAllDeviceProperties() const
Definition: CudaMgr.h:127
bool isArchVoltaOrGreaterForAll() const
Definition: CudaMgr.cpp:344
NvidiaDeviceArch getDeviceArch() const
Definition: CudaMgr.h:179
bool isArchMaxwellOrLater() const
Definition: CudaMgr.h:143
void * CUmodule
Definition: nocuda.h:24
int8_t * allocateDeviceMem(const size_t num_bytes, const int device_num)
Definition: CudaMgr.cpp:288