OmniSciDB  04ee39c94c
CudaMgr.cpp
/*
 * Copyright 2018 OmniSci, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "CudaMgr.h"
#include <algorithm>
#include <cassert>
#include <iostream>
#include <stdexcept>
#include "Shared/Logger.h"

namespace CudaMgr_Namespace {

CudaMgr::CudaMgr(const int num_gpus, const int start_gpu)
    : start_gpu_(start_gpu), max_shared_memory_for_all_(0) {
  checkError(cuInit(0));
  checkError(cuDeviceGetCount(&device_count_));

  if (num_gpus > 0) {  // num_gpus <= 0 means use all of the GPUs found
    CHECK_LE(num_gpus + start_gpu_, device_count_);
    device_count_ = std::min(device_count_, num_gpus);
  } else {
    // if we are using all GPUs we cannot start on a GPU other than 0
    CHECK_EQ(start_gpu_, 0);
  }
  fillDeviceProperties();
  createDeviceContexts();
  printDeviceProperties();
}

CudaMgr::~CudaMgr() {
  try {
    // We don't want to remove the CudaMgr before all other processes have cleaned up.
    // This should be enforced by the lifetime policies, but take this lock to be safe.
    std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);

    synchronizeDevices();
    for (int d = 0; d < device_count_; ++d) {
      checkError(cuCtxDestroy(device_contexts_[d]));
    }
  } catch (const CudaErrorException& e) {
    if (e.getStatus() == CUDA_ERROR_DEINITIALIZED) {
      // TODO(adb / asuhan): Verify cuModuleUnload removes the context
      return;
    }
    LOG(ERROR) << "CUDA Error: " << e.what();
  } catch (const std::runtime_error& e) {
    LOG(ERROR) << "CUDA Error: " << e.what();
  }
}

void CudaMgr::synchronizeDevices() const {
  for (int d = 0; d < device_count_; ++d) {
    setContext(d);
    checkError(cuCtxSynchronize());
  }
}

void CudaMgr::copyHostToDevice(int8_t* device_ptr,
                               const int8_t* host_ptr,
                               const size_t num_bytes,
                               const int device_num) {
  setContext(device_num);
  checkError(
      cuMemcpyHtoD(reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes));
}

void CudaMgr::copyDeviceToHost(int8_t* host_ptr,
                               const int8_t* device_ptr,
                               const size_t num_bytes,
                               const int device_num) {
  setContext(device_num);
  checkError(
      cuMemcpyDtoH(host_ptr, reinterpret_cast<const CUdeviceptr>(device_ptr), num_bytes));
}

void CudaMgr::copyDeviceToDevice(int8_t* dest_ptr,
                                 int8_t* src_ptr,
                                 const size_t num_bytes,
                                 const int dest_device_num,
                                 const int src_device_num) {
  // dest_device_num and src_device_num are the device numbers relative to start_gpu_
  // (real_device_num - start_gpu_)
  if (src_device_num == dest_device_num) {
    setContext(src_device_num);
    checkError(cuMemcpy(reinterpret_cast<CUdeviceptr>(dest_ptr),
                        reinterpret_cast<CUdeviceptr>(src_ptr),
                        num_bytes));
  } else {
    checkError(cuMemcpyPeer(reinterpret_cast<CUdeviceptr>(dest_ptr),
                            device_contexts_[dest_device_num],
                            reinterpret_cast<CUdeviceptr>(src_ptr),
                            device_contexts_[src_device_num],
                            num_bytes));  // will we always have peer?
  }
}
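
// The "will we always have peer?" question above can be answered at startup. A
// minimal sketch (not part of the original file) of such a check, using the
// standard driver API cuDeviceCanAccessPeer; error handling omitted for brevity:
//
//   int can_access = 0;
//   cuDeviceCanAccessPeer(&can_access,
//                         device_properties_[src_device_num].device,
//                         device_properties_[dest_device_num].device);
//
// cuMemcpyPeer still succeeds without peer mappings, but falls back to staging
// the transfer through host memory when can_access == 0.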

void CudaMgr::loadGpuModuleData(CUmodule* module,
                                const void* image,
                                unsigned int num_options,
                                CUjit_option* options,
                                void** option_vals,
                                const int device_id) const {
  setContext(device_id);
  checkError(cuModuleLoadDataEx(module, image, num_options, options, option_vals));
}
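
// A minimal sketch (not part of the original file) of how a caller might use the
// JIT options to capture the compiler's error log; the option names are standard
// CUjit_option values, while cuda_mgr, module, and ptx_image are assumed names:
//
//   char error_log[1024];
//   CUjit_option options[] = {CU_JIT_ERROR_LOG_BUFFER,
//                             CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES};
//   void* option_vals[] = {error_log,
//                          reinterpret_cast<void*>(sizeof(error_log))};
//   cuda_mgr.loadGpuModuleData(&module, ptx_image, 2, options, option_vals, 0);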

void CudaMgr::unloadGpuModuleData(CUmodule* module, const int device_id) const {
  std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);
  CHECK(module);

  setContext(device_id);
  try {
    auto code = cuModuleUnload(*module);
    // If the CUDA driver has already shut down, ignore the resulting errors.
    if (code != CUDA_ERROR_DEINITIALIZED) {
      checkError(code);
    }
  } catch (const std::runtime_error& e) {
    LOG(ERROR) << "CUDA Error: " << e.what();
  }
}

void CudaMgr::fillDeviceProperties() {
  device_properties_.resize(device_count_);
  cuDriverGetVersion(&gpu_driver_version_);
  for (int device_num = 0; device_num < device_count_; ++device_num) {
    checkError(
        cuDeviceGet(&device_properties_[device_num].device, device_num + start_gpu_));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].computeMajor,
                                    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].computeMinor,
                                    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                                    device_properties_[device_num].device));
    checkError(cuDeviceTotalMem(&device_properties_[device_num].globalMem,
                                device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].constantMem,
                                    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
                                    device_properties_[device_num].device));
    checkError(
        cuDeviceGetAttribute(&device_properties_[device_num].sharedMemPerMP,
                             CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
                             device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].sharedMemPerBlock,
                                    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].numMPs,
                                    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].warpSize,
                                    CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxThreadsPerBlock,
                                    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxRegistersPerBlock,
                                    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxRegistersPerMP,
                                    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].pciBusId,
                                    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].pciDeviceId,
                                    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].clockKhz,
                                    CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].memoryClockKhz,
                                    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].memoryBusWidth,
                                    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
                                    device_properties_[device_num].device));
    device_properties_[device_num].memoryBandwidthGBs =
        device_properties_[device_num].memoryClockKhz / 1000000.0 / 8.0 *
        device_properties_[device_num].memoryBusWidth;
  }
  max_shared_memory_for_all_ = computeMaxSharedMemoryForAll();
}
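
// For reference, the bandwidth formula above works out as follows: a hypothetical
// device reporting memoryClockKhz = 5,005,000 on a 256-bit bus yields
// 5,005,000 / 1,000,000 / 8 * 256 ≈ 160 GB/s. The formula applies no
// double-data-rate factor, so for DDR memories the vendor's quoted peak
// bandwidth is typically twice this figure.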

int8_t* CudaMgr::allocatePinnedHostMem(const size_t num_bytes) {
  setContext(0);
  void* host_ptr;
  // CU_MEMHOSTALLOC_PORTABLE makes the pinned allocation usable from all CUDA
  // contexts, not just the one that performed the allocation.
  checkError(cuMemHostAlloc(&host_ptr, num_bytes, CU_MEMHOSTALLOC_PORTABLE));
  return reinterpret_cast<int8_t*>(host_ptr);
}

int8_t* CudaMgr::allocateDeviceMem(const size_t num_bytes, const int device_num) {
  setContext(device_num);
  CUdeviceptr device_ptr;
  checkError(cuMemAlloc(&device_ptr, num_bytes));
  return reinterpret_cast<int8_t*>(device_ptr);
}

void CudaMgr::freePinnedHostMem(int8_t* host_ptr) {
  checkError(cuMemFreeHost(reinterpret_cast<void*>(host_ptr)));
}

void CudaMgr::freeDeviceMem(int8_t* device_ptr) {
  std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);

  checkError(cuMemFree(reinterpret_cast<CUdeviceptr>(device_ptr)));
}

void CudaMgr::zeroDeviceMem(int8_t* device_ptr,
                            const size_t num_bytes,
                            const int device_num) {
  setDeviceMem(device_ptr, 0, num_bytes, device_num);
}

void CudaMgr::setDeviceMem(int8_t* device_ptr,
                           const unsigned char uc,
                           const size_t num_bytes,
                           const int device_num) {
  setContext(device_num);
  checkError(cuMemsetD8(reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes));
}

/**
 * Returns true if all devices are Maxwell (compute capability 5.x) or later;
 * returns false if any device has a compute capability major version below 5.
 */
bool CudaMgr::isArchMaxwellOrLaterForAll() const {
  for (int i = 0; i < device_count_; i++) {
    if (device_properties_[i].computeMajor < 5) {
      return false;
    }
  }
  return true;
}

/**
 * Returns true if all devices have compute capability major version 7 (Volta,
 * which also admits Turing at 7.5); returns false if any device differs.
 */
bool CudaMgr::isArchVoltaForAll() const {
  for (int i = 0; i < device_count_; i++) {
    if (device_properties_[i].computeMajor != 7) {
      return false;
    }
  }
  return true;
}

/**
 * Returns the per-block shared memory size that is available on every device,
 * i.e. the minimum of sharedMemPerBlock over all detected devices.
 */
size_t CudaMgr::computeMaxSharedMemoryForAll() const {
  int shared_mem_size =
      device_count_ > 0 ? device_properties_.front().sharedMemPerBlock : 0;
  for (int d = 1; d < device_count_; d++) {
    shared_mem_size = std::min(shared_mem_size, device_properties_[d].sharedMemPerBlock);
  }
  return shared_mem_size;
}

void CudaMgr::createDeviceContexts() {
  CHECK_EQ(device_contexts_.size(), size_t(0));
  device_contexts_.resize(device_count_);
  for (int d = 0; d < device_count_; ++d) {
    CUresult status = cuCtxCreate(&device_contexts_[d], 0, device_properties_[d].device);
    if (status != CUDA_SUCCESS) {
      // This is called from the constructor, so the destructor will not run;
      // destroy all contexts that were successfully created up to this point.
      for (int destroy_id = 0; destroy_id < d; ++destroy_id) {
        try {
          checkError(cuCtxDestroy(device_contexts_[destroy_id]));
        } catch (const CudaErrorException& e) {
          LOG(ERROR) << "Error destroying context after failed creation for device "
                     << destroy_id;
        }
      }
      // checkError will translate the message and throw
      checkError(status);
    }
  }
}

void CudaMgr::setContext(const int device_num) const {
  // device_num is the device number relative to start_gpu_ (real_device_num - start_gpu_)
  CHECK_LT(device_num, device_count_);
  cuCtxSetCurrent(device_contexts_[device_num]);
}

void CudaMgr::printDeviceProperties() const {
  LOG(INFO) << "Using " << device_count_ << " GPUs.";
  for (int d = 0; d < device_count_; ++d) {
    VLOG(1) << "Device: " << device_properties_[d].device;
    VLOG(1) << "Clock (kHz): " << device_properties_[d].clockKhz;
    VLOG(1) << "Compute Major: " << device_properties_[d].computeMajor;
    VLOG(1) << "Compute Minor: " << device_properties_[d].computeMinor;
    VLOG(1) << "PCI bus id: " << device_properties_[d].pciBusId;
    VLOG(1) << "PCI device id: " << device_properties_[d].pciDeviceId;
    VLOG(1) << "Total Global memory: " << device_properties_[d].globalMem / 1073741824.0
            << " GB";
    VLOG(1) << "Memory clock (kHz): " << device_properties_[d].memoryClockKhz;
    VLOG(1) << "Memory bandwidth: " << device_properties_[d].memoryBandwidthGBs
            << " GB/sec";
    VLOG(1) << "Constant Memory: " << device_properties_[d].constantMem;
    VLOG(1) << "Shared memory per multiprocessor: "
            << device_properties_[d].sharedMemPerMP;
    VLOG(1) << "Shared memory per block: " << device_properties_[d].sharedMemPerBlock;
    VLOG(1) << "Number of MPs: " << device_properties_[d].numMPs;
    VLOG(1) << "Warp Size: " << device_properties_[d].warpSize;
    VLOG(1) << "Max threads per block: " << device_properties_[d].maxThreadsPerBlock;
    VLOG(1) << "Max registers per block: " << device_properties_[d].maxRegistersPerBlock;
    VLOG(1) << "Max registers per MP: " << device_properties_[d].maxRegistersPerMP;
    VLOG(1) << "Memory bus width in bits: " << device_properties_[d].memoryBusWidth;
  }
}

void CudaMgr::checkError(CUresult status) const {
  if (status != CUDA_SUCCESS) {
    throw CudaErrorException(status);
  }
}

}  // namespace CudaMgr_Namespace
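
For orientation, here is a minimal, hypothetical usage sketch of this class (not part of the original file). It assumes the declarations from CudaMgr.h shown above and a machine with at least one CUDA device:

#include "CudaMgr.h"

#include <vector>

int main() {
  // Use every GPU found, starting at device 0.
  CudaMgr_Namespace::CudaMgr cuda_mgr(/*num_gpus=*/0, /*start_gpu=*/0);

  const size_t num_bytes = 1024;
  std::vector<int8_t> host_buf(num_bytes, 42);

  // Round-trip a buffer through device 0.
  int8_t* dev_ptr = cuda_mgr.allocateDeviceMem(num_bytes, /*device_num=*/0);
  cuda_mgr.copyHostToDevice(dev_ptr, host_buf.data(), num_bytes, /*device_num=*/0);
  cuda_mgr.zeroDeviceMem(dev_ptr, num_bytes, /*device_num=*/0);
  cuda_mgr.copyDeviceToHost(host_buf.data(), dev_ptr, num_bytes, /*device_num=*/0);
  cuda_mgr.freeDeviceMem(dev_ptr);  // host_buf now holds zeros

  return 0;
}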