CudaMgr.cpp
/*
 * Copyright 2018 OmniSci, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "CudaMgr/CudaMgr.h"

#include <algorithm>
#include <cassert>
#include <iostream>
#include <stdexcept>

#include "Shared/Logger.h"

namespace CudaMgr_Namespace {

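// Renders a CUDA driver API status code as a human-readable message, falling
// back to the numeric code when cuGetErrorString cannot resolve it.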
std::string errorMessage(CUresult const status) {
  const char* errorString{nullptr};
  cuGetErrorString(status, &errorString);
  return errorString ? "CUDA Error: " + std::string(errorString)
                     : "CUDA Driver API error code " + std::to_string(status);
}

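// Initializes the CUDA driver, discovers the available devices, and builds the
// per-device state (properties, device group, contexts). num_gpus <= 0 means
// "use every GPU found"; start_gpu offsets the first physical device managed.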
CudaMgr::CudaMgr(const int num_gpus, const int start_gpu)
    : start_gpu_(start_gpu), max_shared_memory_for_all_(0) {
  checkError(cuInit(0));
  checkError(cuDeviceGetCount(&device_count_));

  if (num_gpus > 0) {  // num_gpus <= 0 will just use the number of GPUs found
    CHECK_LE(num_gpus + start_gpu_, device_count_);
    device_count_ = std::min(device_count_, num_gpus);
  } else {
    // if we are using all GPUs we cannot start on a GPU other than 0
    CHECK_EQ(start_gpu_, 0);
  }
  fillDeviceProperties();
  initDeviceGroup();
  createDeviceContexts();
  printDeviceProperties();
}

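// Records each managed device's logical ID, physical ID, and UUID in device_group_.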
void CudaMgr::initDeviceGroup() {
  for (int device_id = 0; device_id < device_count_; device_id++) {
    device_group_.push_back(
        {device_id, device_id + start_gpu_, device_properties_[device_id].uuid});
  }
}
59 
61  try {
62  // We don't want to remove the cudaMgr before all other processes have cleaned up.
63  // This should be enforced by the lifetime policies, but take this lock to be safe.
64  std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);
65 
67  for (int d = 0; d < device_count_; ++d) {
68  checkError(cuCtxDestroy(device_contexts_[d]));
69  }
70  } catch (const CudaErrorException& e) {
71  if (e.getStatus() == CUDA_ERROR_DEINITIALIZED) {
72  // TODO(adb / asuhan): Verify cuModuleUnload removes the context
73  return;
74  }
75  LOG(ERROR) << "CUDA Error: " << e.what();
76  } catch (const std::runtime_error& e) {
77  LOG(ERROR) << "CUDA Error: " << e.what();
78  }
79 }

void CudaMgr::synchronizeDevices() const {
  for (int d = 0; d < device_count_; ++d) {
    setContext(d);
    checkError(cuCtxSynchronize());
  }
}

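// Synchronous host-to-device copy; device_num is relative to start_gpu_
// (real_device_num - start_gpu_).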
void CudaMgr::copyHostToDevice(int8_t* device_ptr,
                               const int8_t* host_ptr,
                               const size_t num_bytes,
                               const int device_num) {
  setContext(device_num);
  checkError(
      cuMemcpyHtoD(reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes));
}

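// Synchronous device-to-host copy; device_num is relative to start_gpu_.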
void CudaMgr::copyDeviceToHost(int8_t* host_ptr,
                               const int8_t* device_ptr,
                               const size_t num_bytes,
                               const int device_num) {
  setContext(device_num);
  checkError(
      cuMemcpyDtoH(host_ptr, reinterpret_cast<const CUdeviceptr>(device_ptr), num_bytes));
}

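// Copies between two managed devices; same-device copies use cuMemcpy, while
// cross-device copies go through cuMemcpyPeer with the two device contexts.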
void CudaMgr::copyDeviceToDevice(int8_t* dest_ptr,
                                 int8_t* src_ptr,
                                 const size_t num_bytes,
                                 const int dest_device_num,
                                 const int src_device_num) {
  // dest_device_num and src_device_num are the device numbers relative to start_gpu_
  // (real_device_num - start_gpu_)
  if (src_device_num == dest_device_num) {
    setContext(src_device_num);
    checkError(cuMemcpy(reinterpret_cast<CUdeviceptr>(dest_ptr),
                        reinterpret_cast<CUdeviceptr>(src_ptr),
                        num_bytes));
  } else {
    checkError(cuMemcpyPeer(reinterpret_cast<CUdeviceptr>(dest_ptr),
                            device_contexts_[dest_device_num],
                            reinterpret_cast<CUdeviceptr>(src_ptr),
                            device_contexts_[src_device_num],
                            num_bytes));  // will we always have peer?
  }
}

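// Loads a compiled GPU module image (e.g. PTX or cubin) into the given device's
// context via cuModuleLoadDataEx, forwarding any JIT options supplied.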
void CudaMgr::loadGpuModuleData(CUmodule* module,
                                const void* image,
                                unsigned int num_options,
                                CUjit_option* options,
                                void** option_vals,
                                const int device_id) const {
  setContext(device_id);
  checkError(cuModuleLoadDataEx(module, image, num_options, options, option_vals));
}

void CudaMgr::unloadGpuModuleData(CUmodule* module, const int device_id) const {
  std::lock_guard<std::mutex> gpuLock(device_cleanup_mutex_);
  CHECK(module);

  setContext(device_id);
  try {
    auto code = cuModuleUnload(*module);
    // If the Cuda driver has already shut down, ignore the resulting errors.
    if (code != CUDA_ERROR_DEINITIALIZED) {
      checkError(code);
    }
  } catch (const std::runtime_error& e) {
    LOG(ERROR) << "CUDA Error: " << e.what();
  }
}

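// Queries the driver version and, for each managed device, its UUID, compute
// capability, memory sizes, and other attributes via cuDeviceGetAttribute.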
void CudaMgr::fillDeviceProperties() {
  device_properties_.resize(device_count_);
  cuDriverGetVersion(&gpu_driver_version_);
  for (int device_num = 0; device_num < device_count_; ++device_num) {
    checkError(
        cuDeviceGet(&device_properties_[device_num].device, device_num + start_gpu_));
    CUuuid cuda_uuid;
    checkError(cuDeviceGetUuid(&cuda_uuid, device_properties_[device_num].device));
    device_properties_[device_num].uuid = omnisci::UUID(cuda_uuid.bytes);
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].computeMajor,
                                    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].computeMinor,
                                    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                                    device_properties_[device_num].device));
    checkError(cuDeviceTotalMem(&device_properties_[device_num].globalMem,
                                device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].constantMem,
                                    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
                                    device_properties_[device_num].device));
    checkError(
        cuDeviceGetAttribute(&device_properties_[device_num].sharedMemPerMP,
                             CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
                             device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].sharedMemPerBlock,
                                    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].numMPs,
                                    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].warpSize,
                                    CU_DEVICE_ATTRIBUTE_WARP_SIZE,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxThreadsPerBlock,
                                    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxRegistersPerBlock,
                                    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxRegistersPerMP,
                                    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].pciBusId,
                                    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].pciDeviceId,
                                    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].clockKhz,
                                    CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].memoryClockKhz,
                                    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
                                    device_properties_[device_num].device));
    checkError(cuDeviceGetAttribute(&device_properties_[device_num].memoryBusWidth,
                                    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
                                    device_properties_[device_num].device));
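    // Theoretical bandwidth in GB/s: (memory clock in kHz) / 1e6 gives billions
    // of bus transfers per second; multiplying by the bus width in bits and
    // dividing by 8 bits per byte yields bytes. E.g., a hypothetical device
    // reporting a 7,000,000 kHz memory clock on a 384-bit bus gives
    // 7 / 8 * 384 = 336 GB/s.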
    device_properties_[device_num].memoryBandwidthGBs =
        device_properties_[device_num].memoryClockKhz / 1000000.0 / 8.0 *
        device_properties_[device_num].memoryBusWidth;
  }
  max_shared_memory_for_all_ = computeMaxSharedMemoryForAll();
}

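// Allocates page-locked (pinned) host memory; CU_MEMHOSTALLOC_PORTABLE makes
// the allocation usable from all CUDA contexts.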
int8_t* CudaMgr::allocatePinnedHostMem(const size_t num_bytes) {
  setContext(0);
  void* host_ptr;
  checkError(cuMemHostAlloc(&host_ptr, num_bytes, CU_MEMHOSTALLOC_PORTABLE));
  return reinterpret_cast<int8_t*>(host_ptr);
}

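// Allocates num_bytes of global memory on the given device, returning it as a
// raw int8_t pointer (the CUdeviceptr reinterpret-cast used throughout this file).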
int8_t* CudaMgr::allocateDeviceMem(const size_t num_bytes, const int device_num) {
  setContext(device_num);
  CUdeviceptr device_ptr;
  checkError(cuMemAlloc(&device_ptr, num_bytes));
  return reinterpret_cast<int8_t*>(device_ptr);
}

void CudaMgr::freePinnedHostMem(int8_t* host_ptr) {
  checkError(cuMemFreeHost(reinterpret_cast<void*>(host_ptr)));
}

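// Takes the cleanup mutex so the free cannot race with device teardown.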
void CudaMgr::freeDeviceMem(int8_t* device_ptr) {
  std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);

  checkError(cuMemFree(reinterpret_cast<CUdeviceptr>(device_ptr)));
}

void CudaMgr::zeroDeviceMem(int8_t* device_ptr,
                            const size_t num_bytes,
                            const int device_num) {
  setDeviceMem(device_ptr, 0, num_bytes, device_num);
}

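// Fills num_bytes of device memory with the byte value uc via cuMemsetD8.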
void CudaMgr::setDeviceMem(int8_t* device_ptr,
                           const unsigned char uc,
                           const size_t num_bytes,
                           const int device_num) {
  setContext(device_num);
  checkError(cuMemsetD8(reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes));
}

/**
 * Returns true if all devices have the Maxwell micro-architecture, or later.
 * Returns false if any device reports a compute capability major version < 5.
 */
bool CudaMgr::isArchMaxwellOrLaterForAll() const {
  for (int i = 0; i < device_count_; i++) {
    if (device_properties_[i].computeMajor < 5) {
      return false;
    }
  }
  return true;
}

/**
 * Returns true if all devices report compute capability major version 7
 * (the Volta micro-architecture). Returns false otherwise.
 */
bool CudaMgr::isArchVoltaForAll() const {
  for (int i = 0; i < device_count_; i++) {
    if (device_properties_[i].computeMajor != 7) {
      return false;
    }
  }
  return true;
}

/**
 * Returns the maximum shared memory per block supported by every managed
 * device, i.e. the minimum of the devices' sharedMemPerBlock values
 * (0 when no devices are present).
 */
size_t CudaMgr::computeMaxSharedMemoryForAll() const {
  int shared_mem_size =
      device_count_ > 0 ? device_properties_.front().sharedMemPerBlock : 0;
  for (int d = 1; d < device_count_; d++) {
    shared_mem_size = std::min(shared_mem_size, device_properties_[d].sharedMemPerBlock);
  }
  return shared_mem_size;
}

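// Creates one CUDA context per managed device. If any creation fails, every
// context created so far is destroyed before the original error is rethrown.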
void CudaMgr::createDeviceContexts() {
  CHECK_EQ(device_contexts_.size(), size_t(0));
  device_contexts_.resize(device_count_);
  for (int d = 0; d < device_count_; ++d) {
    CUresult status = cuCtxCreate(&device_contexts_[d], 0, device_properties_[d].device);
    if (status != CUDA_SUCCESS) {
      // This runs during construction, so the destructor will not fire on
      // failure; destroy all contexts created up to this point before rethrowing.
      for (int destroy_id = 0; destroy_id <= d; ++destroy_id) {
        try {
          checkError(cuCtxDestroy(device_contexts_[destroy_id]));
        } catch (const CudaErrorException& e) {
          LOG(ERROR) << "Failed to destroy CUDA context for device ID " << destroy_id
                     << " with " << e.what()
                     << ". CUDA contexts were being destroyed due to an error creating "
                        "CUDA context for device ID "
                     << d << " out of " << device_count_ << " (" << errorMessage(status)
                     << ").";
        }
      }
      // checkError will translate the message and throw
      checkError(status);
    }
  }
}

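// Makes the given device's context current on the calling thread.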
void CudaMgr::setContext(const int device_num) const {
  // device_num is the device number relative to start_gpu_ (real_device_num - start_gpu_)
  CHECK_LT(device_num, device_count_);
  cuCtxSetCurrent(device_contexts_[device_num]);
}

void CudaMgr::printDeviceProperties() const {
  LOG(INFO) << "Using " << device_count_ << " GPUs.";
  for (int d = 0; d < device_count_; ++d) {
    VLOG(1) << "Device: " << device_properties_[d].device;
    VLOG(1) << "UUID: " << device_properties_[d].uuid;
    VLOG(1) << "Clock (khz): " << device_properties_[d].clockKhz;
    VLOG(1) << "Compute Major: " << device_properties_[d].computeMajor;
    VLOG(1) << "Compute Minor: " << device_properties_[d].computeMinor;
    VLOG(1) << "PCI bus id: " << device_properties_[d].pciBusId;
    VLOG(1) << "PCI device id: " << device_properties_[d].pciDeviceId;
    VLOG(1) << "Per device global memory: "
            << device_properties_[d].globalMem / 1073741824.0 << " GB";
    VLOG(1) << "Memory clock (khz): " << device_properties_[d].memoryClockKhz;
    VLOG(1) << "Memory bandwidth: " << device_properties_[d].memoryBandwidthGBs
            << " GB/sec";

    VLOG(1) << "Constant Memory: " << device_properties_[d].constantMem;
    VLOG(1) << "Shared memory per multiprocessor: "
            << device_properties_[d].sharedMemPerMP;
    VLOG(1) << "Shared memory per block: " << device_properties_[d].sharedMemPerBlock;
    VLOG(1) << "Number of MPs: " << device_properties_[d].numMPs;
    VLOG(1) << "Warp Size: " << device_properties_[d].warpSize;
    VLOG(1) << "Max threads per block: " << device_properties_[d].maxThreadsPerBlock;
    VLOG(1) << "Max registers per block: " << device_properties_[d].maxRegistersPerBlock;
    VLOG(1) << "Max registers per MP: " << device_properties_[d].maxRegistersPerMP;
    VLOG(1) << "Memory bus width in bits: " << device_properties_[d].memoryBusWidth;
  }
}

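// Converts a failing CUresult into a CudaErrorException; no-op on CUDA_SUCCESS.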
void CudaMgr::checkError(CUresult status) const {
  if (status != CUDA_SUCCESS) {
    throw CudaErrorException(status);
  }
}

}  // namespace CudaMgr_Namespace
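
// ---------------------------------------------------------------------------
// Illustrative usage (not part of the original file): a minimal sketch of how
// a caller might exercise the memory API above, assuming the declarations in
// CudaMgr/CudaMgr.h and a CUDA-capable machine. The buffer size and device
// number are arbitrary example values.
//
//   #include "CudaMgr/CudaMgr.h"
//   #include <vector>
//
//   using CudaMgr_Namespace::CudaMgr;
//
//   CudaMgr cuda_mgr(0);  // num_gpus <= 0: manage every GPU found, start_gpu = 0
//   std::vector<int8_t> host_buf(1024, 42);
//   int8_t* dev_buf = cuda_mgr.allocateDeviceMem(host_buf.size(), /*device_num=*/0);
//   cuda_mgr.copyHostToDevice(dev_buf, host_buf.data(), host_buf.size(), 0);
//   cuda_mgr.zeroDeviceMem(dev_buf, host_buf.size(), 0);
//   cuda_mgr.copyDeviceToHost(host_buf.data(), dev_buf, host_buf.size(), 0);
//   cuda_mgr.freeDeviceMem(dev_buf);
// ---------------------------------------------------------------------------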