OmniSciDB  1dac507f6e
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
CudaMgr.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2018 OmniSci, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "CudaMgr/CudaMgr.h"
18 
19 #include <algorithm>
20 #include <cassert>
21 #include <iostream>
22 #include <stdexcept>
23 
24 #include "Shared/Logger.h"
25 
26 namespace CudaMgr_Namespace {
27 
28 CudaMgr::CudaMgr(const int num_gpus, const int start_gpu)
29  : start_gpu_(start_gpu), max_shared_memory_for_all_(0) {
30  checkError(cuInit(0));
31  checkError(cuDeviceGetCount(&device_count_));
32 
33  if (num_gpus > 0) { // numGpus <= 0 will just use number of gpus found
34  CHECK_LE(num_gpus + start_gpu_, device_count_);
35  device_count_ = std::min(device_count_, num_gpus);
36  } else {
37  // if we are using all gpus we cannot start on a gpu other than 0
38  CHECK_EQ(start_gpu_, 0);
39  }
40  fillDeviceProperties();
41  initDeviceGroup();
42  createDeviceContexts();
43  printDeviceProperties();
44 }
45 
46 void CudaMgr::initDeviceGroup() {
47  for (int device_id = 0; device_id < device_count_; device_id++) {
48  device_group_.push_back(
49  {device_id, device_id + start_gpu_, device_properties_[device_id].uuid});
50  }
51 }
52 
// NOTE(review): the enclosing function signature (internal source line 53) is
// missing from this view. From the cleanup of device_contexts_ this appears to
// be the CudaMgr destructor body -- confirm against CudaMgr.h.
 54  try {
 55  // We don't want to remove the cudaMgr before all other processes have cleaned up.
 56  // This should be enforced by the lifetime policies, but take this lock to be safe.
 57  std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);
 58 
// Tear down every per-device context created in createDeviceContexts().
// (Internal source line 59 is also missing from this view.)
 60  for (int d = 0; d < device_count_; ++d) {
 61  checkError(cuCtxDestroy(device_contexts_[d]));
 62  }
 63  } catch (const CudaErrorException& e) {
// If the driver has already been deinitialized there is nothing to destroy;
// that status is deliberately swallowed.
 64  if (e.getStatus() == CUDA_ERROR_DEINITIALIZED) {
 65  // TODO(adb / asuhan): Verify cuModuleUnload removes the context
 66  return;
 67  }
 68  LOG(ERROR) << "CUDA Error: " << e.what();
 69  } catch (const std::runtime_error& e) {
// Never let an exception escape (presumably destructor context) -- log only.
 70  LOG(ERROR) << "CUDA Error: " << e.what();
 71  }
 72 }
73 
// NOTE(review): the function signature (internal source line 74) is missing
// from this view; the file's cross-reference lists it as
// void CudaMgr::synchronizeDevices() const.
// Blocks until all previously queued work on every managed device completes,
// by making each context current and synchronizing it in turn.
 75  for (int d = 0; d < device_count_; ++d) {
 76  setContext(d);
 77  checkError(cuCtxSynchronize());
 78  }
 79 }
80 
81 void CudaMgr::copyHostToDevice(int8_t* device_ptr,
82  const int8_t* host_ptr,
83  const size_t num_bytes,
84  const int device_num) {
85  setContext(device_num);
86  checkError(
87  cuMemcpyHtoD(reinterpret_cast<CUdeviceptr>(device_ptr), host_ptr, num_bytes));
88 }
89 
90 void CudaMgr::copyDeviceToHost(int8_t* host_ptr,
91  const int8_t* device_ptr,
92  const size_t num_bytes,
93  const int device_num) {
94  setContext(device_num);
95  checkError(
96  cuMemcpyDtoH(host_ptr, reinterpret_cast<const CUdeviceptr>(device_ptr), num_bytes));
97 }
98 
99 void CudaMgr::copyDeviceToDevice(int8_t* dest_ptr,
100  int8_t* src_ptr,
101  const size_t num_bytes,
102  const int dest_device_num,
103  const int src_device_num) {
104  // dest_device_num and src_device_num are the device numbers relative to start_gpu_
105  // (real_device_num - start_gpu_)
106  if (src_device_num == dest_device_num) {
107  setContext(src_device_num);
108  checkError(cuMemcpy(reinterpret_cast<CUdeviceptr>(dest_ptr),
109  reinterpret_cast<CUdeviceptr>(src_ptr),
110  num_bytes));
111  } else {
112  checkError(cuMemcpyPeer(reinterpret_cast<CUdeviceptr>(dest_ptr),
113  device_contexts_[dest_device_num],
114  reinterpret_cast<CUdeviceptr>(src_ptr),
115  device_contexts_[src_device_num],
116  num_bytes)); // will we always have peer?
117  }
118 }
119 
120 void CudaMgr::loadGpuModuleData(CUmodule* module,
121  const void* image,
122  unsigned int num_options,
123  CUjit_option* options,
124  void** option_vals,
125  const int device_id) const {
126  setContext(device_id);
127  checkError(cuModuleLoadDataEx(module, image, num_options, options, option_vals));
128 }
129 
130 void CudaMgr::unloadGpuModuleData(CUmodule* module, const int device_id) const {
131  std::lock_guard<std::mutex> gpuLock(device_cleanup_mutex_);
132  CHECK(module);
133 
134  setContext(device_id);
135  try {
136  auto code = cuModuleUnload(*module);
137  // If the Cuda driver has already shut down, ignore the resulting errors.
138  if (code != CUDA_ERROR_DEINITIALIZED) {
139  checkError(code);
140  }
141  } catch (const std::runtime_error& e) {
142  LOG(ERROR) << "CUDA Error: " << e.what();
143  }
144 }
145 
146 void CudaMgr::fillDeviceProperties() {
147  device_properties_.resize(device_count_);
148  cuDriverGetVersion(&gpu_driver_version_);
149  for (int device_num = 0; device_num < device_count_; ++device_num) {
150  checkError(
151  cuDeviceGet(&device_properties_[device_num].device, device_num + start_gpu_));
152  CUuuid cuda_uuid;
153  checkError(cuDeviceGetUuid(&cuda_uuid, device_properties_[device_num].device));
154  device_properties_[device_num].uuid = omnisci::UUID(cuda_uuid.bytes);
155  checkError(cuDeviceGetAttribute(&device_properties_[device_num].computeMajor,
156  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
157  device_properties_[device_num].device));
158  checkError(cuDeviceGetAttribute(&device_properties_[device_num].computeMinor,
159  CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
160  device_properties_[device_num].device));
161  checkError(cuDeviceTotalMem(&device_properties_[device_num].globalMem,
162  device_properties_[device_num].device));
163  checkError(cuDeviceGetAttribute(&device_properties_[device_num].constantMem,
164  CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY,
165  device_properties_[device_num].device));
166  checkError(
167  cuDeviceGetAttribute(&device_properties_[device_num].sharedMemPerMP,
168  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
169  device_properties_[device_num].device));
170  checkError(cuDeviceGetAttribute(&device_properties_[device_num].sharedMemPerBlock,
171  CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
172  device_properties_[device_num].device));
173  checkError(cuDeviceGetAttribute(&device_properties_[device_num].numMPs,
174  CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
175  device_properties_[device_num].device));
176  checkError(cuDeviceGetAttribute(&device_properties_[device_num].warpSize,
177  CU_DEVICE_ATTRIBUTE_WARP_SIZE,
178  device_properties_[device_num].device));
179  checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxThreadsPerBlock,
180  CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
181  device_properties_[device_num].device));
182  checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxRegistersPerBlock,
183  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
184  device_properties_[device_num].device));
185  checkError(cuDeviceGetAttribute(&device_properties_[device_num].maxRegistersPerMP,
186  CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
187  device_properties_[device_num].device));
188  checkError(cuDeviceGetAttribute(&device_properties_[device_num].pciBusId,
189  CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,
190  device_properties_[device_num].device));
191  checkError(cuDeviceGetAttribute(&device_properties_[device_num].pciDeviceId,
192  CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
193  device_properties_[device_num].device));
194  checkError(cuDeviceGetAttribute(&device_properties_[device_num].clockKhz,
195  CU_DEVICE_ATTRIBUTE_CLOCK_RATE,
196  device_properties_[device_num].device));
197  checkError(cuDeviceGetAttribute(&device_properties_[device_num].memoryClockKhz,
198  CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
199  device_properties_[device_num].device));
200  checkError(cuDeviceGetAttribute(&device_properties_[device_num].memoryBusWidth,
201  CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
202  device_properties_[device_num].device));
203  device_properties_[device_num].memoryBandwidthGBs =
204  device_properties_[device_num].memoryClockKhz / 1000000.0 / 8.0 *
205  device_properties_[device_num].memoryBusWidth;
206  }
207  max_shared_memory_for_all_ = computeMaxSharedMemoryForAll();
208 }
209 
210 int8_t* CudaMgr::allocatePinnedHostMem(const size_t num_bytes) {
211  setContext(0);
212  void* host_ptr;
213  checkError(cuMemHostAlloc(&host_ptr, num_bytes, CU_MEMHOSTALLOC_PORTABLE));
214  return reinterpret_cast<int8_t*>(host_ptr);
215 }
216 
217 int8_t* CudaMgr::allocateDeviceMem(const size_t num_bytes, const int device_num) {
218  setContext(device_num);
219  CUdeviceptr device_ptr;
220  checkError(cuMemAlloc(&device_ptr, num_bytes));
221  return reinterpret_cast<int8_t*>(device_ptr);
222 }
223 
224 void CudaMgr::freePinnedHostMem(int8_t* host_ptr) {
225  checkError(cuMemFreeHost(reinterpret_cast<void*>(host_ptr)));
226 }
227 
228 void CudaMgr::freeDeviceMem(int8_t* device_ptr) {
229  std::lock_guard<std::mutex> gpu_lock(device_cleanup_mutex_);
230 
231  checkError(cuMemFree(reinterpret_cast<CUdeviceptr>(device_ptr)));
232 }
233 
234 void CudaMgr::zeroDeviceMem(int8_t* device_ptr,
235  const size_t num_bytes,
236  const int device_num) {
237  setDeviceMem(device_ptr, 0, num_bytes, device_num);
238 }
239 
240 void CudaMgr::setDeviceMem(int8_t* device_ptr,
241  const unsigned char uc,
242  const size_t num_bytes,
243  const int device_num) {
244  setContext(device_num);
245  checkError(cuMemsetD8(reinterpret_cast<CUdeviceptr>(device_ptr), uc, num_bytes));
246 }
247 
// NOTE(review): the function signature (internal source lines 248-252) is
// missing from this view; the file's cross-reference lists it as
// bool CudaMgr::isArchMaxwellOrLaterForAll() const.
// Returns true only when every managed device reports compute capability
// major version >= 5 (Maxwell introduced compute major 5).
 253  for (int i = 0; i < device_count_; i++) {
 254  if (device_properties_[i].computeMajor < 5) {
 255  return false;
 256  }
 257  }
 258  return true;
 259 }
260 
// NOTE(review): the function signature (internal source lines 260-265) is
// missing from this view; the file's cross-reference lists it as
// bool CudaMgr::isArchVoltaForAll() const.
// Returns true only when every managed device reports compute capability
// major version exactly 7. NOTE(review): Turing also reports compute major 7
// (7.5) -- confirm whether "Volta" here intentionally admits Turing parts.
 266  for (int i = 0; i < device_count_; i++) {
 267  if (device_properties_[i].computeMajor != 7) {
 268  return false;
 269  }
 270  }
 271  return true;
 272 }
273 
279 size_t CudaMgr::computeMaxSharedMemoryForAll() const {
280  int shared_mem_size =
281  device_count_ > 0 ? device_properties_.front().sharedMemPerBlock : 0;
282  for (int d = 1; d < device_count_; d++) {
283  shared_mem_size = std::min(shared_mem_size, device_properties_[d].sharedMemPerBlock);
284  }
285  return shared_mem_size;
286 }
287 
288 void CudaMgr::createDeviceContexts() {
289  CHECK_EQ(device_contexts_.size(), size_t(0));
290  device_contexts_.resize(device_count_);
291  for (int d = 0; d < device_count_; ++d) {
292  CUresult status = cuCtxCreate(&device_contexts_[d], 0, device_properties_[d].device);
293  if (status != CUDA_SUCCESS) {
294  // this is called from destructor so we need
295  // to clean up
296  // destroy all contexts up to this point
297  for (int destroy_id = 0; destroy_id <= d; ++destroy_id) {
298  try {
299  checkError(cuCtxDestroy(device_contexts_[destroy_id]));
300  } catch (const CudaErrorException& e) {
301  LOG(ERROR) << "Error destroying context after failed creation for device "
302  << destroy_id;
303  }
304  }
305  // checkError will translate the message and throw
306  checkError(status);
307  }
308  }
309 }
310 
311 void CudaMgr::setContext(const int device_num) const {
312  // deviceNum is the device number relative to startGpu (realDeviceNum - startGpu_)
313  CHECK_LT(device_num, device_count_);
314  cuCtxSetCurrent(device_contexts_[device_num]);
315 }
316 
317 void CudaMgr::printDeviceProperties() const {
318  LOG(INFO) << "Using " << device_count_ << " Gpus.";
319  for (int d = 0; d < device_count_; ++d) {
320  VLOG(1) << "Device: " << device_properties_[d].device;
321  VLOG(1) << "UUID: " << device_properties_[d].uuid;
322  VLOG(1) << "Clock (khz): " << device_properties_[d].clockKhz;
323  VLOG(1) << "Compute Major: " << device_properties_[d].computeMajor;
324  VLOG(1) << "Compute Minor: " << device_properties_[d].computeMinor;
325  VLOG(1) << "PCI bus id: " << device_properties_[d].pciBusId;
326  VLOG(1) << "PCI deviceId id: " << device_properties_[d].pciDeviceId;
327  VLOG(1) << "Total Global memory: " << device_properties_[d].globalMem / 1073741824.0
328  << " GB";
329  VLOG(1) << "Memory clock (khz): " << device_properties_[d].memoryClockKhz;
330  VLOG(1) << "Memory bandwidth: " << device_properties_[d].memoryBandwidthGBs
331  << " GB/sec";
332 
333  VLOG(1) << "Constant Memory: " << device_properties_[d].constantMem;
334  VLOG(1) << "Shared memory per multiprocessor: "
335  << device_properties_[d].sharedMemPerMP;
336  VLOG(1) << "Shared memory per block: " << device_properties_[d].sharedMemPerBlock;
337  VLOG(1) << "Number of MPs: " << device_properties_[d].numMPs;
338  VLOG(1) << "Warp Size: " << device_properties_[d].warpSize;
339  VLOG(1) << "Max threads per block: " << device_properties_[d].maxThreadsPerBlock;
340  VLOG(1) << "Max registers per block: " << device_properties_[d].maxRegistersPerBlock;
341  VLOG(1) << "Max register per MP: " << device_properties_[d].maxRegistersPerMP;
342  VLOG(1) << "Memory bus width in bits: " << device_properties_[d].memoryBusWidth;
343  }
344 }
345 
346 void CudaMgr::checkError(CUresult status) const {
347  if (status != CUDA_SUCCESS) {
348  throw CudaErrorException(status);
349  }
350 }
351 
352 } // namespace CudaMgr_Namespace
#define CHECK_EQ(x, y)
Definition: Logger.h:198
std::unique_ptr< llvm::Module > module(runtime_module_shallow_copy(cgen_state))
int CUjit_option
Definition: nocuda.h:25
void copyDeviceToHost(int8_t *host_ptr, const int8_t *device_ptr, const size_t num_bytes, const int device_num)
Definition: CudaMgr.cpp:90
std::mutex device_cleanup_mutex_
Definition: CudaMgr.h:185
#define LOG(tag)
Definition: Logger.h:185
int8_t * allocatePinnedHostMem(const size_t num_bytes)
Definition: CudaMgr.cpp:210
unsigned long long CUdeviceptr
Definition: nocuda.h:27
void setContext(const int device_num) const
Definition: CudaMgr.cpp:311
void copyDeviceToDevice(int8_t *dest_ptr, int8_t *src_ptr, const size_t num_bytes, const int dest_device_num, const int src_device_num)
Definition: CudaMgr.cpp:99
std::vector< CUcontext > device_contexts_
Definition: CudaMgr.h:183
void freeDeviceMem(int8_t *device_ptr)
Definition: CudaMgr.cpp:228
omnisci::DeviceGroup device_group_
Definition: CudaMgr.h:182
CHECK(cgen_state)
void setDeviceMem(int8_t *device_ptr, const unsigned char uc, const size_t num_bytes, const int device_num)
Definition: CudaMgr.cpp:240
void copyHostToDevice(int8_t *device_ptr, const int8_t *host_ptr, const size_t num_bytes, const int device_num)
Definition: CudaMgr.cpp:81
#define CHECK_LT(x, y)
Definition: Logger.h:200
bool isArchMaxwellOrLaterForAll() const
Definition: CudaMgr.cpp:252
#define CHECK_LE(x, y)
Definition: Logger.h:201
int CUresult
Definition: nocuda.h:21
std::vector< DeviceProperties > device_properties_
Definition: CudaMgr.h:181
size_t max_shared_memory_for_all_
Definition: CudaMgr.h:180
bool isArchVoltaForAll() const
Definition: CudaMgr.cpp:265
void freePinnedHostMem(int8_t *host_ptr)
Definition: CudaMgr.cpp:224
void synchronizeDevices() const
Definition: CudaMgr.cpp:74
CudaMgr(const int num_gpus, const int start_gpu=0)
Definition: CudaMgr.cpp:28
void zeroDeviceMem(int8_t *device_ptr, const size_t num_bytes, const int device_num)
Definition: CudaMgr.cpp:234
#define VLOG(n)
Definition: Logger.h:280
void * CUmodule
Definition: nocuda.h:23
int8_t * allocateDeviceMem(const size_t num_bytes, const int device_num)
Definition: CudaMgr.cpp:217