OmniSciDB  72c90bc290
DeviceKernel.cpp
/*
 * Copyright 2022 HEAVY.AI, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "DeviceKernel.h"

#include "CompilationContext.h"
#include "NvidiaKernel.h"

#ifdef HAVE_CUDA

class CudaEventClock : public DeviceClock {
 public:
  CudaEventClock() {
    cuEventCreate(&start_, 0);
    cuEventCreate(&stop_, 0);
  }
  void start() override { cuEventRecord(start_, 0); }
  int stop() override {
    cuEventRecord(stop_, 0);
    cuEventSynchronize(stop_);
    float ms = 0;
    cuEventElapsedTime(&ms, start_, stop_);
    return ms;  // elapsed milliseconds, truncated to int per the DeviceClock interface
  }

 private:
  CUevent start_, stop_;  // CUDA events bracketing the timed interval
};

class NvidiaKernel : public DeviceKernel {
 public:
  NvidiaKernel(const CompilationContext* ctx, int device_id) : device_id(device_id) {
    auto cuda_ctx = dynamic_cast<const GpuCompilationContext*>(ctx);
    CHECK(cuda_ctx);
    name_ = cuda_ctx->name(device_id);
    const auto native_code = cuda_ctx->getNativeCode(device_id);
    function_ptr = static_cast<CUfunction>(native_code.first);
    module_ptr = static_cast<CUmodule>(native_code.second);
  }

  void launch(unsigned int gridDimX,
              unsigned int gridDimY,
              unsigned int gridDimZ,
              unsigned int blockDimX,
              unsigned int blockDimY,
              unsigned int blockDimZ,
              unsigned int sharedMemBytes,
              void** kernelParams,
              bool optimize_block_and_grid_sizes) override {
    auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
    if (optimize_block_and_grid_sizes) {
      int recommended_block_size;
      int recommended_grid_size;
      std::ostringstream oss;
      checkCudaErrors(cuOccupancyMaxPotentialBlockSize(&recommended_grid_size,
                                                       &recommended_block_size,
                                                       function_ptr,
                                                       nullptr,
                                                       sharedMemBytes,
                                                       0));
      if (static_cast<unsigned int>(recommended_block_size) != blockDimX) {
        VLOG(1) << "Apply a recommended CUDA block size: " << recommended_block_size
                << " (current: " << blockDimX << ")";
        blockDimX = recommended_block_size;
      }
      if (static_cast<unsigned int>(recommended_grid_size) != gridDimX) {
        VLOG(1) << "Apply a recommended CUDA grid size: " << recommended_grid_size
                << " (current: " << gridDimX << ")";
        gridDimX = recommended_grid_size;
      }
    }
    VLOG(1) << "Launch GPU kernel compiled with the following block and grid sizes: "
            << blockDimX << " and " << gridDimX;
    checkCudaErrors(cuLaunchKernel(function_ptr,
                                   gridDimX,
                                   gridDimY,
                                   gridDimZ,
                                   blockDimX,
                                   blockDimY,
                                   blockDimZ,
                                   sharedMemBytes,
                                   qe_cuda_stream,
                                   kernelParams,
                                   nullptr));
    checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
  }

  void initializeDynamicWatchdog(bool could_interrupt, uint64_t cycle_budget) override {
    CHECK(module_ptr);
    CUevent start, stop;
    cuEventCreate(&start, 0);
    cuEventCreate(&stop, 0);
    cuEventRecord(start, 0);

    CUdeviceptr dw_cycle_budget;
    size_t dw_cycle_budget_size;
    // Translate milliseconds to device cycles
    if (device_id == 0) {
      LOG(INFO) << "Dynamic Watchdog budget: GPU: "
                << std::to_string(g_dynamic_watchdog_time_limit) << "ms, "
                << std::to_string(cycle_budget) << " cycles";
    }
    auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
    checkCudaErrors(cuModuleGetGlobal(
        &dw_cycle_budget, &dw_cycle_budget_size, module_ptr, "dw_cycle_budget"));
    CHECK_EQ(dw_cycle_budget_size, sizeof(uint64_t));
    checkCudaErrors(cuMemcpyHtoDAsync(dw_cycle_budget,
                                      reinterpret_cast<void*>(&cycle_budget),
                                      sizeof(uint64_t),
                                      qe_cuda_stream));
    checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));

    CUdeviceptr dw_sm_cycle_start;
    size_t dw_sm_cycle_start_size;
    checkCudaErrors(cuModuleGetGlobal(
        &dw_sm_cycle_start, &dw_sm_cycle_start_size, module_ptr, "dw_sm_cycle_start"));
    CHECK_EQ(dw_sm_cycle_start_size, 128 * sizeof(uint64_t));
    checkCudaErrors(cuMemsetD32Async(dw_sm_cycle_start, 0, 128 * 2, qe_cuda_stream));
    checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));

    if (!could_interrupt) {
      // Executor is not marked as interrupted, make sure dynamic watchdog doesn't block
      // execution
      CUdeviceptr dw_abort;
      size_t dw_abort_size;
      checkCudaErrors(
          cuModuleGetGlobal(&dw_abort, &dw_abort_size, module_ptr, "dw_abort"));
      CHECK_EQ(dw_abort_size, sizeof(uint32_t));
      checkCudaErrors(cuMemsetD32Async(dw_abort, 0, 1, qe_cuda_stream));
      checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
    }

    cuEventRecord(stop, 0);
    cuEventSynchronize(stop);
    float milliseconds = 0;
    cuEventElapsedTime(&milliseconds, start, stop);
    VLOG(1) << "Device " << std::to_string(device_id)
            << ": launchGpuCode: dynamic watchdog init: " << std::to_string(milliseconds)
            << " ms\n";
  }

  void initializeRuntimeInterrupter(const int device_id) override {
    CHECK(module_ptr);
    CUevent start, stop;
    cuEventCreate(&start, 0);
    cuEventCreate(&stop, 0);
    cuEventRecord(start, 0);

    CUdeviceptr runtime_interrupt_flag;
    size_t runtime_interrupt_flag_size;
    checkCudaErrors(cuModuleGetGlobal(&runtime_interrupt_flag,
                                      &runtime_interrupt_flag_size,
                                      module_ptr,
                                      "runtime_interrupt_flag"));
    CHECK_EQ(runtime_interrupt_flag_size, sizeof(uint32_t));
    auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
    checkCudaErrors(cuMemsetD32Async(runtime_interrupt_flag, 0, 1, qe_cuda_stream));
    checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));

    cuEventRecord(stop, 0);
    cuEventSynchronize(stop);
    float milliseconds = 0;
    cuEventElapsedTime(&milliseconds, start, stop);
    VLOG(1) << "Device " << std::to_string(device_id)
            << ": launchGpuCode: runtime query interrupter init: "
            << std::to_string(milliseconds) << " ms";
    Executor::registerActiveModule(module_ptr, device_id);
  }

  void resetRuntimeInterrupter(const int device_id) override {
    Executor::unregisterActiveModule(device_id);
  }

  std::unique_ptr<DeviceClock> make_clock() override {
    return std::make_unique<CudaEventClock>();
  }

  char const* name() const override { return name_.c_str(); }

 private:
  CUfunction function_ptr;
  CUmodule module_ptr;
  int device_id;
  std::string name_;
};
#endif

std::unique_ptr<DeviceKernel> create_device_kernel(const CompilationContext* ctx,
                                                   int device_id) {
#ifdef HAVE_CUDA
  return std::make_unique<NvidiaKernel>(ctx, device_id);
#else
  return nullptr;
#endif
}
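
A minimal caller-side sketch of the interface implemented above: create_device_kernel produces the CUDA-backed DeviceKernel, make_clock supplies the event-based timer, and launch runs the compiled entry point. The function name run_and_time, the grid/block dimensions, and the param_ptrs vector are illustrative assumptions, not OmniSciDB code.

#include <vector>

#include "DeviceKernel.h"

// Hypothetical usage sketch (placeholder names): launch a previously compiled
// kernel on one device and report the elapsed time measured by DeviceClock.
int run_and_time(const CompilationContext* ctx,
                 int device_id,
                 std::vector<void*>& param_ptrs) {
  auto kernel = create_device_kernel(ctx, device_id);  // NvidiaKernel when HAVE_CUDA
  if (!kernel) {
    return -1;  // CPU-only build: create_device_kernel returns nullptr
  }
  auto clock = kernel->make_clock();  // CudaEventClock
  clock->start();
  kernel->launch(/*gridDimX=*/128,
                 /*gridDimY=*/1,
                 /*gridDimZ=*/1,
                 /*blockDimX=*/256,
                 /*blockDimY=*/1,
                 /*blockDimZ=*/1,
                 /*sharedMemBytes=*/0,
                 param_ptrs.data(),
                 /*optimize_block_and_grid_sizes=*/true);
  return clock->stop();  // elapsed milliseconds; stop() synchronizes on the stop event
}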