OmniSciDB  c1a53651b2
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
DeviceKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "DeviceKernel.h"
18 
19 #include "CompilationContext.h"
20 #include "NvidiaKernel.h"
21 
22 #ifdef HAVE_CUDA
24 
25 class CudaEventClock : public DeviceClock {
26  public:
27  CudaEventClock() {
28  cuEventCreate(&start_, 0);
29  cuEventCreate(&stop_, 0);
30  }
31  virtual void start() override { cuEventRecord(start_, 0); }
32  virtual int stop() override {
33  cuEventRecord(stop_, 0);
34  cuEventSynchronize(stop_);
35  float ms = 0;
36  cuEventElapsedTime(&ms, start_, stop_);
37  return ms;
38  }
39 
40  private:
41  CUevent start_, stop_; // preparation
42 };
43 
44 class NvidiaKernel : public DeviceKernel {
45  public:
46  NvidiaKernel(const CompilationContext* ctx, int device_id) : device_id(device_id) {
47  auto cuda_ctx = dynamic_cast<const GpuCompilationContext*>(ctx);
48  CHECK(cuda_ctx);
49  const auto native_code = cuda_ctx->getNativeCode(device_id);
50  function_ptr = static_cast<CUfunction>(native_code.first);
51  module_ptr = static_cast<CUmodule>(native_code.second);
52  }
53 
54  void launch(unsigned int gridDimX,
55  unsigned int gridDimY,
56  unsigned int gridDimZ,
57  unsigned int blockDimX,
58  unsigned int blockDimY,
59  unsigned int blockDimZ,
60  unsigned int sharedMemBytes,
61  void** kernelParams,
62  bool optimize_block_and_grid_sizes) override {
63  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
64  if (optimize_block_and_grid_sizes) {
65  int recommended_block_size;
66  int recommended_grid_size;
67  std::ostringstream oss;
68  checkCudaErrors(cuOccupancyMaxPotentialBlockSize(&recommended_grid_size,
69  &recommended_block_size,
70  function_ptr,
71  nullptr,
72  sharedMemBytes,
73  0));
74  if (static_cast<unsigned int>(recommended_block_size) != blockDimX) {
75  VLOG(1) << "Apply a recommended CUDA block size: " << recommended_block_size
76  << " (current: " << blockDimX << ")";
77  blockDimX = recommended_block_size;
78  }
79  if (static_cast<unsigned int>(recommended_grid_size) != gridDimX) {
80  VLOG(1) << "Apply a recommended CUDA grid size: " << recommended_grid_size
81  << " (current: " << gridDimX << ")";
82  gridDimX = recommended_grid_size;
83  }
84  }
85  VLOG(1) << "Launch GPU kernel compiled with the following block and grid sizes: "
86  << blockDimX << " and " << gridDimX;
87  checkCudaErrors(cuLaunchKernel(function_ptr,
88  gridDimX,
89  gridDimY,
90  gridDimZ,
91  blockDimX,
92  blockDimY,
93  blockDimZ,
94  sharedMemBytes,
95  qe_cuda_stream,
96  kernelParams,
97  nullptr));
98  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
99  }
100 
101  void initializeDynamicWatchdog(bool could_interrupt, uint64_t cycle_budget) override {
102  CHECK(module_ptr);
103  CUevent start, stop;
104  cuEventCreate(&start, 0);
105  cuEventCreate(&stop, 0);
106  cuEventRecord(start, 0);
107 
109  size_t dw_cycle_budget_size;
110  // Translate milliseconds to device cycles
111  if (device_id == 0) {
112  LOG(INFO) << "Dynamic Watchdog budget: GPU: "
114  << std::to_string(cycle_budget) << " cycles";
115  }
116  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
117  checkCudaErrors(cuModuleGetGlobal(
118  &dw_cycle_budget, &dw_cycle_budget_size, module_ptr, "dw_cycle_budget"));
119  CHECK_EQ(dw_cycle_budget_size, sizeof(uint64_t));
120  checkCudaErrors(cuMemcpyHtoDAsync(dw_cycle_budget,
121  reinterpret_cast<void*>(&cycle_budget),
122  sizeof(uint64_t),
123  qe_cuda_stream));
124  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
125 
127  size_t dw_sm_cycle_start_size;
128  checkCudaErrors(cuModuleGetGlobal(
129  &dw_sm_cycle_start, &dw_sm_cycle_start_size, module_ptr, "dw_sm_cycle_start"));
130  CHECK_EQ(dw_sm_cycle_start_size, 128 * sizeof(uint64_t));
131  checkCudaErrors(cuMemsetD32Async(dw_sm_cycle_start, 0, 128 * 2, qe_cuda_stream));
132  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
133 
134  if (!could_interrupt) {
135  // Executor is not marked as interrupted, make sure dynamic watchdog doesn't block
136  // execution
138  size_t dw_abort_size;
140  cuModuleGetGlobal(&dw_abort, &dw_abort_size, module_ptr, "dw_abort"));
141  CHECK_EQ(dw_abort_size, sizeof(uint32_t));
142  checkCudaErrors(cuMemsetD32Async(dw_abort, 0, 1, qe_cuda_stream));
143  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
144  }
145 
146  cuEventRecord(stop, 0);
147  cuEventSynchronize(stop);
148  float milliseconds = 0;
149  cuEventElapsedTime(&milliseconds, start, stop);
150  VLOG(1) << "Device " << std::to_string(device_id)
151  << ": launchGpuCode: dynamic watchdog init: " << std::to_string(milliseconds)
152  << " ms\n";
153  }
154 
155  void initializeRuntimeInterrupter(const int device_id) override {
156  CHECK(module_ptr);
157  CUevent start, stop;
158  cuEventCreate(&start, 0);
159  cuEventCreate(&stop, 0);
160  cuEventRecord(start, 0);
161 
163  size_t runtime_interrupt_flag_size;
164  checkCudaErrors(cuModuleGetGlobal(&runtime_interrupt_flag,
165  &runtime_interrupt_flag_size,
166  module_ptr,
167  "runtime_interrupt_flag"));
168  CHECK_EQ(runtime_interrupt_flag_size, sizeof(uint32_t));
169  auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
170  checkCudaErrors(cuMemsetD32Async(runtime_interrupt_flag, 0, 1, qe_cuda_stream));
171  checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
172 
173  cuEventRecord(stop, 0);
174  cuEventSynchronize(stop);
175  float milliseconds = 0;
176  cuEventElapsedTime(&milliseconds, start, stop);
177  VLOG(1) << "Device " << std::to_string(device_id)
178  << ": launchGpuCode: runtime query interrupter init: "
179  << std::to_string(milliseconds) << " ms";
180  Executor::registerActiveModule(module_ptr, device_id);
181  }
182 
183  void resetRuntimeInterrupter(const int device_id) override {
185  }
186 
187  std::unique_ptr<DeviceClock> make_clock() override {
188  return std::make_unique<CudaEventClock>();
189  }
190 
191  private:
192  CUfunction function_ptr;
193  CUmodule module_ptr;
194  int device_id;
195 };
196 #endif
197 
198 std::unique_ptr<DeviceKernel> create_device_kernel(const CompilationContext* ctx,
199  int device_id) {
200 #ifdef HAVE_CUDA
201  return std::make_unique<NvidiaKernel>(ctx, device_id);
202 #else
203  return nullptr;
204 #endif
205 }
virtual int stop()=0
#define CHECK_EQ(x, y)
Definition: Logger.h:301
static void registerActiveModule(void *module, const int device_id)
__device__ int64_t dw_sm_cycle_start[128]
Definition: cuda_mapd_rt.cu:91
void * CUstream
Definition: nocuda.h:23
#define LOG(tag)
Definition: Logger.h:285
void checkCudaErrors(CUresult err)
Definition: sample.cpp:38
unsigned long long CUdeviceptr
Definition: nocuda.h:28
__device__ int64_t dw_cycle_budget
Definition: cuda_mapd_rt.cu:93
virtual void initializeRuntimeInterrupter(const int device_id)=0
virtual std::unique_ptr< DeviceClock > make_clock()=0
std::string to_string(char const *&&v)
virtual void launch(unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, void **kernelParams, bool optimize_block_and_grid_sizes)=0
static void unregisterActiveModule(const int device_id)
void * CUfunction
Definition: nocuda.h:25
__device__ int32_t runtime_interrupt_flag
Definition: cuda_mapd_rt.cu:95
virtual void initializeDynamicWatchdog(bool could_interrupt, uint64_t cycle_budget)=0
virtual void start()=0
std::unique_ptr< DeviceKernel > create_device_kernel(const CompilationContext *ctx, int device_id)
CUstream getQueryEngineCudaStreamForDevice(int device_num)
Definition: QueryEngine.cpp:7
virtual void resetRuntimeInterrupter(const int device_id)=0
#define CHECK(condition)
Definition: Logger.h:291
unsigned g_dynamic_watchdog_time_limit
Definition: Execute.cpp:85
__device__ int32_t dw_abort
Definition: cuda_mapd_rt.cu:94
#define VLOG(n)
Definition: Logger.h:387
void * CUmodule
Definition: nocuda.h:24