OmniSciDB 085a039ca4
DeviceKernel.cpp
/*
 * Copyright 2021 OmniSci, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "DeviceKernel.h"

#include "CompilationContext.h"
#include "NvidiaKernel.h"

#ifdef HAVE_CUDA
#include "QueryEngine/QueryEngine.h"

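// Times GPU work with a pair of CUDA events: start() records the first event,
// stop() records the second, waits for it, and returns the elapsed time in
// whole milliseconds.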
class CudaEventClock : public DeviceClock {
 public:
  CudaEventClock() {
    cuEventCreate(&start_, 0);
    cuEventCreate(&stop_, 0);
  }
  virtual void start() override { cuEventRecord(start_, 0); }
  virtual int stop() override {
    cuEventRecord(stop_, 0);
    cuEventSynchronize(stop_);
    float ms = 0;
    cuEventElapsedTime(&ms, start_, stop_);
    return ms;
  }

 private:
  CUevent start_, stop_;  // events marking the start and end of the timed interval
};

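// DeviceKernel implementation for NVIDIA GPUs: wraps the CUfunction/CUmodule
// pair that the GpuCompilationContext produced for this device and drives it
// through the CUDA driver API.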
class NvidiaKernel : public DeviceKernel {
 public:
  NvidiaKernel(const CompilationContext* ctx, int device_id) : device_id(device_id) {
    auto cuda_ctx = dynamic_cast<const GpuCompilationContext*>(ctx);
    CHECK(cuda_ctx);
    const auto native_code = cuda_ctx->getNativeCode(device_id);
    function_ptr = static_cast<CUfunction>(native_code.first);
    module_ptr = static_cast<CUmodule>(native_code.second);
  }

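  // Launches the compiled kernel on this device's QueryEngine CUDA stream and
  // synchronizes on that stream before returning.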
  void launch(unsigned int gridDimX,
              unsigned int gridDimY,
              unsigned int gridDimZ,
              unsigned int blockDimX,
              unsigned int blockDimY,
              unsigned int blockDimZ,
              unsigned int sharedMemBytes,
              void** kernelParams) override {
    auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
    checkCudaErrors(cuLaunchKernel(function_ptr,
                                   gridDimX,
                                   gridDimY,
                                   gridDimZ,
                                   blockDimX,
                                   blockDimY,
                                   blockDimZ,
                                   sharedMemBytes,
                                   qe_cuda_stream,
                                   kernelParams,
                                   nullptr));
    checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
  }

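  // Seeds the dynamic watchdog globals in the compiled module: copies the cycle
  // budget into dw_cycle_budget, zeroes the per-SM start times in
  // dw_sm_cycle_start, and clears dw_abort when interrupts are not in use.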
  void initializeDynamicWatchdog(bool could_interrupt, uint64_t cycle_budget) override {
    CHECK(module_ptr);
    CUevent start, stop;
    cuEventCreate(&start, 0);
    cuEventCreate(&stop, 0);
    cuEventRecord(start, 0);

    CUdeviceptr dw_cycle_budget;
    size_t dw_cycle_budget_size;
    // Translate milliseconds to device cycles
    if (device_id == 0) {
      LOG(INFO) << "Dynamic Watchdog budget: GPU: "
                << std::to_string(g_dynamic_watchdog_time_limit) << "ms, "
                << std::to_string(cycle_budget) << " cycles";
    }
    auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
    checkCudaErrors(cuModuleGetGlobal(
        &dw_cycle_budget, &dw_cycle_budget_size, module_ptr, "dw_cycle_budget"));
    CHECK_EQ(dw_cycle_budget_size, sizeof(uint64_t));
    checkCudaErrors(cuMemcpyHtoDAsync(dw_cycle_budget,
                                      reinterpret_cast<void*>(&cycle_budget),
                                      sizeof(uint64_t),
                                      qe_cuda_stream));
    checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));

    CUdeviceptr dw_sm_cycle_start;
    size_t dw_sm_cycle_start_size;
    checkCudaErrors(cuModuleGetGlobal(
        &dw_sm_cycle_start, &dw_sm_cycle_start_size, module_ptr, "dw_sm_cycle_start"));
    CHECK_EQ(dw_sm_cycle_start_size, 128 * sizeof(uint64_t));
    checkCudaErrors(cuMemsetD32Async(dw_sm_cycle_start, 0, 128 * 2, qe_cuda_stream));
    checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));

    if (!could_interrupt) {
      // Executor is not marked as interrupted, make sure dynamic watchdog doesn't block
      // execution
      CUdeviceptr dw_abort;
      size_t dw_abort_size;
      checkCudaErrors(
          cuModuleGetGlobal(&dw_abort, &dw_abort_size, module_ptr, "dw_abort"));
      CHECK_EQ(dw_abort_size, sizeof(uint32_t));
      checkCudaErrors(cuMemsetD32Async(dw_abort, 0, 1, qe_cuda_stream));
      checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));
    }

    cuEventRecord(stop, 0);
    cuEventSynchronize(stop);
    float milliseconds = 0;
    cuEventElapsedTime(&milliseconds, start, stop);
    VLOG(1) << "Device " << std::to_string(device_id)
            << ": launchGpuCode: dynamic watchdog init: " << std::to_string(milliseconds)
            << " ms\n";
  }

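  // Clears the runtime_interrupt_flag global in the compiled module and
  // registers the module with the Executor so a later interrupt request can
  // target this device's kernel.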
  void initializeRuntimeInterrupter(const int device_id) override {
    CHECK(module_ptr);
    CUevent start, stop;
    cuEventCreate(&start, 0);
    cuEventCreate(&stop, 0);
    cuEventRecord(start, 0);

    CUdeviceptr runtime_interrupt_flag;
    size_t runtime_interrupt_flag_size;
    checkCudaErrors(cuModuleGetGlobal(&runtime_interrupt_flag,
                                      &runtime_interrupt_flag_size,
                                      module_ptr,
                                      "runtime_interrupt_flag"));
    CHECK_EQ(runtime_interrupt_flag_size, sizeof(uint32_t));
    auto qe_cuda_stream = getQueryEngineCudaStreamForDevice(device_id);
    checkCudaErrors(cuMemsetD32Async(runtime_interrupt_flag, 0, 1, qe_cuda_stream));
    checkCudaErrors(cuStreamSynchronize(qe_cuda_stream));

    cuEventRecord(stop, 0);
    cuEventSynchronize(stop);
    float milliseconds = 0;
    cuEventElapsedTime(&milliseconds, start, stop);
    VLOG(1) << "Device " << std::to_string(device_id)
            << ": launchGpuCode: runtime query interrupter init: "
            << std::to_string(milliseconds) << " ms";
    Executor::registerActiveModule(module_ptr, device_id);
  }

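  // Detaches the module from the Executor's interrupt bookkeeping once work on
  // this device is done.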
  void resetRuntimeInterrupter(const int device_id) override {
    Executor::unregisterActiveModule(device_id);
  }

  std::unique_ptr<DeviceClock> make_clock() override {
    return std::make_unique<CudaEventClock>();
  }

 private:
  CUfunction function_ptr;
  CUmodule module_ptr;
  int device_id;
};
#endif

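// Factory for the per-device kernel wrapper: returns an NvidiaKernel when the
// build has CUDA support, and nullptr otherwise.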
std::unique_ptr<DeviceKernel> create_device_kernel(const CompilationContext* ctx,
                                                   int device_id) {
#ifdef HAVE_CUDA
  return std::make_unique<NvidiaKernel>(ctx, device_id);
#else
  return nullptr;
#endif
}
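As a rough illustration of how the pieces above fit together, here is a minimal, hypothetical caller (not part of DeviceKernel.cpp) that creates a device kernel, times a launch with a DeviceClock, and logs the result. The function name launch_and_time, the grid/block dimensions, and the kernel_params argument are made up for the example; CHECK and VLOG are the project's logging macros as used above, and the rest comes from the interfaces in this file.

// Hypothetical usage sketch -- not part of DeviceKernel.cpp.
#include "DeviceKernel.h"

void launch_and_time(const CompilationContext* ctx,
                     int device_id,
                     void** kernel_params) {
  auto kernel = create_device_kernel(ctx, device_id);
  CHECK(kernel);  // create_device_kernel returns nullptr without HAVE_CUDA
  auto clock = kernel->make_clock();
  clock->start();
  kernel->launch(/*gridDimX=*/64, /*gridDimY=*/1, /*gridDimZ=*/1,
                 /*blockDimX=*/256, /*blockDimY=*/1, /*blockDimZ=*/1,
                 /*sharedMemBytes=*/0, kernel_params);
  VLOG(1) << "kernel time: " << clock->stop() << " ms";
}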