28 cuEventCreate(&start_, 0);
29 cuEventCreate(&stop_, 0);
31 virtual void start()
override { cuEventRecord(start_, 0); }
32 virtual int stop()
override {
33 cuEventRecord(stop_, 0);
34 cuEventSynchronize(stop_);
36 cuEventElapsedTime(&ms, start_, stop_);
41 CUevent start_, stop_;
49 const auto native_code = cuda_ctx->getNativeCode(device_id);
50 function_ptr =
static_cast<CUfunction>(native_code.first);
51 module_ptr =
static_cast<CUmodule>(native_code.second);
54 void launch(
unsigned int gridDimX,
55 unsigned int gridDimY,
56 unsigned int gridDimZ,
57 unsigned int blockDimX,
58 unsigned int blockDimY,
59 unsigned int blockDimZ,
60 unsigned int sharedMemBytes,
62 bool optimize_block_and_grid_sizes)
override {
64 if (optimize_block_and_grid_sizes) {
65 int recommended_block_size;
66 int recommended_grid_size;
67 std::ostringstream oss;
68 checkCudaErrors(cuOccupancyMaxPotentialBlockSize(&recommended_grid_size,
69 &recommended_block_size,
74 if (static_cast<unsigned int>(recommended_block_size) != blockDimX) {
75 VLOG(1) <<
"Apply a recommended CUDA block size: " << recommended_block_size
76 <<
" (current: " << blockDimX <<
")";
77 blockDimX = recommended_block_size;
79 if (static_cast<unsigned int>(recommended_grid_size) != gridDimX) {
80 VLOG(1) <<
"Apply a recommended CUDA grid size: " << recommended_grid_size
81 <<
" (current: " << gridDimX <<
")";
82 gridDimX = recommended_grid_size;
85 VLOG(1) <<
"Launch GPU kernel compiled with the following block and grid sizes: "
86 << blockDimX <<
" and " << gridDimX;
104 cuEventCreate(&start, 0);
105 cuEventCreate(&stop, 0);
106 cuEventRecord(start, 0);
109 size_t dw_cycle_budget_size;
111 if (device_id == 0) {
112 LOG(
INFO) <<
"Dynamic Watchdog budget: GPU: "
118 &dw_cycle_budget, &dw_cycle_budget_size, module_ptr,
"dw_cycle_budget"));
119 CHECK_EQ(dw_cycle_budget_size,
sizeof(uint64_t));
121 reinterpret_cast<void*>(&cycle_budget),
127 size_t dw_sm_cycle_start_size;
129 &dw_sm_cycle_start, &dw_sm_cycle_start_size, module_ptr,
"dw_sm_cycle_start"));
130 CHECK_EQ(dw_sm_cycle_start_size, 128 *
sizeof(uint64_t));
131 checkCudaErrors(cuMemsetD32Async(dw_sm_cycle_start, 0, 128 * 2, qe_cuda_stream));
134 if (!could_interrupt) {
138 size_t dw_abort_size;
140 cuModuleGetGlobal(&dw_abort, &dw_abort_size, module_ptr,
"dw_abort"));
141 CHECK_EQ(dw_abort_size,
sizeof(uint32_t));
146 cuEventRecord(stop, 0);
147 cuEventSynchronize(stop);
148 float milliseconds = 0;
149 cuEventElapsedTime(&milliseconds, start, stop);
151 <<
": launchGpuCode: dynamic watchdog init: " <<
std::to_string(milliseconds)
158 cuEventCreate(&start, 0);
159 cuEventCreate(&stop, 0);
160 cuEventRecord(start, 0);
163 size_t runtime_interrupt_flag_size;
165 &runtime_interrupt_flag_size,
167 "runtime_interrupt_flag"));
168 CHECK_EQ(runtime_interrupt_flag_size,
sizeof(uint32_t));
170 checkCudaErrors(cuMemsetD32Async(runtime_interrupt_flag, 0, 1, qe_cuda_stream));
173 cuEventRecord(stop, 0);
174 cuEventSynchronize(stop);
175 float milliseconds = 0;
176 cuEventElapsedTime(&milliseconds, start, stop);
178 <<
": launchGpuCode: runtime query interrupter init: "
187 std::unique_ptr<DeviceClock>
make_clock()
override {
188 return std::make_unique<CudaEventClock>();
201 return std::make_unique<NvidiaKernel>(ctx, device_id);
static void registerActiveModule(void *module, const int device_id)
__device__ int64_t dw_sm_cycle_start[128]
void checkCudaErrors(CUresult err)
unsigned long long CUdeviceptr
__device__ int64_t dw_cycle_budget
virtual void initializeRuntimeInterrupter(const int device_id)=0
virtual std::unique_ptr< DeviceClock > make_clock()=0
virtual void launch(unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, void **kernelParams, bool optimize_block_and_grid_sizes)=0
static void unregisterActiveModule(const int device_id)
__device__ int32_t runtime_interrupt_flag
virtual void initializeDynamicWatchdog(bool could_interrupt, uint64_t cycle_budget)=0
std::unique_ptr< DeviceKernel > create_device_kernel(const CompilationContext *ctx, int device_id)
CUstream getQueryEngineCudaStreamForDevice(int device_num)
virtual void resetRuntimeInterrupter(const int device_id)=0
unsigned g_dynamic_watchdog_time_limit
__device__ int32_t dw_abort