OmniSciDB  b28c0d5765
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
GpuInterrupt.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "DynamicWatchdog.h"
18 #include "Execute.h"
19 
20 void Executor::registerActiveModule(void* module, const int device_id) {
21 #ifdef HAVE_CUDA
22  std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
23  CHECK_LT(device_id, max_gpu_count);
24  gpu_active_modules_device_mask_ |= (1 << device_id);
25  gpu_active_modules_[device_id] = module;
26  VLOG(1) << "Registered module " << module << " on device " << std::to_string(device_id);
27 #endif
28 }
29 
30 void Executor::unregisterActiveModule(const int device_id) {
31 #ifdef HAVE_CUDA
32  std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
33  CHECK_LT(device_id, max_gpu_count);
34  if ((gpu_active_modules_device_mask_ & (1 << device_id)) == 0) {
35  return;
36  }
37  gpu_active_modules_device_mask_ ^= (1 << device_id);
38  VLOG(1) << "Unregistered module on device " << std::to_string(device_id);
39 #endif
40 }
41 
42 void Executor::interrupt(const std::string& query_session,
43  const std::string& interrupt_session) {
44  const auto allow_interrupt =
46  if (allow_interrupt) {
47  bool is_running_query = false;
48  {
49  // here we validate the requested query session is valid (is already enrolled)
50  // if not, we skip the interrupt request
52  executor_session_mutex_);
53  if (!checkIsQuerySessionEnrolled(query_session, session_read_lock)) {
54  VLOG(1) << "Skip the interrupt request (no query has been submitted from the "
55  "given query session)";
56  return;
57  }
58  if (checkIsQuerySessionInterrupted(query_session, session_read_lock)) {
59  VLOG(1) << "Skip the interrupt request (already interrupted query session)";
60  return;
61  }
62  // if a query is pending query, we just need to turn interrupt flag for the session
63  // on (not sending interrupt signal to "RUNNING" kernel, see the below code)
64  is_running_query = checkCurrentQuerySession(query_session, session_read_lock);
65  }
66  {
67  // We have to cover interrupt request from *any* session because we don't know
68  // whether the request is for the running query or pending query
69  // or for non-kernel time interrupt
70  // (or just false alarm that indicates unregistered session in a queue).
71  // So we try to set a session has been interrupted once we confirm
72  // the session has been enrolled and is not interrupted at this moment
74  executor_session_mutex_);
75  setQuerySessionAsInterrupted(query_session, session_write_lock);
76  }
77  if (!is_running_query) {
78  return;
79  }
80  // mark the interrupted status of this executor
81  interrupted_.store(true);
82  }
83 
84  // for both GPU and CPU kernel execution, interrupt flag that running kernel accesses
85  // is a global variable from a view of Executors
86  // but it's okay for now since we hold a kernel_lock when starting the query execution
87  // this indicates we should revisit this logic when starting to use multi-query
88  // execution for supporting per-kernel interrupt
89  bool CPU_execution_mode = true;
90 
91 #ifdef HAVE_CUDA
92  // The below code is basically for runtime query interrupt for GPU.
93  // It is also possible that user forces to use CPU-mode even if the user has GPU(s).
94  // In this case, we should not execute the code in below to avoid runtime failure
96  auto cuda_mgr = data_mgr_->getCudaMgr();
97  if (cuda_mgr && (g_enable_dynamic_watchdog || allow_interrupt)) {
98  // we additionally allow sending interrupt signal for
99  // `g_enable_non_kernel_time_query_interrupt` especially for CTAS/ITAS queries: data
100  // population happens on CPU but select_query can be processed via GPU
101  CHECK_GE(cuda_mgr->getDeviceCount(), 1);
102  std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
103  CUcontext old_cu_context;
104  checkCudaErrors(cuCtxGetCurrent(&old_cu_context));
105  for (int device_id = 0; device_id < max_gpu_count; device_id++) {
106  if (gpu_active_modules_device_mask_ & (1 << device_id)) {
107  void* llvm_module = gpu_active_modules_[device_id];
108  auto cu_module = static_cast<CUmodule>(llvm_module);
109  if (!cu_module) {
110  continue;
111  } else {
112  VLOG(1) << "Try to interrupt the running query on GPU assigned to Executor "
113  << executor_id_;
114  CPU_execution_mode = false;
115  }
116  cuda_mgr->setContext(device_id);
117 
118  // Create high priority non-blocking communication stream
119  CUstream cu_stream1;
121  cuStreamCreateWithPriority(&cu_stream1, CU_STREAM_NON_BLOCKING, 1));
122 
123  CUevent start, stop;
124  cuEventCreate(&start, 0);
125  cuEventCreate(&stop, 0);
126  cuEventRecord(start, cu_stream1);
127 
130  size_t dw_abort_size;
131  if (cuModuleGetGlobal(&dw_abort, &dw_abort_size, cu_module, "dw_abort") ==
132  CUDA_SUCCESS) {
133  CHECK_EQ(dw_abort_size, sizeof(uint32_t));
134  int32_t abort_val = 1;
135  checkCudaErrors(cuMemcpyHtoDAsync(dw_abort,
136  reinterpret_cast<void*>(&abort_val),
137  sizeof(int32_t),
138  cu_stream1));
139 
140  if (device_id == 0) {
141  VLOG(1) << "GPU: Async Abort submitted to Device "
142  << std::to_string(device_id);
143  }
144  }
145  }
146 
147  if (allow_interrupt) {
149  size_t runtime_interrupt_flag_size;
150  auto status = cuModuleGetGlobal(&runtime_interrupt_flag,
151  &runtime_interrupt_flag_size,
152  cu_module,
153  "runtime_interrupt_flag");
154  if (status == CUDA_SUCCESS) {
155  VLOG(1) << "Executor " << executor_id_
156  << " retrieves interrupt status from GPU " << device_id;
157  CHECK_EQ(runtime_interrupt_flag_size, sizeof(uint32_t));
158  int32_t abort_val = 1;
159  checkCudaErrors(cuMemcpyHtoDAsync(runtime_interrupt_flag,
160  reinterpret_cast<void*>(&abort_val),
161  sizeof(int32_t),
162  cu_stream1));
163  if (device_id == 0) {
164  VLOG(1) << "GPU: send interrupt signal from Executor " << executor_id_
165  << " to Device " << std::to_string(device_id);
166  }
167  } else if (status == CUDA_ERROR_NOT_FOUND) {
168  std::runtime_error(
169  "Runtime query interrupt on Executor " + std::to_string(executor_id_) +
170  " has failed: an interrupt flag on the GPU could "
171  "not be initialized (CUDA_ERROR_CODE: CUDA_ERROR_NOT_FOUND)");
172  } else {
173  // if we reach here, query runtime interrupt is failed due to
174  // one of the following error: CUDA_ERROR_NOT_INITIALIZED,
175  // CUDA_ERROR_DEINITIALIZED. CUDA_ERROR_INVALID_CONTEXT, and
176  // CUDA_ERROR_INVALID_VALUE. All those error codes are due to device failure.
177  const char* error_ret_str = nullptr;
178  cuGetErrorName(status, &error_ret_str);
179  if (!error_ret_str) {
180  error_ret_str = "UNKNOWN";
181  }
182  std::string error_str(error_ret_str);
183  std::runtime_error(
184  "Runtime interrupt on Executor " + std::to_string(executor_id_) +
185  " has failed due to a device " + std::to_string(device_id) +
186  "'s issue "
187  "(CUDA_ERROR_CODE: " +
188  error_str + ")");
189  }
190 
191  cuEventRecord(stop, cu_stream1);
192  cuEventSynchronize(stop);
193  float milliseconds = 0;
194  cuEventElapsedTime(&milliseconds, start, stop);
195  VLOG(1) << "Device " << std::to_string(device_id)
196  << ": submitted async interrupt request from Executor " << executor_id_
197  << " : SUCCESS: " << std::to_string(milliseconds) << " ms";
198  checkCudaErrors(cuStreamDestroy(cu_stream1));
199  }
200  }
201  checkCudaErrors(cuCtxSetCurrent(old_cu_context));
202  }
203  }
204 #endif
206  dynamic_watchdog_init(static_cast<unsigned>(DW_ABORT));
207  }
208 
209  if (allow_interrupt && CPU_execution_mode) {
210  // turn interrupt flag on for CPU mode
211  VLOG(1) << "Try to interrupt the running query on CPU from Executor " << executor_id_;
212  check_interrupt_init(static_cast<unsigned>(INT_ABORT));
213  }
214 }
215 
217  const auto allow_interrupt =
220  dynamic_watchdog_init(static_cast<unsigned>(DW_RESET));
221  } else if (allow_interrupt) {
222 #ifdef HAVE_CUDA
223  for (int device_id = 0; device_id < max_gpu_count; device_id++) {
225  }
226 #endif
227  VLOG(1) << "Reset interrupt flag for CPU execution kernel on Executor "
228  << executor_id_;
229  check_interrupt_init(static_cast<unsigned>(INT_RESET));
230  }
231 
232  if (interrupted_.load()) {
233  VLOG(1) << "RESET Executor " << executor_id_
234  << " that had previously been interrupted";
235  interrupted_.store(false);
236  }
237 }
#define CHECK_EQ(x, y)
Definition: Logger.h:230
static void registerActiveModule(void *module, const int device_id)
int CUcontext
Definition: nocuda.h:22
void * CUstream
Definition: nocuda.h:23
void checkCudaErrors(CUresult err)
Definition: sample.cpp:38
unsigned long long CUdeviceptr
Definition: nocuda.h:28
#define CHECK_GE(x, y)
Definition: Logger.h:235
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:80
bool g_enable_non_kernel_time_query_interrupt
Definition: Execute.cpp:126
std::string to_string(char const *&&v)
std::shared_lock< T > shared_lock
static void unregisterActiveModule(const int device_id)
std::unique_lock< T > unique_lock
__device__ int32_t runtime_interrupt_flag
Definition: cuda_mapd_rt.cu:95
RUNTIME_EXPORT uint64_t dynamic_watchdog_init(unsigned ms_budget)
#define CHECK_LT(x, y)
Definition: Logger.h:232
data_mgr_(data_mgr)
void resetInterrupt()
#define CHECK(condition)
Definition: Logger.h:222
void interrupt(const QuerySessionId &query_session="", const QuerySessionId &interrupt_session="")
RUNTIME_EXPORT bool check_interrupt_init(unsigned command)
__device__ int32_t dw_abort
Definition: cuda_mapd_rt.cu:94
bool g_enable_runtime_query_interrupt
Definition: Execute.cpp:125
#define VLOG(n)
Definition: Logger.h:316
void * CUmodule
Definition: nocuda.h:24