OmniSciDB  8fa3bf436f
GpuInterrupt.cpp
/*
 * Copyright 2017 MapD Technologies, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "DynamicWatchdog.h"
#include "Execute.h"
void Executor::registerActiveModule(void* module, const int device_id) const {
#ifdef HAVE_CUDA
  std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
  CHECK_LT(device_id, max_gpu_count);
  gpu_active_modules_device_mask_ |= (1 << device_id);
  gpu_active_modules_[device_id] = module;
  VLOG(1) << "Executor " << this << ", mask 0x" << std::hex
          << gpu_active_modules_device_mask_ << ": Registered module " << module
          << " on device " << std::to_string(device_id);
#endif
}

void Executor::unregisterActiveModule(void* module, const int device_id) const {
#ifdef HAVE_CUDA
  std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
  CHECK_LT(device_id, max_gpu_count);
  if ((gpu_active_modules_device_mask_ & (1 << device_id)) == 0) {
    return;
  }
  CHECK_EQ(gpu_active_modules_[device_id], module);
  gpu_active_modules_device_mask_ ^= (1 << device_id);
  VLOG(1) << "Executor " << this << ", mask 0x" << std::hex
          << gpu_active_modules_device_mask_ << ": Unregistered module " << module
          << " on device " << std::to_string(device_id);
#endif
}
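registerActiveModule() and unregisterActiveModule() are pure bookkeeping: while a JIT-compiled query module is resident on a device, its handle is kept in gpu_active_modules_ and the matching bit of gpu_active_modules_device_mask_ is set, all under gpu_active_modules_mutex_, so that interrupt() can later find every module it may need to signal. A minimal standalone sketch of the same bit-mask bookkeeping (illustrative only, not Executor code):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t device_mask = 0;          // stand-in for gpu_active_modules_device_mask_
  const int device_id = 2;

  device_mask |= (1u << device_id);  // registerActiveModule: mark device 2 active
  assert(device_mask & (1u << device_id));

  device_mask ^= (1u << device_id);  // unregisterActiveModule: the bit is known to be
  assert(!(device_mask & (1u << device_id)));  // set, so XOR clears exactly that bit
  return 0;
}
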

void Executor::interrupt(const std::string& query_session,
                         const std::string& interrupt_session) {
  const auto allow_interrupt =
      g_enable_runtime_query_interrupt || g_enable_non_kernel_time_query_interrupt;
  if (allow_interrupt) {
    bool is_running_query = false;
    {
      // here we validate that the requested query session is valid (i.e., already
      // enrolled); if not, we skip the interrupt request
      mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
      if (!checkIsQuerySessionEnrolled(query_session, session_read_lock)) {
        VLOG(1) << "Skip the interrupt request (no query has been submitted from the "
                   "given query session)";
        return;
      }
      if (checkIsQuerySessionInterrupted(query_session, session_read_lock)) {
        VLOG(1) << "Skip the interrupt request (already interrupted query session)";
        return;
      }
      // if the query is still pending, we only need to turn the session's interrupt
      // flag on (no interrupt signal is sent to a "RUNNING" kernel; see the code below)
      is_running_query = checkCurrentQuerySession(query_session, session_read_lock);
    }
    {
      // We have to cover an interrupt request from *any* session, because we don't
      // know whether the request targets the running query, a pending query,
      // a non-kernel-time interrupt, or is just a false alarm for an unregistered
      // session in the queue. So we mark the session as interrupted once we have
      // confirmed that it is enrolled and not already interrupted.
      mapd_unique_lock<mapd_shared_mutex> session_write_lock(executor_session_mutex_);
      setQuerySessionAsInterrupted(query_session, session_write_lock);
    }
    if (!is_running_query) {
      return;
    }
    interrupted_.store(true);
  }

  bool CPU_execution_mode = true;

#ifdef HAVE_CUDA
  // The code below handles runtime query interrupt for GPU execution.
  // The user may force CPU mode even when GPUs are available; in that case we must
  // skip the code below to avoid a runtime failure.
  auto cuda_mgr = Catalog_Namespace::SysCatalog::instance().getDataMgr().getCudaMgr();
  if (cuda_mgr && (g_enable_dynamic_watchdog || allow_interrupt)) {
    // we additionally allow sending an interrupt signal when
    // `g_enable_non_kernel_time_query_interrupt` is set, especially for CTAS/ITAS
    // queries: data population happens on the CPU, but the SELECT part can be
    // processed on the GPU
    CHECK_GE(cuda_mgr->getDeviceCount(), 1);
    std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
    VLOG(1) << "Executor " << this << ": Interrupting Active Modules: mask 0x" << std::hex
            << gpu_active_modules_device_mask_;
    CUcontext old_cu_context;
    checkCudaErrors(cuCtxGetCurrent(&old_cu_context));
    for (int device_id = 0; device_id < max_gpu_count; device_id++) {
      if (gpu_active_modules_device_mask_ & (1 << device_id)) {
        void* module = gpu_active_modules_[device_id];
        auto cu_module = static_cast<CUmodule>(module);
        if (!cu_module) {
          continue;
        } else {
          VLOG(1) << "Try to interrupt the running query on GPU";
          CPU_execution_mode = false;
        }
        VLOG(1) << "Executor " << this << ": Interrupting Active Modules: mask 0x"
                << std::hex << gpu_active_modules_device_mask_ << " on device "
                << std::to_string(device_id);

        cuda_mgr->setContext(device_id);

        // Create a high-priority, non-blocking communication stream
        CUstream cu_stream1;
        checkCudaErrors(
            cuStreamCreateWithPriority(&cu_stream1, CU_STREAM_NON_BLOCKING, 1));

        CUevent start, stop;
        cuEventCreate(&start, 0);
        cuEventCreate(&stop, 0);
        cuEventRecord(start, cu_stream1);
        if (g_enable_dynamic_watchdog) {
          CUdeviceptr dw_abort;
          size_t dw_abort_size;
          if (cuModuleGetGlobal(&dw_abort, &dw_abort_size, cu_module, "dw_abort") ==
              CUDA_SUCCESS) {
            CHECK_EQ(dw_abort_size, sizeof(uint32_t));
            int32_t abort_val = 1;
            checkCudaErrors(cuMemcpyHtoDAsync(dw_abort,
                                              reinterpret_cast<void*>(&abort_val),
                                              sizeof(int32_t),
                                              cu_stream1));

            if (device_id == 0) {
              VLOG(1) << "GPU: Async Abort submitted to Device "
                      << std::to_string(device_id);
            }
          }
        }

        if (allow_interrupt) {
          CUdeviceptr runtime_interrupt_flag;
          size_t runtime_interrupt_flag_size;
          auto status = cuModuleGetGlobal(&runtime_interrupt_flag,
                                          &runtime_interrupt_flag_size,
                                          cu_module,
                                          "runtime_interrupt_flag");
          if (status == CUDA_SUCCESS) {
            VLOG(1) << "Interrupt on GPU status: CUDA_SUCCESS";
            CHECK_EQ(runtime_interrupt_flag_size, sizeof(uint32_t));
            int32_t abort_val = 1;
            checkCudaErrors(cuMemcpyHtoDAsync(runtime_interrupt_flag,
                                              reinterpret_cast<void*>(&abort_val),
                                              sizeof(int32_t),
                                              cu_stream1));
            if (device_id == 0) {
              VLOG(1) << "GPU: Async Abort submitted to Device "
                      << std::to_string(device_id);
            }
          } else if (status == CUDA_ERROR_NOT_FOUND) {
            throw std::runtime_error(
                "Runtime query interrupt has failed: an interrupt flag on the GPU could "
                "not be initialized (CUDA_ERROR_CODE: CUDA_ERROR_NOT_FOUND)");
          } else {
            // if we reach here, the runtime query interrupt has failed due to one of
            // the following errors: CUDA_ERROR_NOT_INITIALIZED,
            // CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_INVALID_CONTEXT, or
            // CUDA_ERROR_INVALID_VALUE. All of these error codes indicate a device
            // failure.
            const char* error_ret_str = nullptr;
            cuGetErrorName(status, &error_ret_str);
            if (!error_ret_str) {
              error_ret_str = "UNKNOWN";
            }
            std::string error_str(error_ret_str);
            throw std::runtime_error(
                "Runtime interrupt has failed due to a device related issue "
                "(CUDA_ERROR_CODE: " +
                error_str + ")");
          }

          cuEventRecord(stop, cu_stream1);
          cuEventSynchronize(stop);
          float milliseconds = 0;
          cuEventElapsedTime(&milliseconds, start, stop);
          VLOG(1) << "Device " << std::to_string(device_id)
                  << ": submitted async request to abort SUCCESS: "
                  << std::to_string(milliseconds) << " ms";
          checkCudaErrors(cuStreamDestroy(cu_stream1));
        }
      }
      checkCudaErrors(cuCtxSetCurrent(old_cu_context));
    }
  }
#endif
  if (g_enable_dynamic_watchdog) {
    dynamic_watchdog_init(static_cast<unsigned>(DW_ABORT));
  }

  if (allow_interrupt && CPU_execution_mode) {
    // turn the interrupt flag on for CPU mode
    VLOG(1) << "Try to interrupt the running query on CPU";
    check_interrupt_init(static_cast<unsigned>(INT_ABORT));
  }
}
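interrupt() never stops a GPU kernel directly. For each device whose bit is set in the mask, it looks up the module-scope device globals dw_abort and runtime_interrupt_flag with cuModuleGetGlobal() and asynchronously writes 1 into them on a high-priority, non-blocking stream; the JIT-compiled query kernels poll those flags and exit early. The sketch below shows the same mechanism in isolation using the CUDA runtime API (illustrative only: the flag, kernel, and stream names are made up, and the real code drives the driver API against the query module):

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

__device__ volatile int32_t interrupt_flag = 0;  // stand-in for runtime_interrupt_flag

__global__ void long_running_kernel(long long* counter) {
  long long local = 0;
  while (!interrupt_flag) {  // a generated query kernel would poll the flag similarly
    ++local;
  }
  *counter = local;
}

int main() {
  long long* counter = nullptr;
  cudaMalloc(&counter, sizeof(long long));
  long_running_kernel<<<1, 1>>>(counter);

  // As in interrupt(): use a separate non-blocking stream so the copy is not queued
  // behind the still-running kernel on the default stream.
  cudaStream_t interrupt_stream;
  cudaStreamCreateWithFlags(&interrupt_stream, cudaStreamNonBlocking);
  int32_t abort_val = 1;  // same value the interrupt path writes
  cudaMemcpyToSymbolAsync(interrupt_flag, &abort_val, sizeof(abort_val), 0,
                          cudaMemcpyHostToDevice, interrupt_stream);
  cudaStreamSynchronize(interrupt_stream);

  cudaDeviceSynchronize();  // the kernel exits once it observes the flag
  long long iterations = 0;
  cudaMemcpy(&iterations, counter, sizeof(iterations), cudaMemcpyDeviceToHost);
  printf("kernel stopped after %lld iterations\n", iterations);
  cudaStreamDestroy(interrupt_stream);
  cudaFree(counter);
  return 0;
}
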

void Executor::resetInterrupt() {
#ifdef HAVE_CUDA
  std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
#endif
  const auto allow_interrupt =
      g_enable_runtime_query_interrupt || g_enable_non_kernel_time_query_interrupt;
  if (g_enable_dynamic_watchdog) {
    dynamic_watchdog_init(static_cast<unsigned>(DW_RESET));
  } else if (allow_interrupt) {
    check_interrupt_init(static_cast<unsigned>(INT_RESET));
  }

  if (interrupted_.load()) {
    VLOG(1) << "RESET Executor " << this << " that had previously been interrupted";
    interrupted_.store(false);
  }
}
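resetInterrupt() undoes the per-process state that interrupt() set for the CPU path: DW_RESET / INT_RESET clear the watchdog or interrupt flag, and interrupted_ is cleared so the executor can accept the next query. A rough sketch of a command-driven flag of the kind check_interrupt_init() manages (illustrative only; INT_CHECK and the specific values and storage here are assumptions, not the actual DynamicWatchdog implementation):

#include <atomic>
#include <cassert>

enum : unsigned { INT_CHECK = 0, INT_ABORT = 1, INT_RESET = 2 };  // illustrative values

static std::atomic<bool> cpu_abort_requested{false};

// Returns whether an abort is currently requested; ABORT/RESET update the flag first.
bool check_interrupt_sketch(unsigned command) {
  if (command == INT_ABORT) {
    cpu_abort_requested.store(true);   // interrupt(): ask CPU kernels to stop
  } else if (command == INT_RESET) {
    cpu_abort_requested.store(false);  // resetInterrupt(): clear before the next query
  }
  return cpu_abort_requested.load();   // INT_CHECK: polled from row-processing loops
}

int main() {
  assert(check_interrupt_sketch(INT_ABORT));   // as in the CPU_execution_mode branch
  assert(!check_interrupt_sketch(INT_RESET));  // as in resetInterrupt()
  assert(!check_interrupt_sketch(INT_CHECK));
  return 0;
}
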
Cross-referenced symbols:
CudaMgr_Namespace::CudaMgr* getCudaMgr() const (Definition: DataMgr.h:207)
#define CHECK_EQ(x, y) (Definition: Logger.h:211)
int CUcontext (Definition: nocuda.h:22)
void checkCudaErrors(CUresult err) (Definition: sample.cpp:38)
void registerActiveModule(void* module, const int device_id) const
unsigned long long CUdeviceptr (Definition: nocuda.h:27)
#define CHECK_GE(x, y) (Definition: Logger.h:216)
bool g_enable_dynamic_watchdog (Definition: Execute.cpp:77)
bool g_enable_non_kernel_time_query_interrupt (Definition: Execute.cpp:115)
std::string to_string(char const*&& v)
Data_Namespace::DataMgr& getDataMgr() const (Definition: SysCatalog.h:193)
static SysCatalog& instance() (Definition: SysCatalog.h:292)
__device__ int32_t runtime_interrupt_flag (Definition: cuda_mapd_rt.cu:96)
RUNTIME_EXPORT uint64_t dynamic_watchdog_init(unsigned ms_budget)
#define CHECK_LT(x, y) (Definition: Logger.h:213)
void unregisterActiveModule(void* module, const int device_id) const
void resetInterrupt()
bool check_interrupt_init(unsigned command)
void interrupt(const QuerySessionId& query_session="", const QuerySessionId& interrupt_session="")
__device__ int32_t dw_abort (Definition: cuda_mapd_rt.cu:95)
bool g_enable_runtime_query_interrupt (Definition: Execute.cpp:114)
#define VLOG(n) (Definition: Logger.h:297)
void* CUmodule (Definition: nocuda.h:23)