OmniSciDB  95562058bd
GpuInterrupt.cpp
/*
 * Copyright 2017 MapD Technologies, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "DynamicWatchdog.h"
#include "Execute.h"
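// The executor keeps a registry of the CUDA modules that are currently executing:
// gpu_active_modules_device_mask_ holds one bit per device and gpu_active_modules_
// stores the module handle for each device, both guarded by gpu_active_modules_mutex_.
// interrupt() walks this registry to decide which kernels to abort.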
void Executor::registerActiveModule(void* module, const int device_id) const {
#ifdef HAVE_CUDA
  std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
  CHECK_LT(device_id, max_gpu_count);
  gpu_active_modules_device_mask_ |= (1 << device_id);
  gpu_active_modules_[device_id] = module;
  VLOG(1) << "Executor " << this << ", mask 0x" << std::hex
          << gpu_active_modules_device_mask_ << ": Registered module " << module
          << " on device " << std::to_string(device_id);
#endif
}

void Executor::unregisterActiveModule(void* module, const int device_id) const {
#ifdef HAVE_CUDA
  std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
  CHECK_LT(device_id, max_gpu_count);
  if ((gpu_active_modules_device_mask_ & (1 << device_id)) == 0) {
    return;
  }
  CHECK_EQ(gpu_active_modules_[device_id], module);
  gpu_active_modules_device_mask_ ^= (1 << device_id);
  VLOG(1) << "Executor " << this << ", mask 0x" << std::hex
          << gpu_active_modules_device_mask_ << ": Unregistered module " << module
          << " on device " << std::to_string(device_id);
#endif
}
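
// Usage sketch (hypothetical, not part of this file): the GPU execution path would
// bracket a kernel launch with the calls above so that interrupt() can find the module
// while the kernel is in flight:
//
//   executor->registerActiveModule(cu_module, device_id);
//   launch_query_kernel(cu_module, device_id);  // placeholder for the real launch path
//   executor->unregisterActiveModule(cu_module, device_id);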

void Executor::interrupt(const std::string& query_session,
                         const std::string& interrupt_session) {
  VLOG(1) << "Receive INTERRUPT request on the Executor " << this;
  interrupted_.store(true);
  if (g_enable_runtime_query_interrupt) {
    // First, mark the query session as interrupted.
    mapd_unique_lock<mapd_shared_mutex> session_write_lock(executor_session_mutex_);
    this->setQuerySessionAsInterrupted(query_session, session_write_lock);
    session_write_lock.unlock();
    // We have to handle an interrupt request from *any* session because we do not know
    // whether it targets the running query or a pending one. A pending query just waits
    // on the executor for computing resources until the running query finishes, so
    // marking its session as interrupted above is enough to kill it; only the running
    // session needs the GPU/CPU interrupt handling below.
    mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
    bool isRunningSession =
        this->checkCurrentQuerySession(query_session, session_read_lock);
    session_read_lock.unlock();
    if (!isRunningSession) {
      interrupted_.store(false);
      return;
    }
  }

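  // Assume CPU execution until a registered GPU module is found below; if none is
  // found, only the CPU-side interrupt flag is raised at the end of this function.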
  bool CPU_execution_mode = true;

#ifdef HAVE_CUDA
  // The code below implements runtime query interrupt for GPU execution. The user may
  // force CPU mode even when GPUs are present; in that case the GPU-specific work below
  // should not run, to avoid a runtime failure.
  CHECK_GE(Catalog_Namespace::SysCatalog::instance()
               .getDataMgr()
               .getCudaMgr()
               ->getDeviceCount(),
           1);
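  // Walk the registry of active modules and, for each device that has one, raise the
  // module's device-global abort flags from a dedicated high-priority stream.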
  std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
  VLOG(1) << "Executor " << this << ": Interrupting Active Modules: mask 0x" << std::hex
          << gpu_active_modules_device_mask_;
  CUcontext old_cu_context;
  checkCudaErrors(cuCtxGetCurrent(&old_cu_context));
  for (int device_id = 0; device_id < max_gpu_count; device_id++) {
    if (gpu_active_modules_device_mask_ & (1 << device_id)) {
      void* module = gpu_active_modules_[device_id];
      auto cu_module = static_cast<CUmodule>(module);
      if (!cu_module) {
        continue;
      } else {
        VLOG(1) << "Try to interrupt the running query on GPU";
        CPU_execution_mode = false;
      }
      VLOG(1) << "Executor " << this << ": Interrupting Active Modules: mask 0x"
              << std::hex << gpu_active_modules_device_mask_ << " on device "
              << std::to_string(device_id);

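      // Bind this thread to the target device's CUDA context; the module's globals can
      // only be addressed from the context the module was loaded into.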
      if (catalog_) {
        catalog_->getDataMgr().getCudaMgr()->setContext(device_id);
      } else {
        Catalog_Namespace::SysCatalog::instance().getDataMgr().getCudaMgr()->setContext(
            device_id);
      }

      // Create a high-priority, non-blocking communication stream.
      CUstream cu_stream1;
      checkCudaErrors(
          cuStreamCreateWithPriority(&cu_stream1, CU_STREAM_NON_BLOCKING, 1));

      CUevent start, stop;
      cuEventCreate(&start, 0);
      cuEventCreate(&stop, 0);
      cuEventRecord(start, cu_stream1);
      if (g_enable_dynamic_watchdog) {
        CUdeviceptr dw_abort;
        size_t dw_abort_size;
        if (cuModuleGetGlobal(&dw_abort, &dw_abort_size, cu_module, "dw_abort") ==
            CUDA_SUCCESS) {
          CHECK_EQ(dw_abort_size, sizeof(uint32_t));
          int32_t abort_val = 1;
          checkCudaErrors(cuMemcpyHtoDAsync(dw_abort,
                                            reinterpret_cast<void*>(&abort_val),
                                            sizeof(int32_t),
                                            cu_stream1));

          if (device_id == 0) {
            LOG(INFO) << "GPU: Async Abort submitted to Device "
                      << std::to_string(device_id);
          }
        }
      }

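      // With runtime query interrupt enabled, raise "runtime_interrupt_flag" in the
      // same way; the generated GPU code checks it periodically and exits early once
      // it is set.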
      if (g_enable_runtime_query_interrupt) {
        CUdeviceptr runtime_interrupt_flag;
        size_t runtime_interrupt_flag_size;
        auto status = cuModuleGetGlobal(&runtime_interrupt_flag,
                                        &runtime_interrupt_flag_size,
                                        cu_module,
                                        "runtime_interrupt_flag");
        if (status == CUDA_SUCCESS) {
          VLOG(1) << "Interrupt on GPU status: CUDA_SUCCESS";
          CHECK_EQ(runtime_interrupt_flag_size, sizeof(uint32_t));
          int32_t abort_val = 1;
          checkCudaErrors(cuMemcpyHtoDAsync(runtime_interrupt_flag,
                                            reinterpret_cast<void*>(&abort_val),
                                            sizeof(int32_t),
                                            cu_stream1));
          if (device_id == 0) {
            LOG(INFO) << "GPU: Async Abort submitted to Device "
                      << std::to_string(device_id);
          }
        } else if (status == CUDA_ERROR_NOT_FOUND) {
          throw std::runtime_error(
              "Runtime query interrupt failed: the interrupt flag is not initialized "
              "on the GPU.");
        } else {
          // If we reach here, the runtime query interrupt failed because of one of the
          // following errors: CUDA_ERROR_NOT_INITIALIZED, CUDA_ERROR_DEINITIALIZED,
          // CUDA_ERROR_INVALID_CONTEXT, or CUDA_ERROR_INVALID_VALUE. All of these
          // error codes indicate a device failure.
          throw std::runtime_error(
              "Runtime interrupt failed due to a device-related issue");
        }

        cuEventRecord(stop, cu_stream1);
        cuEventSynchronize(stop);
        float milliseconds = 0;
        cuEventElapsedTime(&milliseconds, start, stop);
        VLOG(1) << "Device " << std::to_string(device_id)
                << ": submitted async request to abort SUCCESS: "
                << std::to_string(milliseconds) << " ms\n";
        checkCudaErrors(cuStreamDestroy(cu_stream1));
      }
    }
    checkCudaErrors(cuCtxSetCurrent(old_cu_context));
  }
#endif
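  // Latch the CPU-side dynamic watchdog abort flag as well, so watchdog checks outside
  // the GPU path also observe the interrupt.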
  if (g_enable_dynamic_watchdog) {
    dynamic_watchdog_init(static_cast<unsigned>(DW_ABORT));
  }

  if (g_enable_runtime_query_interrupt && CPU_execution_mode) {
    // Turn the interrupt flag on for CPU execution mode.
    VLOG(1) << "Try to interrupt the running query on CPU";
    check_interrupt_init(static_cast<unsigned>(INT_ABORT));
  }
}
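
// Usage sketch (hypothetical, not part of this file): a request handler that owns the
// executor for a session could cancel the running query and later re-arm the executor:
//
//   executor->interrupt(running_session_id, interrupting_session_id);
//   // ... wait for the query to unwind and release its resources ...
//   executor->resetInterrupt();
//
// The identifiers running_session_id / interrupting_session_id above are placeholders.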

void Executor::resetInterrupt() {
#ifdef HAVE_CUDA
  std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
#endif

  if (g_enable_dynamic_watchdog) {
    dynamic_watchdog_init(static_cast<unsigned>(DW_RESET));
  } else if (g_enable_runtime_query_interrupt) {
    check_interrupt_init(static_cast<unsigned>(INT_RESET));
  }
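
  // In distributed (cluster) mode the interrupted session also has to be removed from
  // this executor's session list and the running-session marker invalidated.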
  if (g_cluster) {
    bool sessionLeft = false;
    mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
    std::string curSession = getCurrentQuerySession(session_read_lock);
    sessionLeft = checkIsQuerySessionInterrupted(curSession, session_read_lock);
    session_read_lock.unlock();
    if (curSession != "" || sessionLeft) {
      mapd_unique_lock<mapd_shared_mutex> session_write_lock(executor_session_mutex_);
      removeFromQuerySessionList(curSession, session_write_lock);
      invalidateRunningQuerySession(session_write_lock);
      session_write_lock.unlock();
    }
  }

  if (interrupted_.load()) {
    interrupted_.store(false);
  }
  VLOG(1) << "RESET Executor " << this << " that had previously been interrupted";
}
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
Definition: DataMgr.h:208
catalog_(nullptr)
#define CHECK_EQ(x, y)
Definition: Logger.h:205
int CUcontext
Definition: nocuda.h:22
#define LOG(tag)
Definition: Logger.h:188
void checkCudaErrors(CUresult err)
Definition: sample.cpp:38
void registerActiveModule(void *module, const int device_id) const
unsigned long long CUdeviceptr
Definition: nocuda.h:27
void setContext(const int device_num) const
Definition: CudaMgr.cpp:362
#define CHECK_GE(x, y)
Definition: Logger.h:210
bool g_enable_dynamic_watchdog
Definition: Execute.cpp:75
std::string to_string(char const *&&v)
Data_Namespace::DataMgr & getDataMgr() const
Definition: SysCatalog.h:187
static SysCatalog & instance()
Definition: SysCatalog.h:286
void interrupt(const std::string &query_session="", const std::string &interrupt_session="")
uint64_t dynamic_watchdog_init(unsigned ms_budget)
__device__ int32_t runtime_interrupt_flag
Definition: cuda_mapd_rt.cu:96
#define CHECK_LT(x, y)
Definition: Logger.h:207
void unregisterActiveModule(void *module, const int device_id) const
void resetInterrupt()
bool check_interrupt_init(unsigned command)
bool g_cluster
__device__ int32_t dw_abort
Definition: cuda_mapd_rt.cu:95
bool g_enable_runtime_query_interrupt
Definition: Execute.cpp:108
#define VLOG(n)
Definition: Logger.h:291
void * CUmodule
Definition: nocuda.h:23