22 std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
24 gpu_active_modules_device_mask_ |= (1 << device_id);
25 gpu_active_modules_[device_id] = module;
26 VLOG(1) <<
"Executor " <<
this <<
", mask 0x" << std::hex
27 << gpu_active_modules_device_mask_ <<
": Registered module " << module
34 std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
36 if ((gpu_active_modules_device_mask_ & (1 << device_id)) == 0) {
39 CHECK_EQ(gpu_active_modules_[device_id], module);
40 gpu_active_modules_device_mask_ ^= (1 << device_id);
41 VLOG(1) <<
"Executor " <<
this <<
", mask 0x" << std::hex
42 << gpu_active_modules_device_mask_ <<
": Unregistered module " << module
48 const std::string& interrupt_session) {
50 bool is_running_query =
false;
54 mapd_shared_lock<mapd_shared_mutex> session_read_lock(executor_session_mutex_);
55 if (!checkIsQuerySessionEnrolled(query_session, session_read_lock)) {
56 VLOG(1) <<
"Skip the interrupt request (no query has been submitted from the "
57 "given query session)";
60 if (checkIsQuerySessionInterrupted(query_session, session_read_lock)) {
61 VLOG(1) <<
"Skip the interrupt request (already interrupted query session)";
66 is_running_query = checkCurrentQuerySession(query_session, session_read_lock);
74 mapd_unique_lock<mapd_shared_mutex> session_write_lock(executor_session_mutex_);
75 setQuerySessionAsInterrupted(query_session, session_write_lock);
77 if (!is_running_query) {
80 interrupted_.store(
true);
83 bool CPU_execution_mode =
true;
91 CHECK_GE(cuda_mgr->getDeviceCount(), 1);
92 std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
93 VLOG(1) <<
"Executor " <<
this <<
": Interrupting Active Modules: mask 0x" << std::hex
94 << gpu_active_modules_device_mask_;
97 for (
int device_id = 0; device_id < max_gpu_count; device_id++) {
98 if (gpu_active_modules_device_mask_ & (1 << device_id)) {
99 void* module = gpu_active_modules_[device_id];
100 auto cu_module =
static_cast<CUmodule>(module);
104 VLOG(1) <<
"Try to interrupt the running query on GPU";
105 CPU_execution_mode =
false;
107 VLOG(1) <<
"Executor " <<
this <<
": Interrupting Active Modules: mask 0x"
108 << std::hex << gpu_active_modules_device_mask_ <<
" on device "
111 cuda_mgr->setContext(device_id);
116 cuStreamCreateWithPriority(&cu_stream1, CU_STREAM_NON_BLOCKING, 1));
119 cuEventCreate(&start, 0);
120 cuEventCreate(&stop, 0);
121 cuEventRecord(start, cu_stream1);
125 size_t dw_abort_size;
126 if (cuModuleGetGlobal(&dw_abort, &dw_abort_size, cu_module,
"dw_abort") ==
128 CHECK_EQ(dw_abort_size,
sizeof(uint32_t));
129 int32_t abort_val = 1;
131 reinterpret_cast<void*>(&abort_val),
135 if (device_id == 0) {
136 VLOG(1) <<
"GPU: Async Abort submitted to Device "
144 size_t runtime_interrupt_flag_size;
145 auto status = cuModuleGetGlobal(&runtime_interrupt_flag,
146 &runtime_interrupt_flag_size,
148 "runtime_interrupt_flag");
149 if (status == CUDA_SUCCESS) {
150 VLOG(1) <<
"Interrupt on GPU status: CUDA_SUCCESS";
151 CHECK_EQ(runtime_interrupt_flag_size,
sizeof(uint32_t));
152 int32_t abort_val = 1;
154 reinterpret_cast<void*>(&abort_val),
157 if (device_id == 0) {
158 VLOG(1) <<
"GPU: Async Abort submitted to Device "
161 }
else if (status == CUDA_ERROR_NOT_FOUND) {
163 "Runtime query interrupt has failed: an interrupt flag on the GPU could "
164 "not be initialized (CUDA_ERROR_CODE: CUDA_ERROR_NOT_FOUND)");
170 const char* error_ret_str =
nullptr;
171 cuGetErrorName(status, &error_ret_str);
172 if (!error_ret_str) {
173 error_ret_str =
"UNKNOWN";
175 std::string error_str(error_ret_str);
177 "Runtime interrupt has failed due to a device related issue "
178 "(CUDA_ERROR_CODE: " +
182 cuEventRecord(stop, cu_stream1);
183 cuEventSynchronize(stop);
184 float milliseconds = 0;
185 cuEventElapsedTime(&milliseconds, start, stop);
187 <<
": submitted async request to abort SUCCESS: "
202 VLOG(1) <<
"Try to interrupt the running query on CPU";
209 std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
218 if (interrupted_.load()) {
219 VLOG(1) <<
"RESET Executor " <<
this <<
" that had previously been interrupted";
220 interrupted_.store(
false);
CudaMgr_Namespace::CudaMgr * getCudaMgr() const
void checkCudaErrors(CUresult err)
void registerActiveModule(void *module, const int device_id) const
unsigned long long CUdeviceptr
bool g_enable_dynamic_watchdog
Data_Namespace::DataMgr & getDataMgr() const
static SysCatalog & instance()
void interrupt(const std::string &query_session="", const std::string &interrupt_session="")
uint64_t dynamic_watchdog_init(unsigned ms_budget)
__device__ int32_t runtime_interrupt_flag
void unregisterActiveModule(void *module, const int device_id) const
bool check_interrupt_init(unsigned command)
__device__ int32_t dw_abort
bool g_enable_runtime_query_interrupt