OmniSciDB  1dac507f6e
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
GpuInterrupt.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2017 MapD Technologies, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "DynamicWatchdog.h"
18 #include "Execute.h"
19 
20 void Executor::registerActiveModule(void* module, const int device_id) const {
21 #ifdef HAVE_CUDA
22  std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
23  CHECK_LT(device_id, max_gpu_count);
24  gpu_active_modules_device_mask_ |= (1 << device_id);
25  gpu_active_modules_[device_id] = module;
26  VLOG(1) << "Executor " << this << ", mask 0x" << std::hex
27  << gpu_active_modules_device_mask_ << ": Registered module " << module
28  << " on device " << std::to_string(device_id);
29 #endif
30 }
31 
32 void Executor::unregisterActiveModule(void* module, const int device_id) const {
33 #ifdef HAVE_CUDA
34  std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
35  CHECK_LT(device_id, max_gpu_count);
36  if ((gpu_active_modules_device_mask_ & (1 << device_id)) == 0) {
37  return;
38  }
39  CHECK_EQ(gpu_active_modules_[device_id], module);
40  gpu_active_modules_device_mask_ ^= (1 << device_id);
41  VLOG(1) << "Executor " << this << ", mask 0x" << std::hex
42  << gpu_active_modules_device_mask_ << ": Unregistered module " << module
43  << " on device " << std::to_string(device_id);
44 #endif
45 }
46 
48 #ifdef HAVE_CUDA
49  std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
50  VLOG(1) << "Executor " << this << ": Interrupting Active Modules: mask 0x" << std::hex
52  CUcontext old_cu_context;
53  checkCudaErrors(cuCtxGetCurrent(&old_cu_context));
54  for (int device_id = 0; device_id < max_gpu_count; device_id++) {
55  if (gpu_active_modules_device_mask_ & (1 << device_id)) {
56  void* module = gpu_active_modules_[device_id];
57  auto cu_module = static_cast<CUmodule>(module);
58  if (!cu_module) {
59  continue;
60  }
61  VLOG(1) << "Terminating module " << module << " on device "
62  << std::to_string(device_id)
63  << ", gpu_active_modules_device_mask_: " << std::hex
64  << std::to_string(gpu_active_modules_device_mask_);
65 
66  catalog_->getDataMgr().getCudaMgr()->setContext(device_id);
67 
68  // Create high priority non-blocking communication stream
69  CUstream cu_stream1;
70  checkCudaErrors(cuStreamCreateWithPriority(&cu_stream1, CU_STREAM_NON_BLOCKING, 1));
71 
72  CUevent start, stop;
73  cuEventCreate(&start, 0);
74  cuEventCreate(&stop, 0);
75  cuEventRecord(start, cu_stream1);
76 
78  size_t dw_abort_size;
79  if (cuModuleGetGlobal(&dw_abort, &dw_abort_size, cu_module, "dw_abort") ==
80  CUDA_SUCCESS) {
81  CHECK_EQ(dw_abort_size, sizeof(uint32_t));
82  int32_t abort_val = 1;
83  checkCudaErrors(cuMemcpyHtoDAsync(
84  dw_abort, reinterpret_cast<void*>(&abort_val), sizeof(int32_t), cu_stream1));
85 
86  if (device_id == 0) {
87  LOG(INFO) << "GPU: Async Abort submitted to Device "
88  << std::to_string(device_id);
89  }
90  }
91 
92  cuEventRecord(stop, cu_stream1);
93  cuEventSynchronize(stop);
94  float milliseconds = 0;
95  cuEventElapsedTime(&milliseconds, start, stop);
96  VLOG(1) << "Device " << std::to_string(device_id)
97  << ": submitted async request to abort: " << std::to_string(milliseconds)
98  << " ms\n";
99  checkCudaErrors(cuStreamDestroy(cu_stream1));
100  }
101  }
102  checkCudaErrors(cuCtxSetCurrent(old_cu_context));
103 #endif
104 
105  dynamic_watchdog_init(static_cast<unsigned>(DW_ABORT));
106 
107  interrupted_ = true;
108  VLOG(1) << "INTERRUPT Executor " << this;
109 }
110 
112 #ifdef HAVE_CUDA
113  std::lock_guard<std::mutex> lock(gpu_active_modules_mutex_);
114 #endif
115 
116  if (!interrupted_) {
117  return;
118  }
119 
120  dynamic_watchdog_init(static_cast<unsigned>(DW_RESET));
121 
122  interrupted_ = false;
123  VLOG(1) << "RESET Executor " << this << " that had previously been interrupted";
124 }
catalog_(nullptr)
#define CHECK_EQ(x, y)
Definition: Logger.h:198
std::unique_ptr< llvm::Module > module(runtime_module_shallow_copy(cgen_state))
gpu_active_modules_device_mask_(0x0)
int CUcontext
Definition: nocuda.h:22
#define LOG(tag)
Definition: Logger.h:185
void checkCudaErrors(CUresult err)
Definition: sample.cpp:38
void registerActiveModule(void *module, const int device_id) const
unsigned long long CUdeviceptr
Definition: nocuda.h:27
std::string to_string(char const *&&v)
interrupted_(false)
uint64_t dynamic_watchdog_init(unsigned ms_budget)
void interrupt()
#define CHECK_LT(x, y)
Definition: Logger.h:200
void unregisterActiveModule(void *module, const int device_id) const
void resetInterrupt()
__device__ int32_t dw_abort
#define VLOG(n)
Definition: Logger.h:280
void * CUmodule
Definition: nocuda.h:23