OmniSciDB  72c90bc290
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
NvidiaKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright 2022 HEAVY.AI, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <sstream>
18 
19 #include "NvidiaKernel.h"
20 
21 #include "Logger/Logger.h"
23 
24 #include <boost/filesystem/operations.hpp>
25 
26 #ifdef HAVE_CUDA
27 namespace {
28 
29 #define JIT_LOG_SIZE 8192
30 
31 void fill_options(std::vector<CUjit_option>& option_keys,
32  std::vector<void*>& option_values,
33  char* info_log,
34  char* error_log) {
35  option_keys.push_back(CU_JIT_LOG_VERBOSE);
36  option_values.push_back(reinterpret_cast<void*>(1));
37  option_keys.push_back(CU_JIT_THREADS_PER_BLOCK);
38  // fix the minimum # threads per block to the hardware-limit maximum num threads
39  // to avoid recompiling jit module even if we manipulate it via query hint
40  // (and allowed `CU_JIT_THREADS_PER_BLOCK` range is between 1 and 1024 by query hint)
41  option_values.push_back(reinterpret_cast<void*>(1024));
42  option_keys.push_back(CU_JIT_WALL_TIME);
43  option_values.push_back(reinterpret_cast<void*>(0));
44  option_keys.push_back(CU_JIT_INFO_LOG_BUFFER);
45  option_values.push_back(reinterpret_cast<void*>(info_log));
46  option_keys.push_back(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES);
47  option_values.push_back(reinterpret_cast<void*>((long)JIT_LOG_SIZE));
48  option_keys.push_back(CU_JIT_ERROR_LOG_BUFFER);
49  option_values.push_back(reinterpret_cast<void*>(error_log));
50  option_keys.push_back(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES);
51  option_values.push_back(reinterpret_cast<void*>((long)JIT_LOG_SIZE));
52 }
53 
54 boost::filesystem::path get_gpu_rt_path() {
55  boost::filesystem::path gpu_rt_path{heavyai::get_root_abs_path()};
56  gpu_rt_path /= "QueryEngine";
57  gpu_rt_path /= "cuda_mapd_rt.fatbin";
58  if (!boost::filesystem::exists(gpu_rt_path)) {
59  throw std::runtime_error("HeavyDB GPU runtime library not found at " +
60  gpu_rt_path.string());
61  }
62  return gpu_rt_path;
63 }
64 
65 boost::filesystem::path get_cuda_table_functions_path() {
66  boost::filesystem::path cuda_table_functions_path{heavyai::get_root_abs_path()};
67  cuda_table_functions_path /= "QueryEngine";
68  cuda_table_functions_path /= "CudaTableFunctions.a";
69  if (!boost::filesystem::exists(cuda_table_functions_path)) {
70  throw std::runtime_error("HeavyDB GPU table functions module not found at " +
71  cuda_table_functions_path.string());
72  }
73 
74  return cuda_table_functions_path;
75 }
76 
77 } // namespace
78 
79 void nvidia_jit_warmup() {
80  std::vector<CUjit_option> option_keys;
81  std::vector<void*> option_values;
82  char info_log[JIT_LOG_SIZE];
83  char error_log[JIT_LOG_SIZE];
84  fill_options(option_keys, option_values, info_log, error_log);
85  CHECK_EQ(option_values.size(), option_keys.size());
86  unsigned num_options = option_keys.size();
87  CUlinkState link_state;
89  cuLinkCreate(num_options, &option_keys[0], &option_values[0], &link_state))
90  << ": " << std::string(error_log);
91  VLOG(1) << "CUDA JIT time to create link: "
92  << *reinterpret_cast<float*>(&option_values[2]);
93  boost::filesystem::path gpu_rt_path = get_gpu_rt_path();
94  boost::filesystem::path cuda_table_functions_path = get_cuda_table_functions_path();
95  CHECK(!gpu_rt_path.empty());
96  CHECK(!cuda_table_functions_path.empty());
97  checkCudaErrors(cuLinkAddFile(
98  link_state, CU_JIT_INPUT_FATBINARY, gpu_rt_path.c_str(), 0, nullptr, nullptr))
99  << ": " << std::string(error_log);
100  VLOG(1) << "CUDA JIT time to add RT fatbinary: "
101  << *reinterpret_cast<float*>(&option_values[2]);
102  checkCudaErrors(cuLinkAddFile(link_state,
103  CU_JIT_INPUT_LIBRARY,
104  cuda_table_functions_path.c_str(),
105  0,
106  nullptr,
107  nullptr))
108  << ": " << std::string(error_log);
109  VLOG(1) << "CUDA JIT time to add GPU table functions library: "
110  << *reinterpret_cast<float*>(&option_values[2]);
111  checkCudaErrors(cuLinkDestroy(link_state)) << ": " << std::string(error_log);
112 }
113 
// Returns `text` with every line prefixed by its 1-based line number and ": ",
// each output line terminated by '\n'.
//
// Lines are split on '\n'. A final line without a trailing newline is still
// emitted; a trailing newline does not produce an extra, empty numbered line,
// and empty input yields an empty string.
std::string add_line_numbers(const std::string& text) {
  std::istringstream lines(text);
  std::string result;
  std::string line;
  size_t count = 1;
  // Testing the getline() result (rather than the stream state before the read)
  // guarantees that `line` is only used when a line was actually extracted.
  while (std::getline(lines, line)) {
    result += std::to_string(count);
    result += ": ";
    result += line;
    result += '\n';
    ++count;
  }
  return result;
}
126 
127 CubinResult ptx_to_cubin(const std::string& ptx,
128  const CudaMgr_Namespace::CudaMgr* cuda_mgr) {
129  auto timer = DEBUG_TIMER(__func__);
130  CHECK(!ptx.empty());
131  CHECK(cuda_mgr && cuda_mgr->getDeviceCount() > 0);
132  cuda_mgr->setContext(0);
133  std::vector<CUjit_option> option_keys;
134  std::vector<void*> option_values;
135  char info_log[JIT_LOG_SIZE];
136  char error_log[JIT_LOG_SIZE];
137  fill_options(option_keys, option_values, info_log, error_log);
138  CHECK_EQ(option_values.size(), option_keys.size());
139  unsigned num_options = option_keys.size();
140  CUlinkState link_state;
142  cuLinkCreate(num_options, &option_keys[0], &option_values[0], &link_state))
143  << ": " << std::string(error_log);
144  VLOG(1) << "CUDA JIT time to create link: "
145  << *reinterpret_cast<float*>(&option_values[2]);
146 
147  boost::filesystem::path gpu_rt_path = get_gpu_rt_path();
148  boost::filesystem::path cuda_table_functions_path = get_cuda_table_functions_path();
149  CHECK(!gpu_rt_path.empty());
150  CHECK(!cuda_table_functions_path.empty());
151  // How to create a static CUDA library:
152  // 1. nvcc -std=c++11 -arch=sm_35 --device-link -c [list of .cu files]
153  // 2. nvcc -std=c++11 -arch=sm_35 -lib [list of .o files generated by step 1] -o
154  // [library_name.a]
155  checkCudaErrors(cuLinkAddFile(
156  link_state, CU_JIT_INPUT_FATBINARY, gpu_rt_path.c_str(), 0, nullptr, nullptr))
157  << ": " << std::string(error_log);
158  VLOG(1) << "CUDA JIT time to add RT fatbinary: "
159  << *reinterpret_cast<float*>(&option_values[2]);
160  checkCudaErrors(cuLinkAddFile(link_state,
161  CU_JIT_INPUT_LIBRARY,
162  cuda_table_functions_path.c_str(),
163  0,
164  nullptr,
165  nullptr))
166  << ": " << std::string(error_log);
167  VLOG(1) << "CUDA JIT time to add GPU table functions library: "
168  << *reinterpret_cast<float*>(&option_values[2]);
169  checkCudaErrors(cuLinkAddData(link_state,
170  CU_JIT_INPUT_PTX,
171  static_cast<void*>(const_cast<char*>(ptx.c_str())),
172  ptx.length() + 1,
173  0,
174  0,
175  nullptr,
176  nullptr))
177  << ": " << std::string(error_log) << "\nPTX:\n"
178  << add_line_numbers(ptx) << "\nEOF PTX";
179  VLOG(1) << "CUDA JIT time to add generated code: "
180  << *reinterpret_cast<float*>(&option_values[2]);
181  void* cubin{nullptr};
182  size_t cubinSize{0};
183  checkCudaErrors(cuLinkComplete(link_state, &cubin, &cubinSize))
184  << ": " << std::string(error_log);
185  VLOG(1) << "CUDA Linker completed: " << info_log;
186  CHECK(cubin);
187  CHECK_GT(cubinSize, size_t(0));
188  VLOG(1) << "Generated GPU binary code size: " << cubinSize << " bytes";
189  return {cubin, option_keys, option_values, link_state, cubinSize};
190 }
191 #endif
192 
193 #ifdef HAVE_CUDA
195  const size_t module_size,
196  const std::string& kernel_name,
197  const int device_id,
198  const void* cuda_mgr,
199  unsigned int num_options,
200  CUjit_option* options,
201  void** option_vals)
202  : module_(nullptr)
203  , module_size_(module_size)
204  , kernel_(nullptr)
205  , kernel_name_(kernel_name)
206  , device_id_(device_id)
207  , cuda_mgr_(static_cast<const CudaMgr_Namespace::CudaMgr*>(cuda_mgr)) {
208  LOG_IF(FATAL, cuda_mgr_ == nullptr)
209  << "Unable to initialize GPU compilation context without CUDA manager";
210  cuda_mgr_->loadGpuModuleData(
211  &module_, image, num_options, options, option_vals, device_id_);
212  CHECK(module_);
213  checkCudaErrors(cuModuleGetFunction(&kernel_, module_, kernel_name_.c_str()));
214 }
215 #endif // HAVE_CUDA
216 
218 #ifdef HAVE_CUDA
219  CHECK(cuda_mgr_);
220  cuda_mgr_->unloadGpuModuleData(&module_, device_id_);
221 #endif
222 }
#define CHECK_EQ(x, y)
Definition: Logger.h:301
int CUjit_option
Definition: nocuda.h:26
std::string get_root_abs_path()
void checkCudaErrors(CUresult err)
Definition: sample.cpp:38
void nvidia_jit_warmup()
void setContext(const int device_num) const
Definition: CudaMgr.cpp:511
#define CHECK_GT(x, y)
Definition: Logger.h:305
std::string to_string(char const *&&v)
#define LOG_IF(severity, condition)
Definition: Logger.h:384
int getDeviceCount() const
Definition: CudaMgr.h:90
CubinResult ptx_to_cubin(const std::string &ptx, const CudaMgr_Namespace::CudaMgr *cuda_mgr)
GpuDeviceCompilationContext(const void *image, const size_t module_size, const std::string &kernel_name, const int device_id, const void *cuda_mgr, unsigned int num_options, CUjit_option *options, void **option_vals)
int CUlinkState
Definition: nocuda.h:27
tuple line
Definition: parse_ast.py:10
#define CHECK(condition)
Definition: Logger.h:291
#define DEBUG_TIMER(name)
Definition: Logger.h:412
#define VLOG(n)
Definition: Logger.h:388