OmniSciDB  72c90bc290
ExecutionKernel.cpp
/*
 * Copyright 2022 HEAVY.AI, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "QueryEngine/ExecutionKernel.h"

#include <mutex>
#include <vector>

#include "QueryEngine/ColumnFetcher.h"
#include "QueryEngine/Descriptors/QueryFragmentDescriptor.h"
#include "QueryEngine/DynamicWatchdog.h"
#include "QueryEngine/Execute.h"
#include "QueryEngine/ExternalExecutor.h"
#include "QueryEngine/QueryEngine.h"
#include "QueryEngine/SerializeToSql.h"

namespace {

bool needs_skip_result(const ResultSetPtr& res) {
  return !res || res->definitelyHasNoRows();
}

inline bool query_has_inner_join(const RelAlgExecutionUnit& ra_exe_unit) {
  return (std::count_if(ra_exe_unit.join_quals.begin(),
                        ra_exe_unit.join_quals.end(),
                        [](const auto& join_condition) {
                          return join_condition.type == JoinType::INNER;
                        }) > 0);
}

// If the column is part of the target expressions, result set iteration needs it alive.
bool need_to_hold_chunk(const Chunk_NS::Chunk* chunk,
                        const RelAlgExecutionUnit& ra_exe_unit,
                        const std::vector<ColumnLazyFetchInfo>& lazy_fetch_info,
                        const ExecutorDeviceType device_type) {
  CHECK(chunk->getColumnDesc());
  const auto& chunk_ti = chunk->getColumnDesc()->columnType;
  if (device_type == ExecutorDeviceType::CPU &&
      (chunk_ti.is_array() ||
       (chunk_ti.is_string() && chunk_ti.get_compression() == kENCODING_NONE))) {
    for (const auto target_expr : ra_exe_unit.target_exprs) {
      const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
      if (col_var) {
        const auto& column_key = col_var->getColumnKey();
        return column_key.column_id == chunk->getColumnDesc()->columnId &&
               column_key.table_id == chunk->getColumnDesc()->tableId &&
               column_key.db_id == chunk->getColumnDesc()->db_id;
      }
    }
  }
  if (lazy_fetch_info.empty()) {
    return false;
  }
  CHECK_EQ(lazy_fetch_info.size(), ra_exe_unit.target_exprs.size());
  for (size_t i = 0; i < ra_exe_unit.target_exprs.size(); i++) {
    const auto target_expr = ra_exe_unit.target_exprs[i];
    const auto& col_lazy_fetch = lazy_fetch_info[i];
    const auto col_var = dynamic_cast<const Analyzer::ColumnVar*>(target_expr);
    if (col_var) {
      const auto& column_key = col_var->getColumnKey();
      if (column_key.column_id == chunk->getColumnDesc()->columnId &&
          column_key.table_id == chunk->getColumnDesc()->tableId &&
          column_key.db_id == chunk->getColumnDesc()->db_id) {
        if (col_lazy_fetch.is_lazily_fetched) {
          // hold lazy fetched inputs for later iteration
          return true;
        }
      }
    }
  }
  return false;
}

bool need_to_hold_chunk(const std::list<std::shared_ptr<Chunk_NS::Chunk>>& chunks,
                        const RelAlgExecutionUnit& ra_exe_unit,
                        const std::vector<ColumnLazyFetchInfo>& lazy_fetch_info,
                        const ExecutorDeviceType device_type) {
  for (const auto& chunk : chunks) {
    if (need_to_hold_chunk(chunk.get(), ra_exe_unit, lazy_fetch_info, device_type)) {
      return true;
    }
  }

  return false;
}

}  // namespace

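// Lazily builds an exclusive prefix sum over the outer table's per-fragment row
// counts: all_frag_row_offsets_[i] is the global row offset at which fragment i
// begins. runImpl() uses it to translate a global rowid lookup key into an offset
// local to this kernel's first fragment.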
const std::vector<uint64_t>& SharedKernelContext::getFragOffsets() {
  std::lock_guard<std::mutex> lock(all_frag_row_offsets_mutex_);
  if (all_frag_row_offsets_.empty()) {
    all_frag_row_offsets_.resize(query_infos_.front().info.fragments.size() + 1);
    for (size_t i = 1; i <= query_infos_.front().info.fragments.size(); ++i) {
      all_frag_row_offsets_[i] =
          all_frag_row_offsets_[i - 1] +
          query_infos_.front().info.fragments[i - 1].getNumTuples();
    }
  }
  return all_frag_row_offsets_;
}

void SharedKernelContext::addDeviceResults(ResultSetPtr&& device_results,
                                           std::vector<size_t> outer_table_fragment_ids) {
  std::lock_guard<std::mutex> lock(reduce_mutex_);
  if (!needs_skip_result(device_results)) {
    all_fragment_results_.emplace_back(std::move(device_results),
                                       outer_table_fragment_ids);
  }
}

std::vector<std::pair<ResultSetPtr, std::vector<size_t>>>&
SharedKernelContext::getFragmentResults() {
  return all_fragment_results_;
}

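// Thin wrapper around runImpl() that maps allocation and execution failures onto
// the Executor's QueryExecutionError codes so the dispatching layer can handle
// them uniformly (e.g. retrying a GPU query on CPU where that is allowed).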
void ExecutionKernel::run(Executor* executor,
                          const size_t thread_idx,
                          SharedKernelContext& shared_context) {
  DEBUG_TIMER("ExecutionKernel::run");
  INJECT_TIMER(kernel_run);
  try {
    runImpl(executor, thread_idx, shared_context);
  } catch (const OutOfHostMemory& e) {
    throw QueryExecutionError(Executor::ERR_OUT_OF_CPU_MEM, e.what());
  } catch (const std::bad_alloc& e) {
    throw QueryExecutionError(Executor::ERR_OUT_OF_CPU_MEM, e.what());
  } catch (const OutOfRenderMemory& e) {
    throw QueryExecutionError(Executor::ERR_OUT_OF_RENDER_MEM, e.what());
  } catch (const OutOfMemory& e) {
    throw QueryExecutionError(
        Executor::ERR_OUT_OF_GPU_MEM,
        e.what(),
        QueryExecutionProperties{
            query_mem_desc.getQueryDescriptionType(),
            kernel_dispatch_mode == ExecutorDispatchMode::MultifragmentKernel});
  } catch (const ColumnarConversionNotSupported& e) {
    throw QueryExecutionError(Executor::ERR_COLUMNAR_CONVERSION_NOT_SUPPORTED, e.what());
  } catch (const TooManyLiterals& e) {
    throw QueryExecutionError(Executor::ERR_TOO_MANY_LITERALS, e.what());
  } catch (const StringConstInResultSet& e) {
    throw QueryExecutionError(Executor::ERR_STRING_CONST_IN_RESULTSET, e.what());
  } catch (const QueryExecutionError& e) {
    throw e;
  }
}

namespace {
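// Gives each kernel task one CPU slot plus an even, rounded-up share of the idle
// slots in the resource pool: 1 + ceil(idle_cpu_slots / num_kernels). For example,
// 10 idle slots shared by 4 kernels yields 1 + ceil(10 / 4) = 4 threads per task.
// Without an executor resource manager it falls back to cpu_threads() / num_kernels.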
size_t get_available_cpu_threads_per_task(Executor* executor,
                                          SharedKernelContext& shared_context) {
  // total # allocated slots (i.e., threads) for compiled kernels of the input query
  auto const num_kernels = shared_context.getNumAllocatedThreads();
  CHECK_GE(num_kernels, 1u);
  size_t available_slots_per_task;
  if (executor->executor_resource_mgr_) {
    auto const resources_status = executor->executor_resource_mgr_->get_resource_info();
    // # available slots (i.e., threads) in the resource pool; idle threads
    auto const idle_cpu_slots =
        resources_status.total_cpu_slots - resources_status.allocated_cpu_slots;
    // we want to evenly use idle slots for each kernel task to avoid oversubscription
    available_slots_per_task = 1u + (idle_cpu_slots + num_kernels - 1u) / num_kernels;
  } else {
    available_slots_per_task = std::max(static_cast<size_t>(cpu_threads()) / num_kernels,
                                        static_cast<size_t>(1));
  }
  CHECK_GE(available_slots_per_task, 1u);
  return available_slots_per_task;
}
}  // namespace

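// Per-kernel execution path: pin the chunks for this kernel's fragment list, fetch
// (or union-fetch) the input columns for the chosen device, set up a
// QueryExecutionContext, run the compiled plan with or without group-by, and hand
// the resulting per-device ResultSet back to the SharedKernelContext for reduction.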
void ExecutionKernel::runImpl(Executor* executor,
                              const size_t thread_idx,
                              SharedKernelContext& shared_context) {
  CHECK(executor);
  const auto memory_level = chosen_device_type == ExecutorDeviceType::GPU
                                ? Data_Namespace::GPU_LEVEL
                                : Data_Namespace::CPU_LEVEL;
  CHECK_GE(frag_list.size(), size_t(1));
  // frag_list[0].table_id is how we tell which query we are running for UNION ALL.
  const auto& outer_table_key = ra_exe_unit_.union_all
                                    ? frag_list[0].table_key
                                    : ra_exe_unit_.input_descs[0].getTableKey();
  CHECK_EQ(frag_list[0].table_key, outer_table_key);
  const auto& outer_tab_frag_ids = frag_list[0].fragment_ids;

  CHECK_GE(chosen_device_id, 0);
  CHECK_LT(chosen_device_id, Executor::max_gpu_count);

  auto data_mgr = executor->getDataMgr();
  executor->logSystemCPUMemoryStatus("Before Query Execution", thread_idx);
  if (chosen_device_type == ExecutorDeviceType::GPU) {
    executor->logSystemGPUMemoryStatus("Before Query Execution", thread_idx);
  }

  // need to own them while the query executes
  auto chunk_iterators_ptr = std::make_shared<std::list<ChunkIter>>();
  std::list<std::shared_ptr<Chunk_NS::Chunk>> chunks;
  std::unique_ptr<std::lock_guard<std::mutex>> gpu_lock;
  std::unique_ptr<CudaAllocator> device_allocator;
  if (chosen_device_type == ExecutorDeviceType::GPU) {
    gpu_lock.reset(
        new std::lock_guard<std::mutex>(executor->gpu_exec_mutex_[chosen_device_id]));
    device_allocator = std::make_unique<CudaAllocator>(
        data_mgr, chosen_device_id, getQueryEngineCudaStreamForDevice(chosen_device_id));
  }
  std::shared_ptr<FetchResult> fetch_result(new FetchResult);
  try {
    std::map<shared::TableKey, const TableFragments*> all_tables_fragments;
    QueryFragmentDescriptor::computeAllTablesFragments(
        all_tables_fragments, ra_exe_unit_, shared_context.getQueryInfos());

    *fetch_result = ra_exe_unit_.union_all
                        ? executor->fetchUnionChunks(column_fetcher,
                                                     ra_exe_unit_,
                                                     chosen_device_id,
                                                     memory_level,
                                                     all_tables_fragments,
                                                     frag_list,
                                                     *chunk_iterators_ptr,
                                                     chunks,
                                                     device_allocator.get(),
                                                     thread_idx,
                                                     eo.allow_runtime_query_interrupt)
                        : executor->fetchChunks(column_fetcher,
                                                ra_exe_unit_,
                                                chosen_device_id,
                                                memory_level,
                                                all_tables_fragments,
                                                frag_list,
                                                *chunk_iterators_ptr,
                                                chunks,
                                                device_allocator.get(),
                                                thread_idx,
                                                eo.allow_runtime_query_interrupt);
    if (fetch_result->num_rows.empty()) {
      return;
    }
    if (eo.with_dynamic_watchdog &&
        !shared_context.dynamic_watchdog_set.test_and_set(std::memory_order_acquire)) {
      CHECK_GT(eo.dynamic_watchdog_time_limit, 0u);
      auto cycle_budget = dynamic_watchdog_init(eo.dynamic_watchdog_time_limit);
      LOG(INFO) << "Dynamic Watchdog budget: CPU: "
                << std::to_string(eo.dynamic_watchdog_time_limit) << "ms, "
                << std::to_string(cycle_budget) << " cycles";
    }
  } catch (const OutOfMemory&) {
    throw QueryExecutionError(
        memory_level == Data_Namespace::GPU_LEVEL ? Executor::ERR_OUT_OF_GPU_MEM
                                                  : Executor::ERR_OUT_OF_CPU_MEM,
        QueryExecutionProperties{
            query_mem_desc.getQueryDescriptionType(),
            kernel_dispatch_mode == ExecutorDispatchMode::MultifragmentKernel});
    return;
  }

  if (eo.executor_type == ExecutorType::Extern) {
    if (ra_exe_unit_.input_descs.size() > 1) {
      throw std::runtime_error("Joins not supported through external execution");
    }
    const auto query = serialize_to_sql(&ra_exe_unit_);
    GroupByAndAggregate group_by_and_aggregate(executor,
                                               ExecutorDeviceType::CPU,
                                               ra_exe_unit_,
                                               shared_context.getQueryInfos(),
                                               executor->row_set_mem_owner_,
                                               std::nullopt);
    const auto query_mem_desc =
        group_by_and_aggregate.initQueryMemoryDescriptor(false, 0, 8, nullptr, false);
    device_results_ = run_query_external(
        query,
        *fetch_result,
        executor->plan_state_.get(),
        ExternalQueryOutputSpec{
            *query_mem_desc,
            target_exprs_to_infos(ra_exe_unit_.target_exprs, *query_mem_desc),
            executor});
    shared_context.addDeviceResults(std::move(device_results_), outer_tab_frag_ids);
    return;
  }
  const CompilationResult& compilation_result = query_comp_desc.getCompilationResult();
  std::unique_ptr<QueryExecutionContext> query_exe_context_owned;
  const bool do_render = render_info_ && render_info_->isInSitu();

  int64_t total_num_input_rows{-1};
  if (kernel_dispatch_mode == ExecutorDispatchMode::KernelPerFragment &&
      query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
    total_num_input_rows = 0;
    std::for_each(fetch_result->num_rows.begin(),
                  fetch_result->num_rows.end(),
                  [&total_num_input_rows](const std::vector<int64_t>& frag_row_count) {
                    total_num_input_rows = std::accumulate(frag_row_count.begin(),
                                                           frag_row_count.end(),
                                                           total_num_input_rows);
                  });
    VLOG(2) << "total_num_input_rows=" << total_num_input_rows;
    // TODO(adb): we may want to take this early out for all queries, but we are most
    // likely to see this query pattern on the kernel per fragment path (e.g. with HAVING
    // 0=1)
    if (total_num_input_rows == 0) {
      return;
    }

    if (query_has_inner_join(ra_exe_unit_)) {
      total_num_input_rows *= ra_exe_unit_.input_descs.size();
    }
  }

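  // For rowid point lookups, translate the global rowid into an offset relative to
  // the first fragment assigned to this kernel, using the shared fragment offsets
  // built in SharedKernelContext::getFragOffsets().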
  uint32_t start_rowid{0};
  if (rowid_lookup_key >= 0) {
    if (!frag_list.empty()) {
      const auto& all_frag_row_offsets = shared_context.getFragOffsets();
      start_rowid = rowid_lookup_key -
                    all_frag_row_offsets[frag_list.begin()->fragment_ids.front()];
    }
  }

  // determine the # available CPU threads for each kernel to parallelize rest of
  // initialization steps when necessary
  query_mem_desc.setAvailableCpuThreads(
      get_available_cpu_threads_per_task(executor, shared_context));

#ifdef HAVE_TBB
  bool can_run_subkernels = shared_context.getThreadPool() != nullptr;

  // Sub-tasks are supported for groupby queries and estimators only for now.
  bool is_groupby =
      (ra_exe_unit_.groupby_exprs.size() > 1) ||
      (ra_exe_unit_.groupby_exprs.size() == 1 && ra_exe_unit_.groupby_exprs.front());
  can_run_subkernels = can_run_subkernels && (is_groupby || ra_exe_unit_.estimator);

  // In case some column is lazily fetched, we cannot mix different fragments in a single
  // ResultSet.
  can_run_subkernels =
      can_run_subkernels && !executor->hasLazyFetchColumns(ra_exe_unit_.target_exprs);

  // TODO: Use another structure to hold chunks. Currently, ResultSet holds them, but with
  // sub-tasks a chunk can be referenced by many ResultSets. So, some outer structure to
  // hold all ResultSets and all chunks is required.
  can_run_subkernels =
      can_run_subkernels &&
      !need_to_hold_chunk(
          chunks, ra_exe_unit_, std::vector<ColumnLazyFetchInfo>(), chosen_device_type);

  // TODO: check for literals? We serialize literals before execution and hold them in
  // result sets. Can we simply do it once and hold them in an outer structure?
  if (can_run_subkernels) {
    size_t total_rows = fetch_result->num_rows[0][0];
    size_t sub_size = g_cpu_sub_task_size;

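    // Split the fragment's rows into g_cpu_sub_task_size pieces; each KernelSubtask
    // runs the same compiled kernel over rows [sub_start, sub_start + sub_size) on
    // the shared thread pool.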
    for (size_t sub_start = start_rowid; sub_start < total_rows; sub_start += sub_size) {
      sub_size = (sub_start + sub_size > total_rows) ? total_rows - sub_start : sub_size;
      auto subtask = std::make_shared<KernelSubtask>(*this,
                                                     shared_context,
                                                     fetch_result,
                                                     chunk_iterators_ptr,
                                                     total_num_input_rows,
                                                     sub_start,
                                                     sub_size,
                                                     thread_idx);
      shared_context.getThreadPool()->run(
          [subtask, executor] { subtask->run(executor); });
    }

    return;
  }
#endif  // HAVE_TBB

  if (eo.executor_type == ExecutorType::Native) {
    try {
      // std::unique_ptr<QueryExecutionContext> query_exe_context_owned
      // has std::unique_ptr<QueryMemoryInitializer> query_buffers_
      // has std::vector<std::unique_ptr<ResultSet>> result_sets_
      // has std::unique_ptr<ResultSetStorage> storage_
      // which are initialized and possibly allocated here.
      query_exe_context_owned =
          query_mem_desc.getQueryExecutionContext(ra_exe_unit_,
                                                  executor,
                                                  chosen_device_type,
                                                  kernel_dispatch_mode,
                                                  chosen_device_id,
                                                  outer_table_key,
                                                  total_num_input_rows,
                                                  fetch_result->col_buffers,
                                                  fetch_result->frag_offsets,
                                                  executor->getRowSetMemoryOwner(),
                                                  compilation_result.output_columnar,
                                                  query_mem_desc.sortOnGpu(),
                                                  thread_idx,
                                                  do_render ? render_info_ : nullptr);
    } catch (const OutOfHostMemory& e) {
      throw QueryExecutionError(Executor::ERR_OUT_OF_CPU_MEM);
    }
  }
  QueryExecutionContext* query_exe_context{query_exe_context_owned.get()};
  CHECK(query_exe_context);
  int32_t err{0};
  bool optimize_cuda_block_and_grid_sizes =
      chosen_device_type == ExecutorDeviceType::GPU &&
      eo.optimize_cuda_block_and_grid_sizes;

  executor->logSystemCPUMemoryStatus("After Query Memory Initialization", thread_idx);

  if (ra_exe_unit_.groupby_exprs.empty()) {
    err = executor->executePlanWithoutGroupBy(ra_exe_unit_,
                                              compilation_result,
                                              query_comp_desc.hoistLiterals(),
                                              device_results_,
                                              ra_exe_unit_.target_exprs,
                                              chosen_device_type,
                                              fetch_result->col_buffers,
                                              query_exe_context,
                                              fetch_result->num_rows,
                                              fetch_result->frag_offsets,
                                              data_mgr,
                                              chosen_device_id,
                                              start_rowid,
                                              ra_exe_unit_.input_descs.size(),
                                              eo.allow_runtime_query_interrupt,
                                              do_render ? render_info_ : nullptr,
                                              optimize_cuda_block_and_grid_sizes);
  } else {
    if (ra_exe_unit_.union_all) {
      VLOG(1) << "outer_table_key=" << outer_table_key
              << " ra_exe_unit_.scan_limit=" << ra_exe_unit_.scan_limit;
    }
    err = executor->executePlanWithGroupBy(ra_exe_unit_,
                                           compilation_result,
                                           query_comp_desc.hoistLiterals(),
                                           device_results_,
                                           chosen_device_type,
                                           fetch_result->col_buffers,
                                           outer_tab_frag_ids,
                                           query_exe_context,
                                           fetch_result->num_rows,
                                           fetch_result->frag_offsets,
                                           data_mgr,
                                           chosen_device_id,
                                           outer_table_key,
                                           ra_exe_unit_.scan_limit,
                                           start_rowid,
                                           ra_exe_unit_.input_descs.size(),
                                           eo.allow_runtime_query_interrupt,
                                           do_render ? render_info_ : nullptr,
                                           optimize_cuda_block_and_grid_sizes);
  }
  if (device_results_) {
    std::list<std::shared_ptr<Chunk_NS::Chunk>> chunks_to_hold;
    for (const auto& chunk : chunks) {
      if (need_to_hold_chunk(chunk.get(),
                             ra_exe_unit_,
                             device_results_->getLazyFetchInfo(),
                             chosen_device_type)) {
        chunks_to_hold.push_back(chunk);
      }
    }
    device_results_->holdChunks(chunks_to_hold);
    device_results_->holdChunkIterators(chunk_iterators_ptr);
  } else {
    VLOG(1) << "null device_results.";
  }
  if (err) {
    throw QueryExecutionError(err);
  }
  shared_context.addDeviceResults(std::move(device_results_), outer_tab_frag_ids);
  executor->logSystemCPUMemoryStatus("After Query Execution", thread_idx);
  if (chosen_device_type == ExecutorDeviceType::GPU) {
    executor->logSystemGPUMemoryStatus("After Query Execution", thread_idx);
  }
}

#ifdef HAVE_TBB

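// Mirrors ExecutionKernel::run(): translates failures while executing a sub-range
// of rows into the Executor's QueryExecutionError codes.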
void KernelSubtask::run(Executor* executor) {
  try {
    runImpl(executor);
  } catch (const OutOfHostMemory& e) {
    throw QueryExecutionError(Executor::ERR_OUT_OF_CPU_MEM, e.what());
  } catch (const std::bad_alloc& e) {
    throw QueryExecutionError(Executor::ERR_OUT_OF_CPU_MEM, e.what());
  } catch (const OutOfRenderMemory& e) {
    throw QueryExecutionError(Executor::ERR_OUT_OF_RENDER_MEM, e.what());
  } catch (const OutOfMemory& e) {
    throw QueryExecutionError(
        Executor::ERR_OUT_OF_GPU_MEM,
        e.what(),
        QueryExecutionProperties{
            kernel_.query_mem_desc.getQueryDescriptionType(),
            kernel_.kernel_dispatch_mode == ExecutorDispatchMode::MultifragmentKernel});
  } catch (const ColumnarConversionNotSupported& e) {
    throw QueryExecutionError(Executor::ERR_COLUMNAR_CONVERSION_NOT_SUPPORTED, e.what());
  } catch (const TooManyLiterals& e) {
    throw QueryExecutionError(Executor::ERR_TOO_MANY_LITERALS, e.what());
  } catch (const StringConstInResultSet& e) {
    throw QueryExecutionError(Executor::ERR_STRING_CONST_IN_RESULTSET, e.what());
  } catch (const QueryExecutionError& e) {
    throw e;
  }
}

void KernelSubtask::runImpl(Executor* executor) {
  auto& query_exe_context_owned = shared_context_.getTlsExecutionContext().local();
  const bool do_render = kernel_.render_info_ && kernel_.render_info_->isInSitu();
  const CompilationResult& compilation_result =
      kernel_.query_comp_desc.getCompilationResult();
  const shared::TableKey& outer_table_key =
      kernel_.ra_exe_unit_.union_all ? kernel_.frag_list[0].table_key
                                     : kernel_.ra_exe_unit_.input_descs[0].getTableKey();

  if (!query_exe_context_owned) {
    try {
      // We pass fake col_buffers and frag_offsets. They are not actually used for
      // subtasks, but we shouldn't pass empty structures, to avoid empty results.
      std::vector<std::vector<const int8_t*>> col_buffers(
          fetch_result_->col_buffers.size(),
          std::vector<const int8_t*>(fetch_result_->col_buffers[0].size()));
      std::vector<std::vector<uint64_t>> frag_offsets(
          fetch_result_->frag_offsets.size(),
          std::vector<uint64_t>(fetch_result_->frag_offsets[0].size()));
      query_exe_context_owned = kernel_.query_mem_desc.getQueryExecutionContext(
          kernel_.ra_exe_unit_,
          executor,
          kernel_.chosen_device_type,
          kernel_.kernel_dispatch_mode,
          kernel_.chosen_device_id,
          outer_table_key,
          total_num_input_rows_,
          col_buffers,
          frag_offsets,
          executor->getRowSetMemoryOwner(),
          compilation_result.output_columnar,
          kernel_.query_mem_desc.sortOnGpu(),
          // TODO: use TBB thread id to choose allocator
          thread_idx_,
          do_render ? kernel_.render_info_ : nullptr);
    } catch (const OutOfHostMemory& e) {
      throw QueryExecutionError(Executor::ERR_OUT_OF_CPU_MEM);
    }
  }

  const auto& outer_tab_frag_ids = kernel_.frag_list[0].fragment_ids;
  QueryExecutionContext* query_exe_context{query_exe_context_owned.get()};
  CHECK(query_exe_context);
  int32_t err{0};
  bool optimize_cuda_block_and_grid_sizes =
      kernel_.chosen_device_type == ExecutorDeviceType::GPU &&
      kernel_.eo.optimize_cuda_block_and_grid_sizes;
  if (kernel_.ra_exe_unit_.groupby_exprs.empty()) {
    err = executor->executePlanWithoutGroupBy(kernel_.ra_exe_unit_,
                                              compilation_result,
                                              kernel_.query_comp_desc.hoistLiterals(),
                                              nullptr,
                                              kernel_.ra_exe_unit_.target_exprs,
                                              kernel_.chosen_device_type,
                                              fetch_result_->col_buffers,
                                              query_exe_context,
                                              fetch_result_->num_rows,
                                              fetch_result_->frag_offsets,
                                              executor->getDataMgr(),
                                              kernel_.chosen_device_id,
                                              start_rowid_,
                                              kernel_.ra_exe_unit_.input_descs.size(),
                                              kernel_.eo.allow_runtime_query_interrupt,
                                              do_render ? kernel_.render_info_ : nullptr,
                                              optimize_cuda_block_and_grid_sizes,
                                              start_rowid_ + num_rows_to_process_);
  } else {
    err = executor->executePlanWithGroupBy(kernel_.ra_exe_unit_,
                                           compilation_result,
                                           kernel_.query_comp_desc.hoistLiterals(),
                                           nullptr,
                                           kernel_.chosen_device_type,
                                           fetch_result_->col_buffers,
                                           outer_tab_frag_ids,
                                           query_exe_context,
                                           fetch_result_->num_rows,
                                           fetch_result_->frag_offsets,
                                           executor->getDataMgr(),
                                           kernel_.chosen_device_id,
                                           outer_table_key,
                                           kernel_.ra_exe_unit_.scan_limit,
                                           start_rowid_,
                                           kernel_.ra_exe_unit_.input_descs.size(),
                                           kernel_.eo.allow_runtime_query_interrupt,
                                           do_render ? kernel_.render_info_ : nullptr,
                                           optimize_cuda_block_and_grid_sizes,
                                           start_rowid_ + num_rows_to_process_);
  }

  if (err) {
    throw QueryExecutionError(err);
  }
}

#endif  // HAVE_TBB