#include <ExecutionKernel.h>

Collaboration diagram for ExecutionKernel:

Public Member Functions
	ExecutionKernel (const RelAlgExecutionUnit &ra_exe_unit, const ExecutorDeviceType chosen_device_type, int chosen_device_id, const ExecutionOptions &eo, const ColumnFetcher &column_fetcher, const QueryCompilationDescriptor &query_comp_desc, const QueryMemoryDescriptor &query_mem_desc, const FragmentsList &frag_list, const ExecutorDispatchMode kernel_dispatch_mode, RenderInfo *render_info, const int64_t rowid_lookup_key)

void	run (Executor *executor, const size_t thread_idx, SharedKernelContext &shared_context)

FragmentsList	get_fragment_list () const

int32_t	get_chosen_device_id () const

Public Attributes
const RelAlgExecutionUnit &	ra_exe_unit_

Private Member Functions
void	runImpl (Executor *executor, const size_t thread_idx, SharedKernelContext &shared_context)

Private Attributes
const ExecutorDeviceType	chosen_device_type

int	chosen_device_id

const ExecutionOptions &	eo

const ColumnFetcher &	column_fetcher

const QueryCompilationDescriptor &	query_comp_desc

const QueryMemoryDescriptor &	query_mem_desc

const FragmentsList	frag_list

const ExecutorDispatchMode	kernel_dispatch_mode

RenderInfo *	render_info_

const int64_t	rowid_lookup_key

ResultSetPtr	device_results_

Friends
class	KernelSubtask

Detailed Description

Definition at line 92 of file ExecutionKernel.h.

Constructor & Destructor Documentation

ExecutionKernel::ExecutionKernel	(	const RelAlgExecutionUnit &	ra_exe_unit,
		const ExecutorDeviceType	chosen_device_type,
		int	chosen_device_id,
		const ExecutionOptions &	eo,
		const ColumnFetcher &	column_fetcher,
		const QueryCompilationDescriptor &	query_comp_desc,
		const QueryMemoryDescriptor &	query_mem_desc,
		const FragmentsList &	frag_list,
		const ExecutorDispatchMode	kernel_dispatch_mode,
		RenderInfo *	render_info,
		const int64_t	rowid_lookup_key
	)

inline

Definition at line 94 of file ExecutionKernel.h.

       // Each member is initialized from the constructor parameter of the same name
       // (ra_exe_unit_ and render_info_ carry a trailing underscore).
       // ra_exe_unit_, eo, column_fetcher, query_comp_desc and query_mem_desc are declared
       // as reference members, so they bind to the caller's objects rather than copying
       // them; those objects must stay alive for as long as this kernel is used.
       // frag_list is declared by value, so the fragment list is copied here.
       : ra_exe_unit_(ra_exe_unit)
       , chosen_device_type(chosen_device_type)
       , chosen_device_id(chosen_device_id)
       , eo(eo)
       , column_fetcher(column_fetcher)
       , query_comp_desc(query_comp_desc)
       , query_mem_desc(query_mem_desc)
       , frag_list(frag_list)
       , kernel_dispatch_mode(kernel_dispatch_mode)
       , render_info_(render_info)
       , rowid_lookup_key(rowid_lookup_key) {}

Member Function Documentation

int32_t ExecutionKernel::get_chosen_device_id ( ) const

inline

Definition at line 122 of file ExecutionKernel.h.

References chosen_device_id.

122 { return chosen_device_id; }

ExecutionKernel::chosen_device_id

int chosen_device_id

Definition: ExecutionKernel.h:127

FragmentsList ExecutionKernel::get_fragment_list ( ) const

inline

Definition at line 121 of file ExecutionKernel.h.

References frag_list.

121 { return frag_list; }

ExecutionKernel::frag_list

const FragmentsList frag_list

Definition: ExecutionKernel.h:132

void ExecutionKernel::run	(	Executor *	executor,
		const size_t	thread_idx,
		SharedKernelContext &	shared_context
	)

Definition at line 129 of file ExecutionKernel.cpp.

References DEBUG_TIMER, QueryMemoryDescriptor::getQueryDescriptionType(), INJECT_TIMER, kernel_dispatch_mode, MultifragmentKernel, query_mem_desc, runImpl(), and OutOfHostMemory::what().

Referenced by Executor::executeUpdate(), and Executor::executeWorkUnitPerFragment().

                                                                {
   DEBUG_TIMER("ExecutionKernel::run");
   INJECT_TIMER(kernel_run);
   try {
     runImpl(executor, thread_idx, shared_context);
   } catch (const OutOfHostMemory& e) {
     throw QueryExecutionError(ErrorCode::OUT_OF_CPU_MEM, e.what());
   } catch (const std::bad_alloc& e) {
     throw QueryExecutionError(ErrorCode::OUT_OF_CPU_MEM, e.what());
   } catch (const OutOfRenderMemory& e) {
     throw QueryExecutionError(ErrorCode::OUT_OF_RENDER_MEM, e.what());
   } catch (const OutOfMemory& e) {
     throw QueryExecutionError(
         ErrorCode::OUT_OF_GPU_MEM,
         e.what(),
         QueryExecutionProperties{
             query_mem_desc.getQueryDescriptionType(),
             kernel_dispatch_mode == ExecutorDispatchMode::MultifragmentKernel});
   } catch (const ColumnarConversionNotSupported& e) {
     throw QueryExecutionError(ErrorCode::COLUMNAR_CONVERSION_NOT_SUPPORTED, e.what());
   } catch (const TooManyLiterals& e) {
     throw QueryExecutionError(ErrorCode::TOO_MANY_LITERALS, e.what());
   } catch (const StringConstInResultSet& e) {
     throw QueryExecutionError(ErrorCode::STRING_CONST_IN_RESULTSET, e.what());
   } catch (const QueryExecutionError& e) {
     throw e;
   }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

void ExecutionKernel::runImpl	(	Executor *	executor,
		const size_t	thread_idx,
		SharedKernelContext &	shared_context
	)

private

Definition at line 183 of file ExecutionKernel.cpp.

References gpu_enabled::accumulate(), SharedKernelContext::addDeviceResults(), ExecutionOptions::allow_runtime_query_interrupt, CHECK, CHECK_EQ, CHECK_GE, CHECK_GT, CHECK_LT, chosen_device_id, chosen_device_type, column_fetcher, QueryFragmentDescriptor::computeAllTablesFragments(), CPU, Data_Namespace::CPU_LEVEL, device_results_, dynamic_watchdog_init(), SharedKernelContext::dynamic_watchdog_set, ExecutionOptions::dynamic_watchdog_time_limit, eo, RelAlgExecutionUnit::estimator, ExecutionOptions::executor_type, Extern, frag_list, g_cpu_sub_task_size, anonymous_namespace{ExecutionKernel.cpp}::get_available_cpu_threads_per_task(), QueryCompilationDescriptor::getCompilationResult(), SharedKernelContext::getFragOffsets(), QueryMemoryDescriptor::getQueryDescriptionType(), getQueryEngineCudaStreamForDevice(), QueryMemoryDescriptor::getQueryExecutionContext(), SharedKernelContext::getQueryInfos(), GPU, Data_Namespace::GPU_LEVEL, RelAlgExecutionUnit::groupby_exprs, QueryCompilationDescriptor::hoistLiterals(), logger::INFO, RelAlgExecutionUnit::input_descs, heavyai::InSituFlagsOwnerInterface::isInSitu(), kernel_dispatch_mode, KernelPerFragment, LOG, Executor::max_gpu_count, MultifragmentKernel, Native, anonymous_namespace{ExecutionKernel.cpp}::need_to_hold_chunk(), ExecutionOptions::optimize_cuda_block_and_grid_sizes, CompilationResult::output_columnar, heavyai::Projection, query_comp_desc, anonymous_namespace{ExecutionKernel.cpp}::query_has_inner_join(), query_mem_desc, ra_exe_unit_, render_info_, rowid_lookup_key, run_query_external(), RelAlgExecutionUnit::scan_limit, serialize_to_sql(), QueryMemoryDescriptor::setAvailableCpuThreads(), QueryMemoryDescriptor::sortOnGpu(), RelAlgExecutionUnit::target_exprs, target_exprs_to_infos(), to_string(), RelAlgExecutionUnit::union_all, VLOG, and ExecutionOptions::with_dynamic_watchdog.

Referenced by run().

                                                                    {
   // Runs this kernel end to end: fetches the input chunks for frag_list, builds the
   // query execution context, executes the compiled plan (with or without GROUP BY) and
   // hands the resulting ResultSet to shared_context.
   //
   // Early exits (no results added here): nothing fetched, zero input rows on the
   // kernel-per-fragment projection path, and the HAVE_TBB sub-task path.
   // Throws QueryExecutionError on out-of-memory during fetch / context creation and when
   // the executed plan returns a non-zero error code; throws std::runtime_error when
   // external execution is requested for more than one input.
   CHECK(executor);
   const auto memory_level = chosen_device_type == ExecutorDeviceType::GPU
                                 ? Data_Namespace::GPU_LEVEL
                                 : Data_Namespace::CPU_LEVEL;
   CHECK_GE(frag_list.size(), size_t(1));
   // frag_list[0].table_key is how we tell which query we are running for UNION ALL.
   const auto& outer_table_key = ra_exe_unit_.union_all
                                     ? frag_list[0].table_key
                                     : ra_exe_unit_.input_descs[0].getTableKey();
   // For UNION ALL the key was taken from frag_list[0] itself, so this check only
   // constrains the non-union case.
   CHECK_EQ(frag_list[0].table_key, outer_table_key);
   const auto& outer_tab_frag_ids = frag_list[0].fragment_ids;
 
   // The device id is range-checked against max_gpu_count regardless of device type.
   CHECK_GE(chosen_device_id, 0);
   CHECK_LT(chosen_device_id, Executor::max_gpu_count);
 
   auto data_mgr = executor->getDataMgr();
   executor->logSystemCPUMemoryStatus("Before Query Execution", thread_idx);
   if (chosen_device_type == ExecutorDeviceType::GPU) {
     executor->logSystemGPUMemoryStatus("Before Query Execution", thread_idx);
   }
 
   // need to own them while query executes
   auto chunk_iterators_ptr = std::make_shared<std::list<ChunkIter>>();
   std::list<std::shared_ptr<Chunk_NS::Chunk>> chunks;
   std::unique_ptr<std::lock_guard<std::mutex>> gpu_lock;
   std::unique_ptr<CudaAllocator> device_allocator;
   if (chosen_device_type == ExecutorDeviceType::GPU) {
     // The per-device mutex is taken here and released only when gpu_lock is destroyed,
     // i.e. when this function exits (normally or by exception).
     gpu_lock.reset(
         new std::lock_guard<std::mutex>(executor->gpu_exec_mutex_[chosen_device_id]));
     device_allocator = std::make_unique<CudaAllocator>(
         data_mgr, chosen_device_id, getQueryEngineCudaStreamForDevice(chosen_device_id));
   }
   std::shared_ptr<FetchResult> fetch_result(new FetchResult);
   try {
     std::map<shared::TableKey, const TableFragments*> all_tables_fragments;
     QueryFragmentDescriptor::computeAllTablesFragments(
         all_tables_fragments, ra_exe_unit_, shared_context.getQueryInfos());
 
     // Both fetch variants receive the same arguments; only the entry point differs for
     // UNION ALL. On CPU device_allocator is null, so nullptr is passed.
     *fetch_result = ra_exe_unit_.union_all
                         ? executor->fetchUnionChunks(column_fetcher,
                                                      ra_exe_unit_,
                                                      chosen_device_id,
                                                      memory_level,
                                                      all_tables_fragments,
                                                      frag_list,
                                                      *chunk_iterators_ptr,
                                                      chunks,
                                                      device_allocator.get(),
                                                      thread_idx,
                                                      eo.allow_runtime_query_interrupt)
                         : executor->fetchChunks(column_fetcher,
                                                 ra_exe_unit_,
                                                 chosen_device_id,
                                                 memory_level,
                                                 all_tables_fragments,
                                                 frag_list,
                                                 *chunk_iterators_ptr,
                                                 chunks,
                                                 device_allocator.get(),
                                                 thread_idx,
                                                 eo.allow_runtime_query_interrupt);
     // Nothing was fetched: leave without adding any device results.
     if (fetch_result->num_rows.empty()) {
       return;
     }
     // test_and_set() returns the previous flag value, so only the first caller to set
     // the flag on shared_context initializes the dynamic watchdog.
     if (eo.with_dynamic_watchdog &&
         !shared_context.dynamic_watchdog_set.test_and_set(std::memory_order_acquire)) {
       CHECK_GT(eo.dynamic_watchdog_time_limit, 0u);
       auto cycle_budget = dynamic_watchdog_init(eo.dynamic_watchdog_time_limit);
       LOG(INFO) << "Dynamic Watchdog budget: CPU: "
                 << std::to_string(eo.dynamic_watchdog_time_limit) << "ms, "
                 << std::to_string(cycle_budget) << " cycles";
     }
   } catch (const OutOfMemory&) {
     // The error code follows the memory level chosen above, not the exception type.
     throw QueryExecutionError(
         memory_level == Data_Namespace::GPU_LEVEL ? ErrorCode::OUT_OF_GPU_MEM
                                                   : ErrorCode::OUT_OF_CPU_MEM,
         QueryExecutionProperties{
             query_mem_desc.getQueryDescriptionType(),
             kernel_dispatch_mode == ExecutorDispatchMode::MultifragmentKernel});
     // NOTE(review): unreachable -- the statement above always throws.
     return;
   }
 
   // External execution path: serialize the unit to SQL, run it externally and publish
   // the result; the native code path below is skipped entirely.
   if (eo.executor_type == ExecutorType::Extern) {
     if (ra_exe_unit_.input_descs.size() > 1) {
       throw std::runtime_error("Joins not supported through external execution");
     }
     const auto query = serialize_to_sql(&ra_exe_unit_);
     GroupByAndAggregate group_by_and_aggregate(executor,
                                                ExecutorDeviceType::CPU,
                                                ra_exe_unit_,
                                                shared_context.getQueryInfos(),
                                                executor->row_set_mem_owner_,
                                                std::nullopt);
     // NOTE(review): this local shadows the member query_mem_desc inside this branch and
     // is dereferenced below, i.e. it is a pointer-like handle, not the member reference.
     const auto query_mem_desc =
         group_by_and_aggregate.initQueryMemoryDescriptor(false, 0, 8, nullptr, false);
     device_results_ = run_query_external(
         query,
         *fetch_result,
         executor->plan_state_.get(),
         ExternalQueryOutputSpec{
             *query_mem_desc,
             target_exprs_to_infos(ra_exe_unit_.target_exprs, *query_mem_desc),
             executor});
     shared_context.addDeviceResults(std::move(device_results_), outer_tab_frag_ids);
     return;
   }
   const CompilationResult& compilation_result = query_comp_desc.getCompilationResult();
   std::unique_ptr<QueryExecutionContext> query_exe_context_owned;
   const bool do_render = render_info_ && render_info_->isInSitu();
 
   // Stays -1 unless the kernel-per-fragment projection branch below computes it.
   int64_t total_num_input_rows{-1};
   if (kernel_dispatch_mode == ExecutorDispatchMode::KernelPerFragment &&
       query_mem_desc.getQueryDescriptionType() == QueryDescriptionType::Projection) {
     total_num_input_rows = 0;
     // Sum every row count across all entries of fetch_result->num_rows.
     std::for_each(fetch_result->num_rows.begin(),
                   fetch_result->num_rows.end(),
                   [&total_num_input_rows](const std::vector<int64_t>& frag_row_count) {
                     total_num_input_rows = std::accumulate(frag_row_count.begin(),
                                                            frag_row_count.end(),
                                                            total_num_input_rows);
                   });
     VLOG(2) << "total_num_input_rows=" << total_num_input_rows;
     // TODO(adb): we may want to take this early out for all queries, but we are most
     // likely to see this query pattern on the kernel per fragment path (e.g. with HAVING
     // 0=1)
     if (total_num_input_rows == 0) {
       return;
     }
 
     // NOTE(review): scales the count by the number of inputs for inner joins --
     // presumably a sizing estimate rather than an exact row count; confirm.
     if (query_has_inner_join(ra_exe_unit_)) {
       total_num_input_rows *= ra_exe_unit_.input_descs.size();
     }
   }
 
   // start_rowid is only derived when a row-id lookup key was supplied (>= 0).
   // NOTE(review): rowid_lookup_key is int64_t but the difference is stored in a
   // uint32_t; the frag_list.empty() test is already guaranteed by the CHECK_GE above.
   uint32_t start_rowid{0};
   if (rowid_lookup_key >= 0) {
     if (!frag_list.empty()) {
       const auto& all_frag_row_offsets = shared_context.getFragOffsets();
       start_rowid = rowid_lookup_key -
                     all_frag_row_offsets[frag_list.begin()->fragment_ids.front()];
     }
   }
 
   // determine the # available CPU threads for each kernel to parallelize rest of
   // initialization steps when necessary
   query_mem_desc.setAvailableCpuThreads(
       get_available_cpu_threads_per_task(executor, shared_context));
 
 #ifdef HAVE_TBB
   // Sub-kernels require a thread pool on the shared context plus all of the
   // conditions accumulated below.
   bool can_run_subkernels = shared_context.getThreadPool() != nullptr;
 
   // Sub-tasks are supported for groupby queries and estimators only for now.
   // A single null group-by expression does not count as a group-by.
   bool is_groupby =
       (ra_exe_unit_.groupby_exprs.size() > 1) ||
       (ra_exe_unit_.groupby_exprs.size() == 1 && ra_exe_unit_.groupby_exprs.front());
   can_run_subkernels = can_run_subkernels && (is_groupby || ra_exe_unit_.estimator);
 
   // In case some column is lazily fetched, we cannot mix different fragments in a single
   // ResultSet.
   can_run_subkernels =
       can_run_subkernels && !executor->hasLazyFetchColumns(ra_exe_unit_.target_exprs);
 
   // TODO: Use another structure to hold chunks. Currently, ResultSet holds them, but with
   // sub-tasks chunk can be referenced by many ResultSets. So, some outer structure to
   // hold all ResultSets and all chunks is required.
   can_run_subkernels =
       can_run_subkernels &&
       !need_to_hold_chunk(
           chunks, ra_exe_unit_, std::vector<ColumnLazyFetchInfo>(), chosen_device_type);
 
   // TODO: check for literals? We serialize literals before execution and hold them in
   // result sets. Can we simply do it once and hold them in an outer structure?
   if (can_run_subkernels) {
     // NOTE(review): only num_rows[0][0] is consulted here -- presumably the row count
     // of the single outer fragment handled by this kernel; confirm.
     size_t total_rows = fetch_result->num_rows[0][0];
     size_t sub_size = g_cpu_sub_task_size;
 
     for (size_t sub_start = start_rowid; sub_start < total_rows; sub_start += sub_size) {
       // Clamp the final slice so it does not run past total_rows.
       sub_size = (sub_start + sub_size > total_rows) ? total_rows - sub_start : sub_size;
       auto subtask = std::make_shared<KernelSubtask>(*this,
                                                      shared_context,
                                                      fetch_result,
                                                      chunk_iterators_ptr,
                                                      total_num_input_rows,
                                                      sub_start,
                                                      sub_size,
                                                      thread_idx);
       // The shared_ptr is captured by value, so the sub-task stays alive until the
       // pooled task has run.
       shared_context.getThreadPool()->run(
           [subtask, executor] { subtask->run(executor); });
     }
 
     // No device results are added on this path -- presumably each KernelSubtask
     // publishes its own; see KernelSubtask.
     return;
   }
 #endif  // HAVE_TBB
 
   if (eo.executor_type == ExecutorType::Native) {
     try {
       // std::unique_ptr<QueryExecutionContext> query_exe_context_owned
       // has std::unique_ptr<QueryMemoryInitializer> query_buffers_
       // has std::vector<std::unique_ptr<ResultSet>> result_sets_
       // has std::unique_ptr<ResultSetStorage> storage_
       // which are initialized and possibly allocated here.
       query_exe_context_owned =
           query_mem_desc.getQueryExecutionContext(ra_exe_unit_,
                                                   executor,
                                                   chosen_device_type,
                                                   kernel_dispatch_mode,
                                                   chosen_device_id,
                                                   outer_table_key,
                                                   total_num_input_rows,
                                                   fetch_result->col_buffers,
                                                   fetch_result->frag_offsets,
                                                   executor->getRowSetMemoryOwner(),
                                                   compilation_result.output_columnar,
                                                   query_mem_desc.sortOnGpu(),
                                                   thread_idx,
                                                   do_render ? render_info_ : nullptr);
     } catch (const OutOfHostMemory& e) {
       // NOTE(review): `e` is unused; the original message is not forwarded here.
       throw QueryExecutionError(ErrorCode::OUT_OF_CPU_MEM);
     }
   }
   // Extern returned earlier, so any executor type other than Native reaches this point
   // with a null context and fails the CHECK below.
   QueryExecutionContext* query_exe_context{query_exe_context_owned.get()};
   CHECK(query_exe_context);
   int32_t err{0};
   bool optimize_cuda_block_and_grid_sizes =
       chosen_device_type == ExecutorDeviceType::GPU &&
       eo.optimize_cuda_block_and_grid_sizes;
 
   executor->logSystemCPUMemoryStatus("After Query Memory Initialization", thread_idx);
 
   // Dispatch on whether the unit has any group-by expressions; both calls write the
   // result into device_results_ and return an error code.
   if (ra_exe_unit_.groupby_exprs.empty()) {
     err = executor->executePlanWithoutGroupBy(ra_exe_unit_,
                                               compilation_result,
                                               query_comp_desc.hoistLiterals(),
                                               &device_results_,
                                               ra_exe_unit_.target_exprs,
                                               chosen_device_type,
                                               fetch_result->col_buffers,
                                               query_exe_context,
                                               fetch_result->num_rows,
                                               fetch_result->frag_offsets,
                                               data_mgr,
                                               chosen_device_id,
                                               start_rowid,
                                               ra_exe_unit_.input_descs.size(),
                                               eo.allow_runtime_query_interrupt,
                                               do_render ? render_info_ : nullptr,
                                               optimize_cuda_block_and_grid_sizes);
   } else {
     if (ra_exe_unit_.union_all) {
       VLOG(1) << "outer_table_key=" << outer_table_key
               << " ra_exe_unit_.scan_limit=" << ra_exe_unit_.scan_limit;
     }
     err = executor->executePlanWithGroupBy(ra_exe_unit_,
                                            compilation_result,
                                            query_comp_desc.hoistLiterals(),
                                            &device_results_,
                                            chosen_device_type,
                                            fetch_result->col_buffers,
                                            outer_tab_frag_ids,
                                            query_exe_context,
                                            fetch_result->num_rows,
                                            fetch_result->frag_offsets,
                                            data_mgr,
                                            chosen_device_id,
                                            outer_table_key,
                                            ra_exe_unit_.scan_limit,
                                            start_rowid,
                                            ra_exe_unit_.input_descs.size(),
                                            eo.allow_runtime_query_interrupt,
                                            do_render ? render_info_ : nullptr,
                                            optimize_cuda_block_and_grid_sizes);
   }
   if (device_results_) {
     // Attach to the result set the chunks selected by need_to_hold_chunk(), plus the
     // chunk iterators, so the result set shares ownership of them.
     std::list<std::shared_ptr<Chunk_NS::Chunk>> chunks_to_hold;
     for (const auto& chunk : chunks) {
       if (need_to_hold_chunk(chunk.get(),
                              ra_exe_unit_,
                              device_results_->getLazyFetchInfo(),
                              chosen_device_type)) {
         chunks_to_hold.push_back(chunk);
       }
     }
     device_results_->holdChunks(chunks_to_hold);
     device_results_->holdChunkIterators(chunk_iterators_ptr);
   } else {
     VLOG(1) << "null device_results.";
   }
   // A non-zero error code aborts before the results are published to shared_context.
   if (err) {
     throw QueryExecutionError(err);
   }
   shared_context.addDeviceResults(std::move(device_results_), outer_tab_frag_ids);
   executor->logSystemCPUMemoryStatus("After Query Execution", thread_idx);
   if (chosen_device_type == ExecutorDeviceType::GPU) {
     executor->logSystemGPUMemoryStatus("After Query Execution", thread_idx);
   }
 }

Here is the call graph for this function:

Here is the caller graph for this function:

Friends And Related Function Documentation

friend class KernelSubtask

friend

Definition at line 143 of file ExecutionKernel.h.

Member Data Documentation

int ExecutionKernel::chosen_device_id

private

Definition at line 127 of file ExecutionKernel.h.

Referenced by get_chosen_device_id(), and runImpl().

const ExecutorDeviceType ExecutionKernel::chosen_device_type

private

Definition at line 126 of file ExecutionKernel.h.

Referenced by runImpl().

const ColumnFetcher& ExecutionKernel::column_fetcher

private

Definition at line 129 of file ExecutionKernel.h.

Referenced by runImpl().

ResultSetPtr ExecutionKernel::device_results_

private

Definition at line 137 of file ExecutionKernel.h.

Referenced by runImpl().

const ExecutionOptions& ExecutionKernel::eo

private

Definition at line 128 of file ExecutionKernel.h.

Referenced by runImpl().

const FragmentsList ExecutionKernel::frag_list

private

Definition at line 132 of file ExecutionKernel.h.

Referenced by get_fragment_list(), and runImpl().

const ExecutorDispatchMode ExecutionKernel::kernel_dispatch_mode

private

Definition at line 133 of file ExecutionKernel.h.

Referenced by run(), and runImpl().

const QueryCompilationDescriptor& ExecutionKernel::query_comp_desc

private

Definition at line 130 of file ExecutionKernel.h.

Referenced by runImpl().

const QueryMemoryDescriptor& ExecutionKernel::query_mem_desc

private

Definition at line 131 of file ExecutionKernel.h.

Referenced by run(), and runImpl().

const RelAlgExecutionUnit& ExecutionKernel::ra_exe_unit_

Definition at line 123 of file ExecutionKernel.h.

Referenced by runImpl().

RenderInfo* ExecutionKernel::render_info_

private

Definition at line 134 of file ExecutionKernel.h.

Referenced by runImpl().

const int64_t ExecutionKernel::rowid_lookup_key

private

Definition at line 135 of file ExecutionKernel.h.

Referenced by runImpl().

The documentation for this class was generated from the following files:

/home/jenkins-slave/workspace/core-os-doxygen/QueryEngine/ExecutionKernel.h
/home/jenkins-slave/workspace/core-os-doxygen/QueryEngine/ExecutionKernel.cpp

Public Member Functions

Public Attributes

Private Member Functions

Private Attributes

Friends

Detailed Description

Constructor & Destructor Documentation

Member Function Documentation

Friends And Related Function Documentation

Member Data Documentation